Update converter API, use streams rather than file paths (#1088)

* Updated DocumentConverter interface
* Updated all DocumentConverter classes
* Added support for various new audio files.
* Updated sample plugin to new DocumentConverter interface.
* Updated project README with notes about changes, and use-cases.
* Updated DocumentConverter documentation.
* Moved priority outside of DocumentConverter, allowing converters to be reprioritized, and keeping the DocumentConverter interface simple.

---------

Co-authored-by: Kenny Zhang <kzhang678@gmail.com>
This commit is contained in:
afourney
2025-03-05 21:16:55 -08:00
committed by GitHub
parent 1d2f231146
commit e921497f79
47 changed files with 2329 additions and 1264 deletions

View File

@@ -10,23 +10,38 @@ This project shows how to create a sample plugin for MarkItDown. The most import
Next, implement your custom DocumentConverter:
```python
from typing import Union
from markitdown import DocumentConverter, DocumentConverterResult
from typing import BinaryIO, Any
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
class RtfConverter(DocumentConverter):
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an RTF file
extension = kwargs.get("file_extension", "")
if extension.lower() != ".rtf":
return None
# Implement the conversion logic here ...
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
# Return the result
return DocumentConverterResult(
title=title,
text_content=text_content,
)
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> bool:
# Implement logic to check if the file stream is an RTF file
# ...
raise NotImplementedError()
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> DocumentConverterResult:
# Implement logic to convert the file stream to Markdown
# ...
raise NotImplementedError()
```
Next, make sure your package implements and exports the following:
@@ -71,10 +86,10 @@ Once the plugin package is installed, verify that it is available to MarkItDown
markitdown --list-plugins
```
To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert a PDF:
To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert an RTF file:
```bash
markitdown --use-plugins path-to-file.pdf
markitdown --use-plugins path-to-file.rtf
```
In Python, plugins can be enabled as follows:
@@ -83,7 +98,7 @@ In Python, plugins can be enabled as follows:
from markitdown import MarkItDown
md = MarkItDown(enable_plugins=True)
result = md.convert("path-to-file.pdf")
result = md.convert("path-to-file.rtf")
print(result.text_content)
```

View File

@@ -24,7 +24,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"markitdown",
"markitdown>=0.0.2a2",
"striprtf",
]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.0.1a2"
__version__ = "0.0.1a3"

View File

@@ -1,12 +1,26 @@
from typing import Union
import locale
from typing import BinaryIO, Any
from striprtf.striprtf import rtf_to_text
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult
from markitdown import (
MarkItDown,
DocumentConverter,
DocumentConverterResult,
StreamInfo,
)
__plugin_interface_version__ = (
1 # The version of the plugin interface that this plugin uses
)
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/rtf",
"application/rtf",
]
ACCEPTED_FILE_EXTENSIONS = [".rtf"]
def register_converters(markitdown: MarkItDown, **kwargs):
"""
@@ -22,18 +36,41 @@ class RtfConverter(DocumentConverter):
Converts an RTF file to text in the simplest possible way.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a RTF
extension = kwargs.get("file_extension", "")
if extension.lower() != ".rtf":
return None
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
# Read the RTF file
with open(local_path, "r") as f:
rtf = f.read()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> DocumentConverterResult:
# Read the file stream into a str using the provided charset encoding, or the system default if none is given
encoding = stream_info.charset or locale.getpreferredencoding()
stream_data = file_stream.read().decode(encoding)
# Return the result
return DocumentConverterResult(
title=None,
text_content=rtf_to_text(rtf),
markdown=rtf_to_text(stream_data),
)

View File

@@ -2,7 +2,7 @@
import os
import pytest
from markitdown import MarkItDown
from markitdown import MarkItDown, StreamInfo
from markitdown_sample_plugin import RtfConverter
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -15,18 +15,22 @@ RTF_TEST_STRINGS = {
def test_converter() -> None:
"""Tests the RTF converter directly."""
converter = RtfConverter()
result = converter.convert(
os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf"
)
with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
converter = RtfConverter()
result = converter.convert(
file_stream=file_stream,
stream_info=StreamInfo(
mimetype="text/rtf", extension=".rtf", filename="test.rtf"
),
)
for test_string in RTF_TEST_STRINGS:
assert test_string in result.text_content
for test_string in RTF_TEST_STRINGS:
assert test_string in result.text_content
def test_markitdown() -> None:
"""Tests that MarkItDown correctly loads the plugin."""
md = MarkItDown()
md = MarkItDown(enable_plugins=True)
result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
for test_string in RTF_TEST_STRINGS: