Update converter API, use streams rather than file paths (#1088)

* Updated DocumentConverter interface
* Updated all DocumentConverter classes
* Added support for various new audio files.
* Updated sample plugin to new DocumentConverter interface.
* Updated project README with notes about changes, and use-cases.
* Updated DocumentConverter documentation.
* Moved priority outside of DocumentConverter, allowing converters to be reprioritized and keeping the DocumentConverter interface simple.

---------

Co-authored-by: Kenny Zhang <kzhang678@gmail.com>
This commit is contained in:
afourney
2025-03-05 21:16:55 -08:00
committed by GitHub
parent 1d2f231146
commit e921497f79
47 changed files with 2329 additions and 1264 deletions

View File

@@ -2,7 +2,7 @@
import os
import pytest
from markitdown import MarkItDown
from markitdown import MarkItDown, StreamInfo
from markitdown_sample_plugin import RtfConverter
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -15,18 +15,22 @@ RTF_TEST_STRINGS = {
def test_converter() -> None:
"""Tests the RTF converter directly."""
converter = RtfConverter()
result = converter.convert(
os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf"
)
with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
converter = RtfConverter()
result = converter.convert(
file_stream=file_stream,
stream_info=StreamInfo(
mimetype="text/rtf", extension=".rtf", filename="test.rtf"
),
)
for test_string in RTF_TEST_STRINGS:
assert test_string in result.text_content
for test_string in RTF_TEST_STRINGS:
assert test_string in result.text_content
def test_markitdown() -> None:
"""Tests that MarkItDown correctly loads the plugin."""
md = MarkItDown()
md = MarkItDown(enable_plugins=True)
result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
for test_string in RTF_TEST_STRINGS: