Add support for preserving base64 encoded images (#1140)

* optional reserve base64 string in markdown _CustomMarkdownify and pptx
* add other converter para support
* fix linter
* Use *kwarg to pass keep_data_uri para.
* Add module cli vector tests
* Fixed formatting, and adjusted tests.
This commit is contained in:
Yuzhong Zhang
2025-03-21 09:50:23 +08:00
committed by GitHub
parent c0a511ecff
commit 52432bd228
12 changed files with 214 additions and 26 deletions

View File

@@ -6,9 +6,9 @@ import codecs
if __name__ == "__main__":
from _test_vectors import GENERAL_TEST_VECTORS
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
else:
from ._test_vectors import GENERAL_TEST_VECTORS
from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
from markitdown import (
MarkItDown,
@@ -124,10 +124,52 @@ def test_convert_url(test_vector):
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_with_data_uris(test_vector):
"""Test API functionality when keep_data_uris is enabled"""
markitdown = MarkItDown()
# Test local file conversion
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, test_vector.filename),
keep_data_uris=True,
url=test_vector.url,
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_stream_with_data_uris(test_vector):
"""Test the conversion of a stream with no stream info."""
markitdown = MarkItDown()
stream_info = StreamInfo(
extension=os.path.splitext(test_vector.filename)[1],
mimetype=test_vector.mimetype,
charset=test_vector.charset,
)
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(
stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
if __name__ == "__main__":
import sys
"""Runs this file's tests from the command line."""
# General tests
for test_function in [
test_guess_stream_info,
test_convert_local,
@@ -141,4 +183,17 @@ if __name__ == "__main__":
)
test_function(test_vector)
print("OK")
# Data URI tests
for test_function in [
test_convert_with_data_uris,
test_convert_stream_with_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
)
test_function(test_vector)
print("OK")
print("All tests passed!")