* optional reserve base64 string in markdown _CustomMarkdownify and pptx * add other converter para support * fix linter * Use *kwarg to pass keep_data_uri para. * Add module cli vector tests * Fixed formatting, and adjusted tests.
200 lines
6.4 KiB
Python
200 lines
6.4 KiB
Python
#!/usr/bin/env python3 -m pytest
|
|
import os
|
|
import time
|
|
import pytest
|
|
import codecs
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
|
|
else:
|
|
from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
|
|
|
|
from markitdown import (
|
|
MarkItDown,
|
|
UnsupportedFormatException,
|
|
FileConversionException,
|
|
StreamInfo,
|
|
)
|
|
|
|
skip_remote = (
|
|
True if os.environ.get("GITHUB_ACTIONS") else False
|
|
) # Don't run these tests in CI
|
|
|
|
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
|
TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_guess_stream_info(test_vector):
|
|
"""Test the ability to guess stream info."""
|
|
markitdown = MarkItDown()
|
|
|
|
local_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
|
|
expected_extension = os.path.splitext(test_vector.filename)[1]
|
|
|
|
with open(local_path, "rb") as stream:
|
|
guesses = markitdown._get_stream_info_guesses(
|
|
stream,
|
|
base_guess=StreamInfo(
|
|
filename=os.path.basename(test_vector.filename),
|
|
local_path=local_path,
|
|
extension=expected_extension,
|
|
),
|
|
)
|
|
|
|
# For some limited exceptions, we can't guarantee the exact
|
|
# mimetype or extension, so we'll special-case them here.
|
|
if test_vector.filename in [
|
|
"test_outlook_msg.msg",
|
|
]:
|
|
return
|
|
|
|
assert guesses[0].mimetype == test_vector.mimetype
|
|
assert guesses[0].extension == expected_extension
|
|
assert guesses[0].charset == test_vector.charset
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_convert_local(test_vector):
|
|
"""Test the conversion of a local file."""
|
|
markitdown = MarkItDown()
|
|
|
|
result = markitdown.convert(
|
|
os.path.join(TEST_FILES_DIR, test_vector.filename), url=test_vector.url
|
|
)
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_convert_stream_with_hints(test_vector):
|
|
"""Test the conversion of a stream with full stream info."""
|
|
markitdown = MarkItDown()
|
|
|
|
stream_info = StreamInfo(
|
|
extension=os.path.splitext(test_vector.filename)[1],
|
|
mimetype=test_vector.mimetype,
|
|
charset=test_vector.charset,
|
|
)
|
|
|
|
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
|
result = markitdown.convert(
|
|
stream, stream_info=stream_info, url=test_vector.url
|
|
)
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_convert_stream_without_hints(test_vector):
|
|
"""Test the conversion of a stream with no stream info."""
|
|
markitdown = MarkItDown()
|
|
|
|
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
|
result = markitdown.convert(stream, url=test_vector.url)
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
skip_remote,
|
|
reason="do not run tests that query external urls",
|
|
)
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_convert_url(test_vector):
|
|
"""Test the conversion of a stream with no stream info."""
|
|
markitdown = MarkItDown()
|
|
|
|
time.sleep(1) # Ensure we don't hit rate limits
|
|
|
|
result = markitdown.convert(
|
|
TEST_FILES_URL + "/" + test_vector.filename,
|
|
url=test_vector.url, # Mock where this file would be found
|
|
)
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
|
|
def test_convert_with_data_uris(test_vector):
|
|
"""Test API functionality when keep_data_uris is enabled"""
|
|
markitdown = MarkItDown()
|
|
|
|
# Test local file conversion
|
|
result = markitdown.convert(
|
|
os.path.join(TEST_FILES_DIR, test_vector.filename),
|
|
keep_data_uris=True,
|
|
url=test_vector.url,
|
|
)
|
|
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
|
|
def test_convert_stream_with_data_uris(test_vector):
|
|
"""Test the conversion of a stream with no stream info."""
|
|
markitdown = MarkItDown()
|
|
|
|
stream_info = StreamInfo(
|
|
extension=os.path.splitext(test_vector.filename)[1],
|
|
mimetype=test_vector.mimetype,
|
|
charset=test_vector.charset,
|
|
)
|
|
|
|
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
|
result = markitdown.convert(
|
|
stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
|
|
)
|
|
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
|
|
"""Runs this file's tests from the command line."""
|
|
|
|
# General tests
|
|
for test_function in [
|
|
test_guess_stream_info,
|
|
test_convert_local,
|
|
test_convert_stream_with_hints,
|
|
test_convert_stream_without_hints,
|
|
test_convert_url,
|
|
]:
|
|
for test_vector in GENERAL_TEST_VECTORS:
|
|
print(
|
|
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
|
|
)
|
|
test_function(test_vector)
|
|
print("OK")
|
|
|
|
# Data URI tests
|
|
for test_function in [
|
|
test_convert_with_data_uris,
|
|
test_convert_stream_with_data_uris,
|
|
]:
|
|
for test_vector in DATA_URI_TEST_VECTORS:
|
|
print(
|
|
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
|
|
)
|
|
test_function(test_vector)
|
|
print("OK")
|
|
|
|
print("All tests passed!")
|