* Refactored tests. * Fixed CI errors, and included misc tests. * Omit mskanji from streaminfo test. * Omit mskanji from no hints test. * Log results of debugging in comments (linked to Magika issue) * Added docs as to when to use misc tests.
155 lines
5.0 KiB
Python
155 lines
5.0 KiB
Python
#!/usr/bin/env python3 -m pytest
|
|
import os
|
|
import time
|
|
import pytest
|
|
import codecs
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from _test_vectors import GENERAL_TEST_VECTORS
|
|
else:
|
|
from ._test_vectors import GENERAL_TEST_VECTORS
|
|
|
|
from markitdown import (
|
|
MarkItDown,
|
|
UnsupportedFormatException,
|
|
FileConversionException,
|
|
StreamInfo,
|
|
)
|
|
|
|
skip_remote = (
|
|
True if os.environ.get("GITHUB_ACTIONS") else False
|
|
) # Don't run these tests in CI
|
|
|
|
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
|
TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_guess_stream_info(test_vector):
|
|
"""Test the ability to guess stream info."""
|
|
markitdown = MarkItDown()
|
|
|
|
local_path = os.path.join(TEST_FILES_DIR, test_vector.filename)
|
|
expected_extension = os.path.splitext(test_vector.filename)[1]
|
|
|
|
with open(local_path, "rb") as stream:
|
|
guesses = markitdown._get_stream_info_guesses(
|
|
stream,
|
|
base_guess=StreamInfo(
|
|
filename=os.path.basename(test_vector.filename),
|
|
local_path=local_path,
|
|
extension=expected_extension,
|
|
),
|
|
)
|
|
|
|
# For some limited exceptions, we can't guarantee the exact
|
|
# mimetype or extension, so we'll special-case them here.
|
|
if test_vector.filename in [
|
|
"test_outlook_msg.msg",
|
|
"test_mskanji.csv", # See: https://github.com/google/magika/issues/983
|
|
]:
|
|
return
|
|
|
|
assert guesses[0].mimetype == test_vector.mimetype
|
|
assert guesses[0].extension == expected_extension
|
|
assert guesses[0].charset == test_vector.charset
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_convert_local(test_vector):
|
|
"""Test the conversion of a local file."""
|
|
markitdown = MarkItDown()
|
|
|
|
result = markitdown.convert(
|
|
os.path.join(TEST_FILES_DIR, test_vector.filename), url=test_vector.url
|
|
)
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_convert_stream_with_hints(test_vector):
|
|
"""Test the conversion of a stream with full stream info."""
|
|
markitdown = MarkItDown()
|
|
|
|
stream_info = StreamInfo(
|
|
extension=os.path.splitext(test_vector.filename)[1],
|
|
mimetype=test_vector.mimetype,
|
|
charset=test_vector.charset,
|
|
)
|
|
|
|
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
|
result = markitdown.convert(
|
|
stream, stream_info=stream_info, url=test_vector.url
|
|
)
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_convert_stream_without_hints(test_vector):
|
|
"""Test the conversion of a stream with no stream info."""
|
|
markitdown = MarkItDown()
|
|
|
|
# For some limited exceptions, we can't guarantee the exact
|
|
# mimetype or extension, so we'll special-case them here.
|
|
if test_vector.filename in [
|
|
# This appears to be a subtle bug in magika.
|
|
# See: https://github.com/google/magika/issues/983
|
|
"test_mskanji.csv",
|
|
]:
|
|
return
|
|
|
|
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
|
result = markitdown.convert(stream, url=test_vector.url)
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
skip_remote,
|
|
reason="do not run tests that query external urls",
|
|
)
|
|
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
|
|
def test_convert_url(test_vector):
|
|
"""Test the conversion of a stream with no stream info."""
|
|
markitdown = MarkItDown()
|
|
|
|
time.sleep(1) # Ensure we don't hit rate limits
|
|
|
|
result = markitdown.convert(
|
|
TEST_FILES_URL + "/" + test_vector.filename,
|
|
url=test_vector.url, # Mock where this file would be found
|
|
)
|
|
for string in test_vector.must_include:
|
|
assert string in result.markdown
|
|
for string in test_vector.must_not_include:
|
|
assert string not in result.markdown
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
|
|
"""Runs this file's tests from the command line."""
|
|
for test_function in [
|
|
test_guess_stream_info,
|
|
test_convert_local,
|
|
test_convert_stream_with_hints,
|
|
test_convert_stream_without_hints,
|
|
test_convert_url,
|
|
]:
|
|
for test_vector in GENERAL_TEST_VECTORS:
|
|
print(
|
|
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
|
|
)
|
|
test_function(test_vector)
|
|
print("OK")
|
|
print("All tests passed!")
|