173 lines
5.3 KiB
Python
173 lines
5.3 KiB
Python
#!/usr/bin/env python3 -m pytest
|
|
import os
|
|
import time
|
|
import pytest
|
|
import subprocess
|
|
import locale
|
|
from typing import List
|
|
|
|
if __name__ == "__main__":
|
|
from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
|
|
else:
|
|
from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector
|
|
|
|
from markitdown import (
|
|
MarkItDown,
|
|
UnsupportedFormatException,
|
|
FileConversionException,
|
|
StreamInfo,
|
|
)
|
|
|
|
skip_remote = (
|
|
True if os.environ.get("GITHUB_ACTIONS") else False
|
|
) # Don't run these tests in CI
|
|
|
|
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
|
TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"
|
|
|
|
|
|
# Prepare CLI test vectors (remove vectors that require mockig the url)
|
|
CLI_TEST_VECTORS: List[FileTestVector] = []
|
|
for test_vector in GENERAL_TEST_VECTORS:
|
|
if test_vector.url is not None:
|
|
continue
|
|
CLI_TEST_VECTORS.append(test_vector)
|
|
|
|
|
|
@pytest.fixture(scope="session")
|
|
def shared_tmp_dir(tmp_path_factory):
|
|
return tmp_path_factory.mktemp("pytest_tmp")
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
|
|
def test_output_to_stdout(shared_tmp_dir, test_vector) -> None:
|
|
"""Test that the CLI outputs to stdout correctly."""
|
|
|
|
result = subprocess.run(
|
|
[
|
|
"python",
|
|
"-m",
|
|
"markitdown",
|
|
os.path.join(TEST_FILES_DIR, test_vector.filename),
|
|
],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
|
|
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
|
|
for test_string in test_vector.must_include:
|
|
assert test_string in result.stdout
|
|
for test_string in test_vector.must_not_include:
|
|
assert test_string not in result.stdout
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
|
|
def test_output_to_file(shared_tmp_dir, test_vector) -> None:
|
|
"""Test that the CLI outputs to a file correctly."""
|
|
|
|
output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
|
|
result = subprocess.run(
|
|
[
|
|
"python",
|
|
"-m",
|
|
"markitdown",
|
|
"-o",
|
|
output_file,
|
|
os.path.join(TEST_FILES_DIR, test_vector.filename),
|
|
],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
|
|
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
|
|
assert os.path.exists(output_file), f"Output file not created: {output_file}"
|
|
|
|
with open(output_file, "r") as f:
|
|
output_data = f.read()
|
|
for test_string in test_vector.must_include:
|
|
assert test_string in output_data
|
|
for test_string in test_vector.must_not_include:
|
|
assert test_string not in output_data
|
|
|
|
os.remove(output_file)
|
|
assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
|
|
|
|
|
|
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
|
|
def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
|
|
"""Test that the CLI readds from stdin correctly."""
|
|
|
|
test_input = b""
|
|
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
|
test_input = stream.read()
|
|
|
|
result = subprocess.run(
|
|
[
|
|
"python",
|
|
"-m",
|
|
"markitdown",
|
|
os.path.join(TEST_FILES_DIR, test_vector.filename),
|
|
],
|
|
input=test_input,
|
|
capture_output=True,
|
|
text=False,
|
|
)
|
|
|
|
stdout = result.stdout.decode(locale.getpreferredencoding())
|
|
assert (
|
|
result.returncode == 0
|
|
), f"CLI exited with error: {result.stderr.decode('utf-8')}"
|
|
for test_string in test_vector.must_include:
|
|
assert test_string in stdout
|
|
for test_string in test_vector.must_not_include:
|
|
assert test_string not in stdout
|
|
|
|
|
|
@pytest.mark.skipif(
|
|
skip_remote,
|
|
reason="do not run tests that query external urls",
|
|
)
|
|
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
|
|
def test_convert_url(shared_tmp_dir, test_vector):
|
|
"""Test the conversion of a stream with no stream info."""
|
|
# Note: tmp_dir is not used here, but is needed to match the signature
|
|
|
|
markitdown = MarkItDown()
|
|
|
|
time.sleep(1) # Ensure we don't hit rate limits
|
|
result = subprocess.run(
|
|
["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename],
|
|
capture_output=True,
|
|
text=False,
|
|
)
|
|
|
|
stdout = result.stdout.decode(locale.getpreferredencoding())
|
|
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
|
|
for test_string in test_vector.must_include:
|
|
assert test_string in stdout
|
|
for test_string in test_vector.must_not_include:
|
|
assert test_string not in stdout
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
import tempfile
|
|
|
|
"""Runs this file's tests from the command line."""
|
|
|
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
for test_function in [
|
|
test_output_to_stdout,
|
|
test_output_to_file,
|
|
test_input_from_stdin_without_hints,
|
|
test_convert_url,
|
|
]:
|
|
for test_vector in CLI_TEST_VECTORS:
|
|
print(
|
|
f"Running {test_function.__name__} on {test_vector.filename}...",
|
|
end="",
|
|
)
|
|
test_function(tmp_dir, test_vector)
|
|
print("OK")
|
|
print("All tests passed!")
|