Files
test/packages/markitdown/tests/test_cli_vectors.py
t3tra cb421cf9ea Chore: Make linter happy (#1256)
* refactor: remove unused imports

* fix: replace NotImplemented with NotImplementedError

* refactor: resolve E722 (do not use bare 'except')

* refactor: remove unused variable

* refactor: remove unused imports

* refactor: ignore unused imports that will be used in the future

* refactor: resolve W293 (blank line contains whitespace)

* refactor: resolve F541 (f-string is missing placeholders)

---------

Co-authored-by: afourney <adamfo@microsoft.com>
2025-05-21 10:02:16 -07:00

218 lines
6.8 KiB
Python

#!/usr/bin/env python3 -m pytest
import os
import time
import pytest
import subprocess
import locale
from typing import List
if __name__ == "__main__":
from _test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
FileTestVector,
)
else:
from ._test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
FileTestVector,
)
skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False
) # Don't run these tests in CI
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"
# Prepare CLI test vectors (remove vectors that require mockig the url)
CLI_TEST_VECTORS: List[FileTestVector] = []
for test_vector in GENERAL_TEST_VECTORS:
if test_vector.url is not None:
continue
CLI_TEST_VECTORS.append(test_vector)
@pytest.fixture(scope="session")
def shared_tmp_dir(tmp_path_factory):
return tmp_path_factory.mktemp("pytest_tmp")
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_output_to_stdout(shared_tmp_dir, test_vector) -> None:
"""Test that the CLI outputs to stdout correctly."""
result = subprocess.run(
[
"python",
"-m",
"markitdown",
os.path.join(TEST_FILES_DIR, test_vector.filename),
],
capture_output=True,
text=True,
)
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
for test_string in test_vector.must_include:
assert test_string in result.stdout
for test_string in test_vector.must_not_include:
assert test_string not in result.stdout
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_output_to_file(shared_tmp_dir, test_vector) -> None:
"""Test that the CLI outputs to a file correctly."""
output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
result = subprocess.run(
[
"python",
"-m",
"markitdown",
"-o",
output_file,
os.path.join(TEST_FILES_DIR, test_vector.filename),
],
capture_output=True,
text=True,
)
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
assert os.path.exists(output_file), f"Output file not created: {output_file}"
with open(output_file, "r") as f:
output_data = f.read()
for test_string in test_vector.must_include:
assert test_string in output_data
for test_string in test_vector.must_not_include:
assert test_string not in output_data
os.remove(output_file)
assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
"""Test that the CLI readds from stdin correctly."""
test_input = b""
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
test_input = stream.read()
result = subprocess.run(
[
"python",
"-m",
"markitdown",
os.path.join(TEST_FILES_DIR, test_vector.filename),
],
input=test_input,
capture_output=True,
text=False,
)
stdout = result.stdout.decode(locale.getpreferredencoding())
assert (
result.returncode == 0
), f"CLI exited with error: {result.stderr.decode('utf-8')}"
for test_string in test_vector.must_include:
assert test_string in stdout
for test_string in test_vector.must_not_include:
assert test_string not in stdout
@pytest.mark.skipif(
skip_remote,
reason="do not run tests that query external urls",
)
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_convert_url(shared_tmp_dir, test_vector):
"""Test the conversion of a stream with no stream info."""
# Note: tmp_dir is not used here, but is needed to match the signature
time.sleep(1) # Ensure we don't hit rate limits
result = subprocess.run(
["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename],
capture_output=True,
text=False,
)
stdout = result.stdout.decode(locale.getpreferredencoding())
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
for test_string in test_vector.must_include:
assert test_string in stdout
for test_string in test_vector.must_not_include:
assert test_string not in stdout
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
"""Test CLI functionality when keep_data_uris is enabled"""
output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
result = subprocess.run(
[
"python",
"-m",
"markitdown",
"--keep-data-uris",
"-o",
output_file,
os.path.join(TEST_FILES_DIR, test_vector.filename),
],
capture_output=True,
text=True,
)
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
assert os.path.exists(output_file), f"Output file not created: {output_file}"
with open(output_file, "r") as f:
output_data = f.read()
for test_string in test_vector.must_include:
assert test_string in output_data
for test_string in test_vector.must_not_include:
assert test_string not in output_data
os.remove(output_file)
assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
if __name__ == "__main__":
import tempfile
"""Runs this file's tests from the command line."""
with tempfile.TemporaryDirectory() as tmp_dir:
# General tests
for test_function in [
test_output_to_stdout,
test_output_to_file,
test_input_from_stdin_without_hints,
test_convert_url,
]:
for test_vector in CLI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...",
end="",
)
test_function(tmp_dir, test_vector)
print("OK")
# Data URI tests
for test_function in [
test_output_to_file_with_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...",
end="",
)
test_function(tmp_dir, test_vector)
print("OK")
print("All tests passed!")