test/packages/markitdown/tests/test_cli_vectors.py

#!/usr/bin/env python3 -m pytest
import locale
import os
import subprocess
import sys
import time
from typing import List

import pytest

if __name__ == "__main__":
    from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
else:
    from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector

from markitdown import (
    MarkItDown,
    UnsupportedFormatException,
    FileConversionException,
    StreamInfo,
)

# Don't run tests that query external urls in CI (GitHub Actions sets
# the GITHUB_ACTIONS environment variable).
skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))

# Directory of sample input files, located next to this test module.
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
# Base URL to which the URL-conversion test appends each test file name.
TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"


# Prepare CLI test vectors (remove vectors that require mocking the url).
# Built with a comprehension so no loop variable is left behind in the
# module namespace.
CLI_TEST_VECTORS: List[FileTestVector] = [
    vector for vector in GENERAL_TEST_VECTORS if vector.url is None
]


@pytest.fixture(scope="session")
def shared_tmp_dir(tmp_path_factory):
    """Create one temporary directory that is shared for the whole session.

    The directory is made through pytest's ``tmp_path_factory`` with the
    base name ``pytest_tmp``.
    """
    session_dir = tmp_path_factory.mktemp("pytest_tmp")
    return session_dir


@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_output_to_stdout(shared_tmp_dir, test_vector) -> None:
    """Test that the CLI outputs to stdout correctly.

    Args:
        shared_tmp_dir: Not used by this test; accepted so that all tests in
            this file share one signature (the ``__main__`` runner calls
            every test as ``test_function(tmp_dir, test_vector)``).
        test_vector: Vector supplying ``filename`` and the ``must_include`` /
            ``must_not_include`` strings checked against the output.
    """

    # Launch the CLI with the interpreter that is running the tests rather
    # than whatever "python" happens to resolve to on PATH.
    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "markitdown",
            os.path.join(TEST_FILES_DIR, test_vector.filename),
        ],
        capture_output=True,
        text=True,
    )

    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
    for test_string in test_vector.must_include:
        assert test_string in result.stdout
    for test_string in test_vector.must_not_include:
        assert test_string not in result.stdout


@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_output_to_file(shared_tmp_dir, test_vector) -> None:
    """Test that the CLI outputs to a file correctly.

    Args:
        shared_tmp_dir: Directory in which the output file is created.
        test_vector: Vector supplying ``filename`` and the ``must_include`` /
            ``must_not_include`` strings checked against the file content.
    """

    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")

    # Launch the CLI with the interpreter that is running the tests rather
    # than whatever "python" happens to resolve to on PATH.
    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "markitdown",
            "-o",
            output_file,
            os.path.join(TEST_FILES_DIR, test_vector.filename),
        ],
        capture_output=True,
        text=True,
    )

    try:
        assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
        assert os.path.exists(output_file), f"Output file not created: {output_file}"

        with open(output_file, "r") as f:
            output_data = f.read()

        for test_string in test_vector.must_include:
            assert test_string in output_data
        for test_string in test_vector.must_not_include:
            assert test_string not in output_data
    finally:
        # Remove the output even when an assertion above fails, so a failed
        # run does not leave files behind in the shared directory.
        if os.path.exists(output_file):
            os.remove(output_file)

    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"


@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
    """Test that the CLI reads from stdin correctly.

    The raw bytes of the test file are piped to the CLI's stdin and the
    decoded stdout is checked against the vector's expectations.

    Args:
        shared_tmp_dir: Not used by this test; accepted so that all tests in
            this file share one signature.
        test_vector: Vector supplying ``filename`` and the ``must_include`` /
            ``must_not_include`` strings checked against the output.
    """

    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
        test_input = stream.read()

    # NOTE(review): the file path is passed on the command line in addition
    # to the bytes on stdin, so the CLI may be reading the file rather than
    # stdin -- confirm against the CLI and drop the path argument if this
    # test is meant to exercise stdin only.
    #
    # Launch the CLI with the interpreter that is running the tests rather
    # than whatever "python" happens to resolve to on PATH.
    result = subprocess.run(
        [
            sys.executable,
            "-m",
            "markitdown",
            os.path.join(TEST_FILES_DIR, test_vector.filename),
        ],
        input=test_input,
        capture_output=True,
        text=False,
    )

    # Check the exit status before decoding stdout, and decode stderr
    # leniently, so that a decoding problem cannot hide the real failure.
    assert (
        result.returncode == 0
    ), f"CLI exited with error: {result.stderr.decode('utf-8', errors='replace')}"

    stdout = result.stdout.decode(locale.getpreferredencoding())
    for test_string in test_vector.must_include:
        assert test_string in stdout
    for test_string in test_vector.must_not_include:
        assert test_string not in stdout


@pytest.mark.skipif(
    skip_remote,
    reason="do not run tests that query external urls",
)
@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
def test_convert_url(shared_tmp_dir, test_vector):
    """Test that the CLI converts a document given as a remote URL.

    The URL is built by appending the vector's file name to
    ``TEST_FILES_URL``. The test is skipped when ``skip_remote`` is set.

    Args:
        shared_tmp_dir: Not used by this test; accepted so that all tests in
            this file share one signature.
        test_vector: Vector supplying ``filename`` and the ``must_include`` /
            ``must_not_include`` strings checked against the output.
    """

    time.sleep(1)  # Ensure we don't hit rate limits

    # Launch the CLI with the interpreter that is running the tests rather
    # than whatever "python" happens to resolve to on PATH.
    result = subprocess.run(
        [sys.executable, "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename],
        capture_output=True,
        text=False,
    )

    # stderr is bytes here (text=False); decode it leniently so the failure
    # message is readable and decoding cannot hide the real error.
    assert (
        result.returncode == 0
    ), f"CLI exited with error: {result.stderr.decode('utf-8', errors='replace')}"

    stdout = result.stdout.decode(locale.getpreferredencoding())
    for test_string in test_vector.must_include:
        assert test_string in stdout
    for test_string in test_vector.must_not_include:
        assert test_string not in stdout


if __name__ == "__main__":
    # Runs this file's tests from the command line.
    import sys
    import tempfile

    # Every test takes (tmp_dir, test_vector), so they can be driven by one
    # nested loop without pytest.
    standalone_tests = [
        test_output_to_stdout,
        test_output_to_file,
        test_input_from_stdin_without_hints,
        test_convert_url,
    ]

    with tempfile.TemporaryDirectory() as tmp_dir:
        for run_test in standalone_tests:
            for vector in CLI_TEST_VECTORS:
                progress = f"Running {run_test.__name__} on {vector.filename}..."
                print(progress, end="")
                run_test(tmp_dir, vector)
                print("OK")
    print("All tests passed!")