Refactored tests. (#1120)

* Refactored tests. * Fixed CI errors, and included misc tests. * Omit mskanji from streaminfo test. * Omit mskanji from no hints test. * Log results of debugging in comments (linked to Magika issue) * Added docs as to when to use misc tests.
2025-03-12 11:08:06 -07:00
parent 75140a90e2
commit 5f75e16d20
7 changed files with 649 additions and 429 deletions
--- a/packages/markitdown/tests/test_cli_vectors.py
+++ b/packages/markitdown/tests/test_cli_vectors.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3 -m pytest
+import locale
+import os
+import subprocess
+import sys
+import time
+from typing import List
+
+import pytest
+
+if __name__ == "__main__":
+    from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
+else:
+    from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector
+
+from markitdown import (
+    MarkItDown,
+    UnsupportedFormatException,
+    FileConversionException,
+    StreamInfo,
+)
+
+# Remote (network) tests are skipped whenever the GITHUB_ACTIONS environment
+# variable is set to a non-empty value, i.e. they are not run in CI.
+skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))
+
+# URL prefix used by test_convert_url, and the local directory (next to this
+# file) that the other CLI tests read their input files from.
+TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+
+
+# Prepare CLI test vectors (drop vectors that require mocking the url)
+CLI_TEST_VECTORS: List[FileTestVector] = [
+    vector for vector in GENERAL_TEST_VECTORS if vector.url is None
+]
+
+
+@pytest.fixture(scope="session")
+def shared_tmp_dir(tmp_path_factory):
+    """Provide one temporary directory shared by all tests in the session."""
+    session_dir = tmp_path_factory.mktemp("pytest_tmp")
+    return session_dir
+
+
+@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
+def test_output_to_stdout(shared_tmp_dir, test_vector) -> None:
+    """Test that the CLI outputs to stdout correctly.
+
+    Runs ``markitdown <file>`` in a subprocess and checks the captured text
+    output against the vector's expectations.
+
+    Args:
+        shared_tmp_dir: Unused here; kept so every CLI test has the same
+            signature.
+        test_vector: Vector supplying ``filename``, ``must_include`` and
+            ``must_not_include``.
+    """
+
+    result = subprocess.run(
+        [
+            # Launch the interpreter that is running the tests rather than
+            # whatever "python" happens to resolve to on PATH.
+            sys.executable,
+            "-m",
+            "markitdown",
+            os.path.join(TEST_FILES_DIR, test_vector.filename),
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    for test_string in test_vector.must_include:
+        assert test_string in result.stdout
+    for test_string in test_vector.must_not_include:
+        assert test_string not in result.stdout
+
+
+@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
+def test_output_to_file(shared_tmp_dir, test_vector) -> None:
+    """Test that the CLI outputs to a file correctly.
+
+    Runs ``markitdown -o <output> <file>`` in a subprocess, checks that the
+    output file exists and satisfies the vector's expectations, then removes
+    it again.
+
+    Args:
+        shared_tmp_dir: Directory in which the output file is created.
+        test_vector: Vector supplying ``filename``, ``must_include`` and
+            ``must_not_include``.
+    """
+
+    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
+    result = subprocess.run(
+        [
+            # Launch the interpreter that is running the tests rather than
+            # whatever "python" happens to resolve to on PATH.
+            sys.executable,
+            "-m",
+            "markitdown",
+            "-o",
+            output_file,
+            os.path.join(TEST_FILES_DIR, test_vector.filename),
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    try:
+        assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+        assert os.path.exists(output_file), f"Output file not created: {output_file}"
+
+        # NOTE(review): the file is read with the default text encoding, as
+        # before; confirm this matches the encoding the CLI writes with.
+        with open(output_file, "r") as f:
+            output_data = f.read()
+
+        for test_string in test_vector.must_include:
+            assert test_string in output_data
+        for test_string in test_vector.must_not_include:
+            assert test_string not in output_data
+    finally:
+        # Clean up even when a check above fails, so a failed run does not
+        # leave its output behind in the shared directory.
+        if os.path.exists(output_file):
+            os.remove(output_file)
+
+    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
+
+
+@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
+def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
+    """Test that the CLI reads from stdin correctly.
+
+    Feeds the raw bytes of the test file to the subprocess on stdin, decodes
+    the captured stdout with the locale's preferred encoding, and checks it
+    against the vector's expectations.
+
+    Args:
+        shared_tmp_dir: Unused here; kept so every CLI test has the same
+            signature.
+        test_vector: Vector supplying ``filename``, ``must_include`` and
+            ``must_not_include``.
+    """
+
+    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
+        test_input = stream.read()
+
+    # NOTE(review): the file path is still passed as an argument in addition
+    # to the stdin payload, so the CLI may be reading the file rather than
+    # stdin -- confirm whether the positional argument should be dropped.
+    result = subprocess.run(
+        [
+            # Launch the interpreter that is running the tests rather than
+            # whatever "python" happens to resolve to on PATH.
+            sys.executable,
+            "-m",
+            "markitdown",
+            os.path.join(TEST_FILES_DIR, test_vector.filename),
+        ],
+        input=test_input,
+        capture_output=True,
+        text=False,
+    )
+
+    stdout = result.stdout.decode(locale.getpreferredencoding())
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    for test_string in test_vector.must_include:
+        assert test_string in stdout
+    for test_string in test_vector.must_not_include:
+        assert test_string not in stdout
+
+
+@pytest.mark.skipif(
+    skip_remote,
+    reason="do not run tests that query external urls",
+)
+@pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
+def test_convert_url(shared_tmp_dir, test_vector):
+    """Test that the CLI converts a document addressed by URL.
+
+    Runs ``markitdown <TEST_FILES_URL>/<filename>`` in a subprocess, decodes
+    the captured stdout with the locale's preferred encoding, and checks it
+    against the vector's expectations.
+
+    Args:
+        shared_tmp_dir: Unused here; kept so every CLI test has the same
+            signature.
+        test_vector: Vector supplying ``filename``, ``must_include`` and
+            ``must_not_include``.
+    """
+
+    time.sleep(1)  # Ensure we don't hit rate limits
+    result = subprocess.run(
+        [
+            # Launch the interpreter that is running the tests rather than
+            # whatever "python" happens to resolve to on PATH.
+            sys.executable,
+            "-m",
+            "markitdown",
+            TEST_FILES_URL + "/" + test_vector.filename,
+        ],
+        capture_output=True,
+        text=False,
+    )
+
+    stdout = result.stdout.decode(locale.getpreferredencoding())
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    for test_string in test_vector.must_include:
+        assert test_string in stdout
+    for test_string in test_vector.must_not_include:
+        assert test_string not in stdout
+
+
+if __name__ == "__main__":
+    import sys
+    import tempfile
+
+    # Runs this file's tests from the command line: every test function is
+    # called once per CLI test vector, with one temporary directory shared by
+    # all of the calls.
+    runnable_tests = (
+        test_output_to_stdout,
+        test_output_to_file,
+        test_input_from_stdin_without_hints,
+        test_convert_url,
+    )
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        for test_function in runnable_tests:
+            for test_vector in CLI_TEST_VECTORS:
+                progress = (
+                    f"Running {test_function.__name__} on {test_vector.filename}..."
+                )
+                print(progress, end="")
+                test_function(tmp_dir, test_vector)
+                print("OK")
+    print("All tests passed!")