#!/usr/bin/env python3 -m pytest import io import os import re import shutil import openai import pytest from markitdown._uri_utils import parse_data_uri, file_uri_to_path from markitdown import ( MarkItDown, UnsupportedFormatException, FileConversionException, StreamInfo, ) # This file contains module tests that are not directly tested by the FileTestVectors. # This includes things like helper functions and runtime conversion options # (e.g., LLM clients, exiftool path, transcription services, etc.) skip_remote = ( True if os.environ.get("GITHUB_ACTIONS") else False ) # Don't run these tests in CI # Don't run the llm tests without a key and the client library skip_llm = False if os.environ.get("OPENAI_API_KEY") else True try: import openai except ModuleNotFoundError: skip_llm = True # Skip exiftool tests if not installed skip_exiftool = shutil.which("exiftool") is None TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") JPG_TEST_EXIFTOOL = { "Author": "AutoGen Authors", "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "Description": "AutoGen enables diverse LLM-based applications", "ImageSize": "1615x1967", "DateTimeOriginal": "2024:03:14 22:10:00", } MP3_TEST_EXIFTOOL = { "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e", "Artist": "Artist Name Test String", "Album": "Album Name Test String", "SampleRate": "48000", } PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf" PDF_TEST_STRINGS = [ "While there is contemporaneous exploration of multi-agent approaches" ] YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg" YOUTUBE_TEST_STRINGS = [ "## AutoGen FULL Tutorial with Python (Step-By-Step)", "This is an intermediate tutorial for installing and using AutoGen locally", "PT15M4S", "the model we're going to be using today is GPT 3.5 turbo", # From the transcript ] DOCX_COMMENT_TEST_STRINGS = [ "314b0a30-5b04-470b-b9f7-eed2c2bec74a", "49e168b7-d2ae-407f-a055-2167576f39a1", "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", "# Abstract", "# Introduction", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "This is a test comment. 12df-321a", "Yet another comment in the doc. 55yiyi-asd09", ] BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" BLOG_TEST_STRINGS = [ "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?", "an example where high cost can easily prevent a generic complex", ] LLM_TEST_STRINGS = [ "5bda1dd6", ] PPTX_TEST_STRINGS = [ "2cdda5c8-e50e-4db4-b5f0-9722a649f455", "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", "1b92870d-e3b5-4e65-8153-919f4ff45592", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title "2003", # chart value ] # --- Helper Functions --- def validate_strings(result, expected_strings, exclude_strings=None): """Validate presence or absence of specific strings.""" text_content = result.text_content.replace("\\", "") for string in expected_strings: assert string in text_content if exclude_strings: for string in exclude_strings: assert string not in text_content def test_stream_info_operations() -> None: """Test operations performed on StreamInfo objects.""" stream_info_original = StreamInfo( mimetype="mimetype.1", extension="extension.1", charset="charset.1", filename="filename.1", local_path="local_path.1", url="url.1", ) # Check updating all attributes by keyword keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"] for keyword in keywords: updated_stream_info = stream_info_original.copy_and_update( **{keyword: f"{keyword}.2"} ) # Make sure the targted attribute is updated assert getattr(updated_stream_info, keyword) == f"{keyword}.2" # Make sure the other attributes are unchanged for k in keywords: if k != keyword: assert getattr(stream_info_original, k) == getattr( updated_stream_info, k ) # Check updating all attributes by passing a new StreamInfo object keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"] for keyword in keywords: updated_stream_info = stream_info_original.copy_and_update( StreamInfo(**{keyword: f"{keyword}.2"}) ) # Make sure the targted attribute is updated assert getattr(updated_stream_info, keyword) == f"{keyword}.2" # Make sure the other attributes are unchanged for k in keywords: if k != keyword: assert getattr(stream_info_original, k) == getattr( updated_stream_info, k ) # Check mixing and matching updated_stream_info = stream_info_original.copy_and_update( StreamInfo(extension="extension.2", filename="filename.2"), mimetype="mimetype.3", charset="charset.3", ) assert updated_stream_info.extension == "extension.2" assert updated_stream_info.filename == "filename.2" assert updated_stream_info.mimetype == "mimetype.3" assert updated_stream_info.charset == "charset.3" assert updated_stream_info.local_path == "local_path.1" assert updated_stream_info.url == "url.1" # Check multiple StreamInfo objects updated_stream_info = stream_info_original.copy_and_update( StreamInfo(extension="extension.4", filename="filename.5"), StreamInfo(mimetype="mimetype.6", charset="charset.7"), ) assert updated_stream_info.extension == "extension.4" assert updated_stream_info.filename == "filename.5" assert updated_stream_info.mimetype == "mimetype.6" assert updated_stream_info.charset == "charset.7" assert updated_stream_info.local_path == "local_path.1" assert updated_stream_info.url == "url.1" def test_data_uris() -> None: # Test basic parsing of data URIs data_uri = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type == "text/plain" assert len(attributes) == 0 assert data == b"Hello, World!" data_uri = "data:base64,SGVsbG8sIFdvcmxkIQ==" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type is None assert len(attributes) == 0 assert data == b"Hello, World!" data_uri = "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ==" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type == "text/plain" assert len(attributes) == 1 assert attributes["charset"] == "utf-8" assert data == b"Hello, World!" data_uri = "data:,Hello%2C%20World%21" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type is None assert len(attributes) == 0 assert data == b"Hello, World!" data_uri = "data:text/plain,Hello%2C%20World%21" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type == "text/plain" assert len(attributes) == 0 assert data == b"Hello, World!" data_uri = "data:text/plain;charset=utf-8,Hello%2C%20World%21" mime_type, attributes, data = parse_data_uri(data_uri) assert mime_type == "text/plain" assert len(attributes) == 1 assert attributes["charset"] == "utf-8" assert data == b"Hello, World!" def test_file_uris() -> None: # Test file URI with an empty host file_uri = "file:///path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc is None assert path == "/path/to/file.txt" # Test file URI with no host file_uri = "file:/path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc is None assert path == "/path/to/file.txt" # Test file URI with localhost file_uri = "file://localhost/path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc == "localhost" assert path == "/path/to/file.txt" # Test file URI with query parameters file_uri = "file:///path/to/file.txt?param=value" netloc, path = file_uri_to_path(file_uri) assert netloc is None assert path == "/path/to/file.txt" # Test file URI with fragment file_uri = "file:///path/to/file.txt#fragment" netloc, path = file_uri_to_path(file_uri) assert netloc is None assert path == "/path/to/file.txt" def test_docx_comments() -> None: markitdown = MarkItDown() # Test DOCX processing, with comments and setting style_map on init markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") result = markitdown_with_style_map.convert( os.path.join(TEST_FILES_DIR, "test_with_comment.docx") ) validate_strings(result, DOCX_COMMENT_TEST_STRINGS) def test_docx_equations() -> None: markitdown = MarkItDown() docx_file = os.path.join(TEST_FILES_DIR, "equations.docx") result = markitdown.convert(docx_file) # Check for inline equation m=1 (wrapped with single $) is present assert "$m=1$" in result.text_content, "Inline equation $m=1$ not found" # Find block equations wrapped with double $$ and check if they are present block_equations = re.findall(r"\$\$(.+?)\$\$", result.text_content) assert block_equations, "No block equations found in the document." def test_input_as_strings() -> None: markitdown = MarkItDown() # Test input from a stream input_data = b"

Test

" result = markitdown.convert_stream(io.BytesIO(input_data)) assert "# Test" in result.text_content # Test input with leading blank characters input_data = b" \n\n\n

Test

" result = markitdown.convert_stream(io.BytesIO(input_data)) assert "# Test" in result.text_content @pytest.mark.skipif( skip_remote, reason="do not run tests that query external urls", ) def test_markitdown_remote() -> None: markitdown = MarkItDown() # By URL result = markitdown.convert(PDF_TEST_URL) for test_string in PDF_TEST_STRINGS: assert test_string in result.text_content # Youtube result = markitdown.convert(YOUTUBE_TEST_URL) for test_string in YOUTUBE_TEST_STRINGS: assert test_string in result.text_content @pytest.mark.skipif( skip_remote, reason="do not run remotely run speech transcription tests", ) def test_speech_transcription() -> None: markitdown = MarkItDown() # Test WAV files, MP3 and M4A files for file_name in ["test.wav", "test.mp3", "test.m4a"]: result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name)) result_lower = result.text_content.lower() assert ( ("1" in result_lower or "one" in result_lower) and ("2" in result_lower or "two" in result_lower) and ("3" in result_lower or "three" in result_lower) and ("4" in result_lower or "four" in result_lower) and ("5" in result_lower or "five" in result_lower) ) def test_exceptions() -> None: # Check that an exception is raised when trying to convert an unsupported format markitdown = MarkItDown() with pytest.raises(UnsupportedFormatException): markitdown.convert(os.path.join(TEST_FILES_DIR, "random.bin")) # Check that an exception is raised when trying to convert a file that is corrupted with pytest.raises(FileConversionException) as exc_info: markitdown.convert( os.path.join(TEST_FILES_DIR, "random.bin"), file_extension=".pptx" ) assert len(exc_info.value.attempts) == 1 assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter" @pytest.mark.skipif( skip_exiftool, reason="do not run if exiftool is not installed", ) def test_markitdown_exiftool() -> None: which_exiftool = shutil.which("exiftool") assert which_exiftool is not None # Test explicitly setting the location of exiftool markitdown = MarkItDown(exiftool_path=which_exiftool) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) for key in JPG_TEST_EXIFTOOL: target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" assert target in result.text_content # Test setting the exiftool path through an environment variable os.environ["EXIFTOOL_PATH"] = which_exiftool markitdown = MarkItDown() result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg")) for key in JPG_TEST_EXIFTOOL: target = f"{key}: {JPG_TEST_EXIFTOOL[key]}" assert target in result.text_content # Test some other media types result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3")) for key in MP3_TEST_EXIFTOOL: target = f"{key}: {MP3_TEST_EXIFTOOL[key]}" assert target in result.text_content @pytest.mark.skipif( skip_llm, reason="do not run llm tests without a key", ) def test_markitdown_llm() -> None: client = openai.OpenAI() markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o") result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg")) for test_string in LLM_TEST_STRINGS: assert test_string in result.text_content # This is not super precise. It would also accept "red square", "blue circle", # "the square is not blue", etc. But it's sufficient for this test. for test_string in ["red", "circle", "blue", "square"]: assert test_string in result.text_content.lower() # Images embedded in PPTX files result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) # LLM Captions are included for test_string in LLM_TEST_STRINGS: assert test_string in result.text_content # Standard alt text is included validate_strings(result, PPTX_TEST_STRINGS) if __name__ == "__main__": """Runs this file's tests from the command line.""" for test in [ test_stream_info_operations, test_data_uris, test_file_uris, test_docx_comments, test_input_as_strings, test_markitdown_remote, test_speech_transcription, test_exceptions, test_markitdown_exiftool, test_markitdown_llm, ]: print(f"Running {test.__name__}...", end="") test() print("OK") print("All tests passed!")