#!/usr/bin/env python3 -m pytest
import io
import os
import re
import shutil

import pytest

from markitdown._uri_utils import parse_data_uri, file_uri_to_path

from markitdown import (
    MarkItDown,
    UnsupportedFormatException,
    FileConversionException,
    StreamInfo,
)

# This file contains module tests that are not directly tested by the FileTestVectors.
# This includes things like helper functions and runtime conversion options
# (e.g., LLM clients, exiftool path, transcription services, etc.)

# Don't run these tests in CI
skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))

# Don't run the llm tests without a key and the client library.
# openai is an optional dependency, so it is imported only inside this guard:
# an unconditional top-level import would raise before skip_llm could be set.
skip_llm = not os.environ.get("OPENAI_API_KEY")
try:
    import openai
except ModuleNotFoundError:
    skip_llm = True

# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")

JPG_TEST_EXIFTOOL = {
    "Author": "AutoGen Authors",
    "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "Description": "AutoGen enables diverse LLM-based applications",
    "ImageSize": "1615x1967",
    "DateTimeOriginal": "2024:03:14 22:10:00",
}

MP3_TEST_EXIFTOOL = {
    "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e",
    "Artist": "Artist Name Test String",
    "Album": "Album Name Test String",
    "SampleRate": "48000",
}

PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
PDF_TEST_STRINGS = [
    "While there is contemporaneous exploration of multi-agent approaches"
]

YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg"
YOUTUBE_TEST_STRINGS = [
    "## AutoGen FULL Tutorial with Python (Step-By-Step)",
    "This is an intermediate tutorial for installing and using AutoGen locally",
    "PT15M4S",
    "the model we're going to be using today is GPT 3.5 turbo",  # From the transcript
]

DOCX_COMMENT_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "This is a test comment. 12df-321a",
    "Yet another comment in the doc. 55yiyi-asd09",
]

BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_TEST_STRINGS = [
    "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
    "an example where high cost can easily prevent a generic complex",
]

LLM_TEST_STRINGS = [
    "5bda1dd6",
]

PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
    "2003",  # chart value
]


# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
    """Validate presence or absence of specific strings.

    Args:
        result: Object exposing a ``text_content`` string attribute.
        expected_strings: Strings that must each appear in the text.
        exclude_strings: Optional strings that must not appear in the text.

    Raises:
        AssertionError: If an expected string is missing or an excluded
            string is present.
    """
    # Backslashes are removed before comparing (presumably so that escaped
    # characters in the converted output do not break substring matching).
    text_content = result.text_content.replace("\\", "")
    for string in expected_strings:
        assert string in text_content
    if exclude_strings:
        for string in exclude_strings:
            assert string not in text_content


def test_stream_info_operations() -> None:
    """Test operations performed on StreamInfo objects."""

    stream_info_original = StreamInfo(
        mimetype="mimetype.1",
        extension="extension.1",
        charset="charset.1",
        filename="filename.1",
        local_path="local_path.1",
        url="url.1",
    )

    keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]

    def assert_only_changed(updated_stream_info, keyword: str) -> None:
        """Assert that only `keyword` differs from the original, set to '<keyword>.2'."""
        # Make sure the targeted attribute is updated
        assert getattr(updated_stream_info, keyword) == f"{keyword}.2"

        # Make sure the other attributes are unchanged
        for k in keywords:
            if k != keyword:
                assert getattr(stream_info_original, k) == getattr(
                    updated_stream_info, k
                )

    # Check updating all attributes by keyword
    for keyword in keywords:
        updated_stream_info = stream_info_original.copy_and_update(
            **{keyword: f"{keyword}.2"}
        )
        assert_only_changed(updated_stream_info, keyword)

    # Check updating all attributes by passing a new StreamInfo object
    for keyword in keywords:
        updated_stream_info = stream_info_original.copy_and_update(
            StreamInfo(**{keyword: f"{keyword}.2"})
        )
        assert_only_changed(updated_stream_info, keyword)

    # Check mixing and matching
    updated_stream_info = stream_info_original.copy_and_update(
        StreamInfo(extension="extension.2", filename="filename.2"),
        mimetype="mimetype.3",
        charset="charset.3",
    )
    assert updated_stream_info.extension == "extension.2"
    assert updated_stream_info.filename == "filename.2"
    assert updated_stream_info.mimetype == "mimetype.3"
    assert updated_stream_info.charset == "charset.3"
    assert updated_stream_info.local_path == "local_path.1"
    assert updated_stream_info.url == "url.1"

    # Check multiple StreamInfo objects
    updated_stream_info = stream_info_original.copy_and_update(
        StreamInfo(extension="extension.4", filename="filename.5"),
        StreamInfo(mimetype="mimetype.6", charset="charset.7"),
    )
    assert updated_stream_info.extension == "extension.4"
    assert updated_stream_info.filename == "filename.5"
    assert updated_stream_info.mimetype == "mimetype.6"
    assert updated_stream_info.charset == "charset.7"
    assert updated_stream_info.local_path == "local_path.1"
    assert updated_stream_info.url == "url.1"


def test_data_uris() -> None:
    """Test basic parsing of data URIs.

    Every case carries the payload ``Hello, World!``, either base64-encoded
    or percent-encoded; the cases vary the mime type and the attributes.
    """
    # (data URI, expected mime type, expected attributes)
    cases = [
        ("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==", "text/plain", {}),
        ("data:base64,SGVsbG8sIFdvcmxkIQ==", None, {}),
        (
            "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ==",
            "text/plain",
            {"charset": "utf-8"},
        ),
        ("data:,Hello%2C%20World%21", None, {}),
        ("data:text/plain,Hello%2C%20World%21", "text/plain", {}),
        (
            "data:text/plain;charset=utf-8,Hello%2C%20World%21",
            "text/plain",
            {"charset": "utf-8"},
        ),
    ]

    for data_uri, expected_mime_type, expected_attributes in cases:
        mime_type, attributes, data = parse_data_uri(data_uri)

        # The URI is used as the assertion message so a failure names its case.
        if expected_mime_type is None:
            assert mime_type is None, data_uri
        else:
            assert mime_type == expected_mime_type, data_uri

        assert len(attributes) == len(expected_attributes), data_uri
        for key, value in expected_attributes.items():
            assert attributes[key] == value, data_uri

        assert data == b"Hello, World!", data_uri

def test_file_uris() -> None:
    """Test converting file URIs into a (netloc, path) pair.

    Every case is expected to resolve to the same path; only the expected
    network location differs.
    """
    # (case description, file URI, expected netloc)
    cases = [
        ("file URI with an empty host", "file:///path/to/file.txt", None),
        ("file URI with no host", "file:/path/to/file.txt", None),
        ("file URI with localhost", "file://localhost/path/to/file.txt", "localhost"),
        (
            "file URI with query parameters",
            "file:///path/to/file.txt?param=value",
            None,
        ),
        ("file URI with fragment", "file:///path/to/file.txt#fragment", None),
    ]

    for description, file_uri, expected_netloc in cases:
        netloc, path = file_uri_to_path(file_uri)

        if expected_netloc is None:
            assert netloc is None, description
        else:
            assert netloc == expected_netloc, description
        assert path == "/path/to/file.txt", description


def test_docx_comments() -> None:
    """Test that DOCX comments appear in the converted output."""
    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    result = markitdown_with_style_map.convert(
        os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
    )
    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)


def test_docx_equations() -> None:
    """Test that inline and block equations in a DOCX file are converted."""
    markitdown = MarkItDown()
    docx_file = os.path.join(TEST_FILES_DIR, "equations.docx")
    result = markitdown.convert(docx_file)

    # Check for inline equation m=1 (wrapped with single $) is present
    assert "$m=1$" in result.text_content, "Inline equation $m=1$ not found"

    # Find block equations wrapped with double $$ and check if they are present
    block_equations = re.findall(r"\$\$(.+?)\$\$", result.text_content)
    assert block_equations, "No block equations found in the document."


def test_input_as_strings() -> None: markitdown = MarkItDown() # Test input from a stream input_data = b"