From 08330c2ac398a6b3e9f17598da4e7d97fdd1a4cf Mon Sep 17 00:00:00 2001
From: Kenny Zhang
Date: Thu, 27 Feb 2025 11:27:05 -0500
Subject: [PATCH] added core unit tests for file obj support

---
Review notes (free text between the "---" line and the diffstat; not part of
the commit message):

- Purpose: rename test_markitdown_local to test_markitdown_local_paths and add
  test_markitdown_local_objects, which runs the same conversions through open
  file objects plus an explicit file_extension instead of through paths.
- This copy was rebuilt from a version whose line breaks and indentation had
  been collapsed. Context-line indentation, and whether each validate_strings
  call sits inside or after its "with" block, are reconstructed guesses;
  confirm against the original commit before applying.
- Changes relative to the recorded commit, so the "From" and "index" hashes
  above/below no longer describe this exact content:
    * dropped the unused text_content assignments in the Wikipedia and Bing
      sections (validate_strings is given result, not text_content);
    * hoisted the loop-invariant text_content computation out of the XLS loop;
    * the __main__ runner now also calls test_markitdown_local_objects().
  Hunk headers and the diffstat were adjusted to match.

 packages/markitdown/tests/test_markitdown.py | 72 +++++++++++++++++++-
 1 file changed, 70 insertions(+), 2 deletions(-)

diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py
index efd45ac..adb9495 100644
--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@@ -189,7 +189,7 @@ def test_markitdown_remote() -> None:
     # assert test_string in result.text_content
 
 
-def test_markitdown_local() -> None:
+def test_markitdown_local_paths() -> None:
     markitdown = MarkItDown()
 
     # Test XLSX processing
@@ -272,6 +272,73 @@ def test_markitdown_local() -> None:
     assert "# Test" in result.text_content
 
 
+def test_markitdown_local_objects() -> None:
+    markitdown = MarkItDown()
+
+    # Test XLSX processing
+    with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".xlsx")
+        validate_strings(result, XLSX_TEST_STRINGS)
+
+    # Test XLS processing
+    with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".xls")
+        text_content = result.text_content.replace("\\", "")
+        for test_string in XLS_TEST_STRINGS:
+            assert test_string in text_content
+
+    # Test DOCX processing
+    with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".docx")
+        validate_strings(result, DOCX_TEST_STRINGS)
+
+    # Test DOCX processing, with comments
+    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
+        result = markitdown.convert(
+            f,
+            file_extension=".docx",
+            style_map="comment-reference => ",
+        )
+        validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
+
+    # Test DOCX processing, with comments and setting style_map on init
+    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
+    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
+        result = markitdown_with_style_map.convert(f, file_extension=".docx")
+        validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
+
+    # Test PPTX processing
+    with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".pptx")
+        validate_strings(result, PPTX_TEST_STRINGS)
+
+    # Test HTML processing
+    with open(
+        os.path.join(TEST_FILES_DIR, "test_blog.html"), "rt", encoding="utf-8"
+    ) as f:
+        result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL)
+        validate_strings(result, BLOG_TEST_STRINGS)
+
+    # Test Wikipedia processing
+    with open(
+        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rt", encoding="utf-8"
+    ) as f:
+        result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL)
+        validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
+
+    # Test Bing processing
+    with open(
+        os.path.join(TEST_FILES_DIR, "test_serp.html"), "rt", encoding="utf-8"
+    ) as f:
+        result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL)
+        validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
+
+    # Test MSG (Outlook email) processing
+    with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f:
+        result = markitdown.convert(f, file_extension=".msg")
+        validate_strings(result, MSG_TEST_STRINGS)
+
+
 @pytest.mark.skipif(
     skip_exiftool,
     reason="do not run if exiftool is not installed",
@@ -328,7 +395,8 @@ def test_markitdown_llm() -> None:
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
     test_markitdown_remote()
-    test_markitdown_local()
+    test_markitdown_local_paths()
+    test_markitdown_local_objects()
     test_markitdown_exiftool()
     # test_markitdown_llm()
     print("All tests passed!")