From 52d73080c7ae3d7662d04f5cc520a09de1908878 Mon Sep 17 00:00:00 2001
From: lumin <71011125+l-lumin@users.noreply.github.com>
Date: Sat, 21 Dec 2024 04:42:32 +0900
Subject: [PATCH] refactor(tests): add helper function for tests (#87)

* refactor(tests): simplify string validation in tests

Introduce a helper function `validate_strings` to streamline the
validation of expected and excluded strings in test cases. Replace
repetitive string assertions in the `test_markitdown_local` function
with calls to this new helper, improving code readability and
maintainability.

* run pre-commit

---------

Co-authored-by: lumin <71011125+l-melon@users.noreply.github.com>
Co-authored-by: gagb
---
 tests/test_markitdown.py | 53 ++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 32 deletions(-)

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 316e670..4a981bd 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -131,6 +131,17 @@ LLM_TEST_STRINGS = [
 ]
 
 
+# --- Helper Functions ---
+def validate_strings(result, expected_strings, exclude_strings=None):
+    """Validate presence or absence of specific strings."""
+    text_content = result.text_content.replace("\\", "")
+    for string in expected_strings:
+        assert string in text_content
+    if exclude_strings:
+        for string in exclude_strings:
+            assert string not in text_content
+
+
 @pytest.mark.skipif(
     skip_remote,
     reason="do not run tests that query external urls",
@@ -163,73 +174,53 @@ def test_markitdown_local() -> None:
 
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
-    for test_string in XLSX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, XLSX_TEST_STRINGS)
 
     # Test DOCX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
-    for test_string in DOCX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, DOCX_TEST_STRINGS)
 
     # Test DOCX processing, with comments
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
         style_map="comment-reference => ",
     )
-    for test_string in DOCX_COMMENT_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
 
     # Test DOCX processing, with comments and setting style_map on init
     markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
     result = markitdown_with_style_map.convert(
         os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
     )
-    for test_string in DOCX_COMMENT_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
 
     # Test PPTX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
-    for test_string in PPTX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, PPTX_TEST_STRINGS)
 
     # Test HTML processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
     )
-    for test_string in BLOG_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, BLOG_TEST_STRINGS)
 
     # Test ZIP file processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
-    for test_string in DOCX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, XLSX_TEST_STRINGS)
 
     # Test Wikipedia processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
     )
     text_content = result.text_content.replace("\\", "")
-    for test_string in WIKIPEDIA_TEST_EXCLUDES:
-        assert test_string not in text_content
-    for test_string in WIKIPEDIA_TEST_STRINGS:
-        assert test_string in text_content
+    validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
 
     # Test Bing processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
     )
     text_content = result.text_content.replace("\\", "")
-    for test_string in SERP_TEST_EXCLUDES:
-        assert test_string not in text_content
-    for test_string in SERP_TEST_STRINGS:
-        assert test_string in text_content
+    validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
 
     # Test RSS processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
@@ -239,9 +230,7 @@ def test_markitdown_local() -> None:
 
     ## Test non-UTF-8 encoding
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
-    text_content = result.text_content.replace("\\", "")
-    for test_string in CSV_CP932_TEST_STRINGS:
-        assert test_string in text_content
+    validate_strings(result, CSV_CP932_TEST_STRINGS)
 
 
 @pytest.mark.skipif(