refactor(tests): add helper function for tests (#87)
* refactor(tests): simplify string validation in tests

  Introduce a helper function `validate_strings` to streamline the validation of expected and excluded strings in test cases. Replace repetitive string assertions in the `test_markitdown_local` function with calls to this new helper, improving code readability and maintainability.

* run pre-commit

---------

Co-authored-by: lumin <71011125+l-melon@users.noreply.github.com>
Co-authored-by: gagb <gagb@users.noreply.github.com>
This commit is contained in:
@@ -131,6 +131,17 @@ LLM_TEST_STRINGS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
def validate_strings(result, expected_strings, exclude_strings=None):
    """Validate presence or absence of specific strings.

    Args:
        result: Object whose ``text_content`` attribute holds the text to
            check; no other attribute is read.
        expected_strings: Iterable of strings that must each occur in the
            text.
        exclude_strings: Optional iterable of strings that must not occur in
            the text. ``None`` or an empty iterable skips this check.

    Raises:
        AssertionError: If an expected string is missing or an excluded
            string is present. The message names the offending string.
    """
    # Every backslash is removed before matching (presumably so escaped
    # characters in the converted output compare equal to the plain test
    # strings -- confirm against the converters).
    text_content = result.text_content.replace("\\", "")

    for string in expected_strings:
        assert string in text_content, f"expected string not found: {string!r}"

    # `or ()` keeps the original behavior of skipping the check when no
    # exclusions (None or empty) are supplied.
    for string in exclude_strings or ():
        assert string not in text_content, f"excluded string found: {string!r}"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_remote,
|
skip_remote,
|
||||||
reason="do not run tests that query external urls",
|
reason="do not run tests that query external urls",
|
||||||
@@ -163,73 +174,53 @@ def test_markitdown_local() -> None:
|
|||||||
|
|
||||||
# Test XLSX processing
|
# Test XLSX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
|
||||||
for test_string in XLSX_TEST_STRINGS:
|
validate_strings(result, XLSX_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test DOCX processing
|
# Test DOCX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
|
||||||
for test_string in DOCX_TEST_STRINGS:
|
validate_strings(result, DOCX_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test DOCX processing, with comments
|
# Test DOCX processing, with comments
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
|
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
|
||||||
style_map="comment-reference => ",
|
style_map="comment-reference => ",
|
||||||
)
|
)
|
||||||
for test_string in DOCX_COMMENT_TEST_STRINGS:
|
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test DOCX processing, with comments and setting style_map on init
|
# Test DOCX processing, with comments and setting style_map on init
|
||||||
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
|
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
|
||||||
result = markitdown_with_style_map.convert(
|
result = markitdown_with_style_map.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
|
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
|
||||||
)
|
)
|
||||||
for test_string in DOCX_COMMENT_TEST_STRINGS:
|
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test PPTX processing
|
# Test PPTX processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
|
||||||
for test_string in PPTX_TEST_STRINGS:
|
validate_strings(result, PPTX_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test HTML processing
|
# Test HTML processing
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
|
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
|
||||||
)
|
)
|
||||||
for test_string in BLOG_TEST_STRINGS:
|
validate_strings(result, BLOG_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test ZIP file processing
|
# Test ZIP file processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
|
||||||
for test_string in DOCX_TEST_STRINGS:
|
validate_strings(result, XLSX_TEST_STRINGS)
|
||||||
text_content = result.text_content.replace("\\", "")
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test Wikipedia processing
|
# Test Wikipedia processing
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
|
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
|
||||||
)
|
)
|
||||||
text_content = result.text_content.replace("\\", "")
|
text_content = result.text_content.replace("\\", "")
|
||||||
for test_string in WIKIPEDIA_TEST_EXCLUDES:
|
validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
|
||||||
assert test_string not in text_content
|
|
||||||
for test_string in WIKIPEDIA_TEST_STRINGS:
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test Bing processing
|
# Test Bing processing
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
|
os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
|
||||||
)
|
)
|
||||||
text_content = result.text_content.replace("\\", "")
|
text_content = result.text_content.replace("\\", "")
|
||||||
for test_string in SERP_TEST_EXCLUDES:
|
validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
|
||||||
assert test_string not in text_content
|
|
||||||
for test_string in SERP_TEST_STRINGS:
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
# Test RSS processing
|
# Test RSS processing
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
|
||||||
@@ -239,9 +230,7 @@ def test_markitdown_local() -> None:
|
|||||||
|
|
||||||
## Test non-UTF-8 encoding
|
## Test non-UTF-8 encoding
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
|
||||||
text_content = result.text_content.replace("\\", "")
|
validate_strings(result, CSV_CP932_TEST_STRINGS)
|
||||||
for test_string in CSV_CP932_TEST_STRINGS:
|
|
||||||
assert test_string in text_content
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
|
|||||||
Reference in New Issue
Block a user