Fix character decoding issues with text-like files

This commit is contained in:
Divyansh Singh
2024-12-15 10:37:15 +05:30
parent 81e3f24acd
commit 52b723724c
4 changed files with 20 additions and 3 deletions

View File

@@ -87,6 +87,13 @@ SERP_TEST_EXCLUDES = [
"data:image/svg+xml,%3Csvg%20width%3D",
]
# Strings that must appear in the converted text of the CP932-encoded
# (non-UTF-8) CSV fixture; one entry per expected CSV row.
CSV_CP932_TEST_STRINGS = [
    "名前,年齢,住所",
    "佐藤太郎,30,東京",
    "三木英子,25,大阪",
    "髙橋淳,35,名古屋",
]
@pytest.mark.skipif(
skip_remote,
@@ -164,6 +171,12 @@ def test_markitdown_local() -> None:
for test_string in SERP_TEST_STRINGS:
assert test_string in text_content
## Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
text_content = result.text_content.replace("\\", "")
for test_string in CSV_CP932_TEST_STRINGS:
assert test_string in text_content
@pytest.mark.skipif(
skip_exiftool,