Merge pull request #19 from brc-dd/fix/18
Fix character decoding issues with text-like files
This commit is contained in:
@@ -38,6 +38,7 @@ dependencies = [
|
||||
"youtube-transcript-api",
|
||||
"SpeechRecognition",
|
||||
"pathvalidate",
|
||||
"charset-normalizer",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
||||
@@ -26,6 +26,7 @@ import pptx
|
||||
import puremagic
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from charset_normalizer import from_path
|
||||
|
||||
# Optional Transcription support
|
||||
try:
|
||||
@@ -161,9 +162,7 @@ class PlainTextConverter(DocumentConverter):
|
||||
elif "text/" not in content_type.lower():
|
||||
return None
|
||||
|
||||
text_content = ""
|
||||
with open(local_path, "rt", encoding="utf-8") as fh:
|
||||
text_content = fh.read()
|
||||
text_content = str(from_path(local_path).best())
|
||||
return DocumentConverterResult(
|
||||
title=None,
|
||||
text_content=text_content,
|
||||
|
||||
4
tests/test_files/test_mskanji.csv
Normal file
4
tests/test_files/test_mskanji.csv
Normal file
@@ -0,0 +1,4 @@
|
||||
名前,年齢,住所
|
||||
佐藤太郎,30,東京
|
||||
三木英子,25,大阪
|
||||
髙橋淳,35,名古屋
|
||||
|
@@ -87,6 +87,13 @@ SERP_TEST_EXCLUDES = [
|
||||
"data:image/svg+xml,%3Csvg%20width%3D",
|
||||
]
|
||||
|
||||
CSV_CP932_TEST_STRINGS = [
|
||||
"名前,年齢,住所",
|
||||
"佐藤太郎,30,東京",
|
||||
"三木英子,25,大阪",
|
||||
"髙橋淳,35,名古屋",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_remote,
|
||||
@@ -164,6 +171,12 @@ def test_markitdown_local() -> None:
|
||||
for test_string in SERP_TEST_STRINGS:
|
||||
assert test_string in text_content
|
||||
|
||||
## Test non-UTF-8 encoding
|
||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
|
||||
text_content = result.text_content.replace("\\", "")
|
||||
for test_string in CSV_CP932_TEST_STRINGS:
|
||||
assert test_string in text_content
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_exiftool,
|
||||
|
||||
Reference in New Issue
Block a user