Fix character decoding issues with text-like files

This commit is contained in:
Divyansh Singh
2024-12-15 10:37:15 +05:30
parent 81e3f24acd
commit 52b723724c
4 changed files with 20 additions and 3 deletions

View File

@@ -26,6 +26,7 @@ import pptx
import puremagic
import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_path
# Optional Transcription support
try:
@@ -161,9 +162,7 @@ class PlainTextConverter(DocumentConverter):
elif "text/" not in content_type.lower():
return None
text_content = ""
with open(local_path, "rt", encoding="utf-8") as fh:
text_content = fh.read()
text_content = str(from_path(local_path).best())
return DocumentConverterResult(
title=None,
text_content=text_content,