Consider anything with a charset as plain text-convertible. (#1142)

This commit is contained in:
afourney
2025-03-19 20:46:35 -07:00
committed by GitHub
parent a93e0567e6
commit 716f74dcb9

View File

@@ -17,12 +17,16 @@ except ImportError:
ACCEPTED_MIME_TYPE_PREFIXES = [ ACCEPTED_MIME_TYPE_PREFIXES = [
"text/", "text/",
"application/json", "application/json",
"application/markdown",
] ]
# Mimetypes to ignore (commonly confused extensions) ACCEPTED_FILE_EXTENSIONS = [
IGNORE_MIME_TYPE_PREFIXES = [ ".txt",
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc. ".text",
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc. ".md",
".markdown",
".json",
".jsonl",
] ]
@@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower() mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower() extension = (stream_info.extension or "").lower()
for prefix in IGNORE_MIME_TYPE_PREFIXES: # If we have a charset, we can safely assume it's text
if mimetype.startswith(prefix): # With Magika in the earlier stages, this handles most cases
return False if stream_info.charset is not None:
return True
# Otherwise, check the mimetype and extension
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES: for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix): if mimetype.startswith(prefix):