Consider anything with a charset as plain text-convertible. (#1142)
This commit is contained in:
@@ -17,12 +17,16 @@ except ImportError:
|
|||||||
ACCEPTED_MIME_TYPE_PREFIXES = [
|
ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||||
"text/",
|
"text/",
|
||||||
"application/json",
|
"application/json",
|
||||||
|
"application/markdown",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Mimetypes to ignore (commonly confused extensions)
|
ACCEPTED_FILE_EXTENSIONS = [
|
||||||
IGNORE_MIME_TYPE_PREFIXES = [
|
".txt",
|
||||||
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
|
".text",
|
||||||
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
|
".md",
|
||||||
|
".markdown",
|
||||||
|
".json",
|
||||||
|
".jsonl",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter):
|
|||||||
mimetype = (stream_info.mimetype or "").lower()
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
extension = (stream_info.extension or "").lower()
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
for prefix in IGNORE_MIME_TYPE_PREFIXES:
|
# If we have a charset, we can safely assume it's text
|
||||||
if mimetype.startswith(prefix):
|
# With Magika in the earlier stages, this handles most cases
|
||||||
return False
|
if stream_info.charset is not None:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Otherwise, check the mimetype and extension
|
||||||
|
if extension in ACCEPTED_FILE_EXTENSIONS:
|
||||||
|
return True
|
||||||
|
|
||||||
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
||||||
if mimetype.startswith(prefix):
|
if mimetype.startswith(prefix):
|
||||||
|
|||||||
Reference in New Issue
Block a user