Consider anything with a charset as plain text-convertible. (#1142)

2025-03-19 20:46:35 -07:00
parent a93e0567e6
commit 716f74dcb9
1 changed files with 16 additions and 7 deletions
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -17,12 +17,16 @@ except ImportError:
 ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/",
    "application/json",
    "application/markdown",
 ]
-# Mimetypes to ignore (commonly confused extensions)
+ACCEPTED_FILE_EXTENSIONS = [
-IGNORE_MIME_TYPE_PREFIXES = [
+    ".txt",
-    "text/vnd.in3d.spot",  # .spo which is confused with xls, doc, etc.
+    ".text",
-    "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
+    ".md",
    ".markdown",
    ".json",
    ".jsonl",
 ]
@@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter):
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
-        for prefix in IGNORE_MIME_TYPE_PREFIXES:
+        # If we have a charset, we can safely assume it's text
-            if mimetype.startswith(prefix):
+        # With Magika in the earlier stages, this handles most cases
-                return False
+        if stream_info.charset is not None:
            return True
        # Otherwise, check the mimetype and extension
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):