From 716f74dcb92b622c023c78c9a50761bc63b0f598 Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Wed, 19 Mar 2025 20:46:35 -0700
Subject: [PATCH] Consider anything with a charset as plain text-convertible.
 (#1142)

---
 .../converters/_plain_text_converter.py       | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
index 4a21d3a..2e10405 100644
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -17,12 +17,16 @@ except ImportError:
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "text/",
     "application/json",
+    "application/markdown",
 ]
 
-# Mimetypes to ignore (commonly confused extensions)
-IGNORE_MIME_TYPE_PREFIXES = [
-    "text/vnd.in3d.spot",  # .spo wich is confused with xls, doc, etc.
-    "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
+ACCEPTED_FILE_EXTENSIONS = [
+    ".txt",
+    ".text",
+    ".md",
+    ".markdown",
+    ".json",
+    ".jsonl",
 ]
 
 
@@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter):
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
 
-        for prefix in IGNORE_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return False
+        # If we have a charset, we can safely assume it's text
+        # With Magika in the earlier stages, this handles most cases
+        if stream_info.charset is not None:
+            return True
+
+        # Otherwise, check the mimetype and extension
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
 
         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
             if mimetype.startswith(prefix):