From 716f74dcb92b622c023c78c9a50761bc63b0f598 Mon Sep 17 00:00:00 2001 From: afourney Date: Wed, 19 Mar 2025 20:46:35 -0700 Subject: [PATCH] Consider anything with a charset as plain text-convertible. (#1142) --- .../converters/_plain_text_converter.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 4a21d3a..2e10405 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -17,12 +17,16 @@ except ImportError: ACCEPTED_MIME_TYPE_PREFIXES = [ "text/", "application/json", + "application/markdown", ] -# Mimetypes to ignore (commonly confused extensions) -IGNORE_MIME_TYPE_PREFIXES = [ - "text/vnd.in3d.spot", # .spo wich is confused with xls, doc, etc. - "text/vnd.graphviz", # .dot which is confused with xls, doc, etc. +ACCEPTED_FILE_EXTENSIONS = [ + ".txt", + ".text", + ".md", + ".markdown", + ".json", + ".jsonl", ] @@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter): mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() - for prefix in IGNORE_MIME_TYPE_PREFIXES: - if mimetype.startswith(prefix): - return False + # If we have a charset, we can safely assume it's text + # With Magika in the earlier stages, this handles most cases + if stream_info.charset is not None: + return True + + # Otherwise, check the mimetype and extension + if extension in ACCEPTED_FILE_EXTENSIONS: + return True for prefix in ACCEPTED_MIME_TYPE_PREFIXES: if mimetype.startswith(prefix):