From db0c8acbaf9d158bdeb4d0ac13b60b3f88496e57 Mon Sep 17 00:00:00 2001
From: Kenny Zhang <kzhang678@gmail.com>
Date: Thu, 27 Feb 2025 14:55:49 -0500
Subject: [PATCH] Add file object support to RSS and plain text converters

---
 .../markitdown/src/markitdown/_markitdown.py  | 20 +++++++++----------
 .../converters/_plain_text_converter.py       | 11 +++++-----
 .../markitdown/converters/_rss_converter.py   | 10 +++++-----
 3 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index 188ab19..9072951 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -175,9 +175,7 @@ class MarkItDown:
             warn("Plugins converters are already enabled.", RuntimeWarning)
 
     def convert(
-        self,
-        source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase],
-        **kwargs: Any,
+        self, source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], **kwargs: Any
     ) -> DocumentConverterResult:  # TODO: deal with kwargs
         """
         Args:
@@ -224,17 +222,19 @@ class MarkItDown:
 
         # Convert
         return self._convert(input, extensions, **kwargs)
-
+    
     def convert_file_object(
         self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
+    ) -> DocumentConverterResult: #TODO: deal with kwargs
         # Prepare a list of extensions to try (in order of priority)
         ext = kwargs.get("file_extension")
         extensions = [ext] if ext is not None else []
 
-        # Get extension alternatives from puremagic
-        for g in self._guess_ext_magic(source=file_object):
-            self._append_ext(extensions, g)
+        # TODO: Currently, there are some ongoing issues with puremagic's magic_stream function (incorrect guesses, unsupported file types, etc.)
+        # Only use puremagic as a last resort if no extensions were provided
+        if extensions == []:
+            for g in self._guess_ext_magic(source=file_object):
+                self._append_ext(extensions, g)
 
         # Create the ConverterInput object
         input = ConverterInput(input_type="object", file_object=file_object)
@@ -419,7 +419,7 @@ class MarkItDown:
         # Use puremagic to guess
         try:
             guesses = []
-
+            
             # Guess extensions for filepaths
             if isinstance(source, str):
                 guesses = puremagic.magic_file(source)
@@ -443,7 +443,7 @@ class MarkItDown:
                             pass
 
             # Guess extensions for file objects. Note that the puremagic's magic_stream function requires a BytesIO-like file source
-            # TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not currently work)
+            # TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not work)
             elif isinstance(source, BufferedIOBase):
                 guesses = puremagic.magic_stream(source)
 
diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
index b23db82..22d851b 100644
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -1,6 +1,6 @@
 import mimetypes
 
-from charset_normalizer import from_path
+from charset_normalizer import from_path, from_bytes
 from typing import Any, Union
 
 from ._base import DocumentConverter, DocumentConverterResult
@@ -18,10 +18,8 @@ class PlainTextConverter(DocumentConverter):
     def convert(
         self, input: ConverterInput, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
-        # Bail if a local path is not provided
-        if input.input_type != "filepath":
-            return None
-        local_path = input.filepath
+        # Read file object from input
+        file_obj = input.read_file(mode="rb")
 
         # Guess the content type from any file extension that might be around
         content_type, _ = mimetypes.guess_type(
@@ -37,7 +35,8 @@ class PlainTextConverter(DocumentConverter):
         ):
             return None
 
-        text_content = str(from_path(local_path).best())
+        text_content = str(from_bytes(file_obj.read()).best())
+        file_obj.close()
         return DocumentConverterResult(
             title=None,
             text_content=text_content,
diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py
index 89f41c0..84944c6 100644
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@@ -22,15 +22,15 @@ class RssConverter(DocumentConverter):
         extension = kwargs.get("file_extension", "")
         if extension.lower() not in [".xml", ".rss", ".atom"]:
             return None
-        # Bail if a local path is not provided
-        if input.input_type != "filepath":
-            return None
-        local_path = input.filepath
+        # Read file object from input
+        file_obj = input.read_file(mode="rb")
 
         try:
-            doc = minidom.parse(local_path)
+            doc = minidom.parse(file_obj)
         except BaseException as _:
             return None
+        file_obj.close()
+
         result = None
         if doc.getElementsByTagName("rss"):
             # A RSS feed must have a root element of <rss>