From db0c8acbaf9d158bdeb4d0ac13b60b3f88496e57 Mon Sep 17 00:00:00 2001 From: Kenny Zhang Date: Thu, 27 Feb 2025 14:55:49 -0500 Subject: [PATCH] added file obj support to rss and plain text converters --- .../markitdown/src/markitdown/_markitdown.py | 20 +++++++++---------- .../converters/_plain_text_converter.py | 11 +++++----- .../markitdown/converters/_rss_converter.py | 10 +++++----- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 188ab19..9072951 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -175,9 +175,7 @@ class MarkItDown: warn("Plugins converters are already enabled.", RuntimeWarning) def convert( - self, - source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], - **kwargs: Any, + self, source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], **kwargs: Any ) -> DocumentConverterResult: # TODO: deal with kwargs """ Args: @@ -224,17 +222,19 @@ class MarkItDown: # Convert return self._convert(input, extensions, **kwargs) - + def convert_file_object( self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any - ) -> DocumentConverterResult: # TODO: deal with kwargs + ) -> DocumentConverterResult: #TODO: deal with kwargs # Prepare a list of extensions to try (in order of priority) ext = kwargs.get("file_extension") extensions = [ext] if ext is not None else [] - # Get extension alternatives from puremagic - for g in self._guess_ext_magic(source=file_object): - self._append_ext(extensions, g) + # TODO: Currently, there are some ongoing issues with puremagic's magic_stream function (incorrect guesses, unsupported file types, etc.) 
+ # Only use puremagic as a last resort if no extensions were provided + if extensions == []: + for g in self._guess_ext_magic(source=file_object): + self._append_ext(extensions, g) # Create the ConverterInput object input = ConverterInput(input_type="object", file_object=file_object) @@ -419,7 +419,7 @@ class MarkItDown: # Use puremagic to guess try: guesses = [] - + # Guess extensions for filepaths if isinstance(source, str): guesses = puremagic.magic_file(source) @@ -443,7 +443,7 @@ class MarkItDown: pass # Guess extensions for file objects. Note that the puremagic's magic_stream function requires a BytesIO-like file source - # TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not currently work) + # TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not work) elif isinstance(source, BufferedIOBase): guesses = puremagic.magic_stream(source) diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index b23db82..22d851b 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -1,6 +1,6 @@ import mimetypes -from charset_normalizer import from_path +from charset_normalizer import from_path, from_bytes from typing import Any, Union from ._base import DocumentConverter, DocumentConverterResult @@ -18,10 +18,8 @@ class PlainTextConverter(DocumentConverter): def convert( self, input: ConverterInput, **kwargs: Any ) -> Union[None, DocumentConverterResult]: - # Bail if a local path is not provided - if input.input_type != "filepath": - return None - local_path = input.filepath + # Read file object from input + file_obj = input.read_file(mode="rb") # Guess the content type from any file extension that might be around content_type, _ = mimetypes.guess_type( 
@@ -37,7 +35,8 @@ class PlainTextConverter(DocumentConverter): ): return None - text_content = str(from_path(local_path).best()) + text_content = str(from_bytes(file_obj.read()).best()) + file_obj.close() return DocumentConverterResult( title=None, text_content=text_content, diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index 89f41c0..84944c6 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -22,15 +22,15 @@ class RssConverter(DocumentConverter): extension = kwargs.get("file_extension", "") if extension.lower() not in [".xml", ".rss", ".atom"]: return None - # Bail if a local path is not provided - if input.input_type != "filepath": - return None - local_path = input.filepath + # Read file object from input + file_obj = input.read_file(mode="rb") try: - doc = minidom.parse(local_path) + doc = minidom.parse(file_obj) except BaseException as _: return None + file_obj.close() + result = None if doc.getElementsByTagName("rss"): # A RSS feed must have a root element of