added file obj support to rss and plain text converters

This commit is contained in:
Kenny Zhang
2025-02-27 14:55:49 -05:00
parent 08330c2ac3
commit db0c8acbaf
3 changed files with 20 additions and 21 deletions

View File

@@ -175,9 +175,7 @@ class MarkItDown:
warn("Plugins converters are already enabled.", RuntimeWarning) warn("Plugins converters are already enabled.", RuntimeWarning)
def convert( def convert(
self, self, source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], **kwargs: Any
source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase],
**kwargs: Any,
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
""" """
Args: Args:
@@ -224,17 +222,19 @@ class MarkItDown:
# Convert # Convert
return self._convert(input, extensions, **kwargs) return self._convert(input, extensions, **kwargs)
def convert_file_object( def convert_file_object(
self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: #TODO: deal with kwargs
# Prepare a list of extensions to try (in order of priority) # Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension") ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else [] extensions = [ext] if ext is not None else []
# Get extension alternatives from puremagic # TODO: Currently, there are some ongoing issues with puremagic's magic_stream function (incorrect guesses, unsupported file types, etc.)
for g in self._guess_ext_magic(source=file_object): # Only use puremagic as a last resort if no extensions were provided
self._append_ext(extensions, g) if extensions == []:
for g in self._guess_ext_magic(source=file_object):
self._append_ext(extensions, g)
# Create the ConverterInput object # Create the ConverterInput object
input = ConverterInput(input_type="object", file_object=file_object) input = ConverterInput(input_type="object", file_object=file_object)
@@ -419,7 +419,7 @@ class MarkItDown:
# Use puremagic to guess # Use puremagic to guess
try: try:
guesses = [] guesses = []
# Guess extensions for filepaths # Guess extensions for filepaths
if isinstance(source, str): if isinstance(source, str):
guesses = puremagic.magic_file(source) guesses = puremagic.magic_file(source)
@@ -443,7 +443,7 @@ class MarkItDown:
pass pass
# Guess extensions for file objects. Note that puremagic's magic_stream function requires a BytesIO-like file source # Guess extensions for file objects. Note that puremagic's magic_stream function requires a BytesIO-like file source
# TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not currently work) # TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not work)
elif isinstance(source, BufferedIOBase): elif isinstance(source, BufferedIOBase):
guesses = puremagic.magic_stream(source) guesses = puremagic.magic_stream(source)

View File

@@ -1,6 +1,6 @@
import mimetypes import mimetypes
from charset_normalizer import from_path from charset_normalizer import from_path, from_bytes
from typing import Any, Union from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
@@ -18,10 +18,8 @@ class PlainTextConverter(DocumentConverter):
def convert( def convert(
self, input: ConverterInput, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if a local path is not provided # Read file object from input
if input.input_type != "filepath": file_obj = input.read_file(mode="rb")
return None
local_path = input.filepath
# Guess the content type from any file extension that might be around # Guess the content type from any file extension that might be around
content_type, _ = mimetypes.guess_type( content_type, _ = mimetypes.guess_type(
@@ -37,7 +35,8 @@ class PlainTextConverter(DocumentConverter):
): ):
return None return None
text_content = str(from_path(local_path).best()) text_content = str(from_bytes(file_obj.read()).best())
file_obj.close()
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=text_content, text_content=text_content,

View File

@@ -22,15 +22,15 @@ class RssConverter(DocumentConverter):
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".xml", ".rss", ".atom"]: if extension.lower() not in [".xml", ".rss", ".atom"]:
return None return None
# Bail if a local path is not provided # Read file object from input
if input.input_type != "filepath": file_obj = input.read_file(mode="rb")
return None
local_path = input.filepath
try: try:
doc = minidom.parse(local_path) doc = minidom.parse(file_obj)
except BaseException as _: except BaseException as _:
return None return None
file_obj.close()
result = None result = None
if doc.getElementsByTagName("rss"): if doc.getElementsByTagName("rss"):
# A RSS feed must have a root element of <rss> # A RSS feed must have a root element of <rss>