Added file object support to the RSS and plain text converters
This commit is contained in:
@@ -175,9 +175,7 @@ class MarkItDown:
|
|||||||
warn("Plugins converters are already enabled.", RuntimeWarning)
|
warn("Plugins converters are already enabled.", RuntimeWarning)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self, source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase], **kwargs: Any
|
||||||
source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase],
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
@@ -224,17 +222,19 @@ class MarkItDown:
|
|||||||
|
|
||||||
# Convert
|
# Convert
|
||||||
return self._convert(input, extensions, **kwargs)
|
return self._convert(input, extensions, **kwargs)
|
||||||
|
|
||||||
def convert_file_object(
|
def convert_file_object(
|
||||||
self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
|
self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
|
||||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
) -> DocumentConverterResult: #TODO: deal with kwargs
|
||||||
# Prepare a list of extensions to try (in order of priority)
|
# Prepare a list of extensions to try (in order of priority)
|
||||||
ext = kwargs.get("file_extension")
|
ext = kwargs.get("file_extension")
|
||||||
extensions = [ext] if ext is not None else []
|
extensions = [ext] if ext is not None else []
|
||||||
|
|
||||||
# Get extension alternatives from puremagic
|
# TODO: Currently, there are some ongoing issues with puremagic's magic_stream function (incorrect guesses, unsupported file types, etc.)
|
||||||
for g in self._guess_ext_magic(source=file_object):
|
# Only use puremagic as a last resort if no extensions were provided
|
||||||
self._append_ext(extensions, g)
|
if extensions == []:
|
||||||
|
for g in self._guess_ext_magic(source=file_object):
|
||||||
|
self._append_ext(extensions, g)
|
||||||
|
|
||||||
# Create the ConverterInput object
|
# Create the ConverterInput object
|
||||||
input = ConverterInput(input_type="object", file_object=file_object)
|
input = ConverterInput(input_type="object", file_object=file_object)
|
||||||
@@ -419,7 +419,7 @@ class MarkItDown:
|
|||||||
# Use puremagic to guess
|
# Use puremagic to guess
|
||||||
try:
|
try:
|
||||||
guesses = []
|
guesses = []
|
||||||
|
|
||||||
# Guess extensions for filepaths
|
# Guess extensions for filepaths
|
||||||
if isinstance(source, str):
|
if isinstance(source, str):
|
||||||
guesses = puremagic.magic_file(source)
|
guesses = puremagic.magic_file(source)
|
||||||
@@ -443,7 +443,7 @@ class MarkItDown:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
# Guess extensions for file objects. Note that puremagic's magic_stream function requires a BytesIO-like file source
|
# Guess extensions for file objects. Note that puremagic's magic_stream function requires a BytesIO-like file source
|
||||||
# TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not currently work)
|
# TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not work)
|
||||||
elif isinstance(source, BufferedIOBase):
|
elif isinstance(source, BufferedIOBase):
|
||||||
guesses = puremagic.magic_stream(source)
|
guesses = puremagic.magic_stream(source)
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
from charset_normalizer import from_path
|
from charset_normalizer import from_path, from_bytes
|
||||||
from typing import Any, Union
|
from typing import Any, Union
|
||||||
|
|
||||||
from ._base import DocumentConverter, DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
@@ -18,10 +18,8 @@ class PlainTextConverter(DocumentConverter):
|
|||||||
def convert(
|
def convert(
|
||||||
self, input: ConverterInput, **kwargs: Any
|
self, input: ConverterInput, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if a local path is not provided
|
# Read file object from input
|
||||||
if input.input_type != "filepath":
|
file_obj = input.read_file(mode="rb")
|
||||||
return None
|
|
||||||
local_path = input.filepath
|
|
||||||
|
|
||||||
# Guess the content type from any file extension that might be around
|
# Guess the content type from any file extension that might be around
|
||||||
content_type, _ = mimetypes.guess_type(
|
content_type, _ = mimetypes.guess_type(
|
||||||
@@ -37,7 +35,8 @@ class PlainTextConverter(DocumentConverter):
|
|||||||
):
|
):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
text_content = str(from_path(local_path).best())
|
text_content = str(from_bytes(file_obj.read()).best())
|
||||||
|
file_obj.close()
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
text_content=text_content,
|
text_content=text_content,
|
||||||
|
|||||||
@@ -22,15 +22,15 @@ class RssConverter(DocumentConverter):
|
|||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
if extension.lower() not in [".xml", ".rss", ".atom"]:
|
if extension.lower() not in [".xml", ".rss", ".atom"]:
|
||||||
return None
|
return None
|
||||||
# Bail if a local path is not provided
|
# Read file object from input
|
||||||
if input.input_type != "filepath":
|
file_obj = input.read_file(mode="rb")
|
||||||
return None
|
|
||||||
local_path = input.filepath
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
doc = minidom.parse(local_path)
|
doc = minidom.parse(file_obj)
|
||||||
except BaseException as _:
|
except BaseException as _:
|
||||||
return None
|
return None
|
||||||
|
file_obj.close()
|
||||||
|
|
||||||
result = None
|
result = None
|
||||||
if doc.getElementsByTagName("rss"):
|
if doc.getElementsByTagName("rss"):
|
||||||
# A RSS feed must have a root element of <rss>
|
# A RSS feed must have a root element of <rss>
|
||||||
|
|||||||
Reference in New Issue
Block a user