18 Commits

Author SHA1 Message Date
Kenny Zhang
4e0a10ecf3 ran unit tests locally 2025-02-27 16:44:50 -05:00
Kenny Zhang
950b135da6 formatting 2025-02-27 15:08:10 -05:00
Kenny Zhang
b671345bb9 updated readme 2025-02-27 15:07:46 -05:00
Kenny Zhang
d9a92f7f06 added file obj unit tests for rss and json 2025-02-27 15:05:29 -05:00
Kenny Zhang
db0c8acbaf added file obj support to rss and plain text converters 2025-02-27 14:55:49 -05:00
Kenny Zhang
08330c2ac3 added core unit tests for file obj support 2025-02-27 11:27:05 -05:00
Kenny Zhang
4afc1fe886 added non-binary example to README 2025-02-21 13:31:37 -05:00
Kenny Zhang
b0044720da updated docs 2025-02-20 16:56:47 -05:00
Kenny Zhang
07a28d4f00 black formatting 2025-02-20 16:49:37 -05:00
Kenny Zhang
b8b3897952 modify ext guesser 2025-02-20 16:47:37 -05:00
Kenny Zhang
395ce2d301 close file object after using 2025-02-20 13:54:51 -05:00
Kenny Zhang
808401a331 added conversion path for file object in central class 2025-02-19 17:02:51 -05:00
Kenny Zhang
e75f3f6f5b local path inputs to MarkitDown class adhere to new converterinput structure 2025-02-19 15:16:45 -05:00
Kenny Zhang
8e950325d2 refactored remaining converters 2025-02-19 14:01:43 -05:00
Kenny Zhang
096fef3d5f refactored more converters to support input class 2025-02-19 13:34:28 -05:00
Kenny Zhang
52cbff061a begin refactoring converter classes 2025-02-19 11:48:00 -05:00
Kenny Zhang
0027e6d425 added wrapper class for converter file input 2025-02-18 12:44:18 -05:00
Kenny Zhang
63a7bafadd removed redundant priority setting 2025-02-18 12:18:49 -05:00
22 changed files with 361 additions and 94 deletions

View File

@@ -97,6 +97,25 @@ result = md.convert("test.pdf")
print(result.text_content)
```
MarkItDown also supports converting file objects directly:
```python
from markitdown import MarkItDown
md = MarkItDown()
# Providing the file extension when converting file objects is recommended for the most consistent results
# Binary Mode
with open("test.docx", 'rb') as file:
result = md.convert(file, file_extension=".docx")
print(result.text_content)
# Non-Binary Mode
with open("sample.ipynb", 'rt', encoding="utf-8") as file:
result = md.convert(file, file_extension=".ipynb")
print(result.text_content)
```
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python

View File

@@ -10,6 +10,7 @@ from typing import Any, List, Optional, Union
from pathlib import Path
from urllib.parse import urlparse
from warnings import warn
from io import BufferedIOBase, TextIOBase, BytesIO
# File-format detection
import puremagic
@@ -36,6 +37,7 @@ from .converters import (
OutlookMsgConverter,
ZipConverter,
DocumentIntelligenceConverter,
ConverterInput,
)
from ._exceptions import (
@@ -173,14 +175,15 @@ class MarkItDown:
warn("Plugins converters are already enabled.", RuntimeWarning)
def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any
self,
source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase],
**kwargs: Any,
) -> DocumentConverterResult: # TODO: deal with kwargs
"""
Args:
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
- source: a local path (str or pathlib.Path), a URL string, a requests.Response object, or a file object (opened in text or binary mode)
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
"""
# Local path or url
if isinstance(source, str):
if (
@@ -196,6 +199,9 @@ class MarkItDown:
return self.convert_response(source, **kwargs)
elif isinstance(source, Path):
return self.convert_local(source, **kwargs)
# File object
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
return self.convert_file_object(source, **kwargs)
def convert_local(
self, path: Union[str, Path], **kwargs: Any
@@ -210,11 +216,33 @@ class MarkItDown:
base, ext = os.path.splitext(path)
self._append_ext(extensions, ext)
for g in self._guess_ext_magic(path):
for g in self._guess_ext_magic(source=path):
self._append_ext(extensions, g)
# Create the ConverterInput object
input = ConverterInput(input_type="filepath", filepath=path)
# Convert
return self._convert(path, extensions, **kwargs)
return self._convert(input, extensions, **kwargs)
def convert_file_object(
    self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
) -> DocumentConverterResult:  # TODO: deal with kwargs
    """
    Convert an already-open file object to Markdown.

    Args:
        - file_object: a binary or text file object to convert.
        - file_extension (keyword): optional extension (e.g. ".docx") used to
          select a converter. When omitted, the extension is guessed from the
          stream contents via `_guess_ext_magic`.

    Returns:
        The result produced by `_convert` for the wrapped file object.
    """
    # Prepare a list of extensions to try (in order of priority)
    ext = kwargs.get("file_extension")
    extensions = [ext] if ext is not None else []

    # TODO: Currently, there are some ongoing issues with passing direct file objects to puremagic (incorrect guesses, unsupported file type errors, etc.)
    # Only use puremagic as a last resort if no extensions were provided
    if not extensions:
        # Sniffing reads from (and may seek within) the stream. Remember where
        # the caller left it so the converters start from that same offset.
        try:
            start = file_object.tell()
        except (AttributeError, OSError, ValueError):
            start = None  # Not seekable/tellable: nothing to restore.

        try:
            for g in self._guess_ext_magic(source=file_object):
                self._append_ext(extensions, g)
        finally:
            if start is not None:
                try:
                    file_object.seek(start)
                except (OSError, ValueError):
                    # Best effort only; the conversion attempt below will
                    # report any real problem with the stream.
                    pass

    # Create the ConverterInput object
    converter_input = ConverterInput(input_type="object", file_object=file_object)

    # Convert
    return self._convert(converter_input, extensions, **kwargs)
# TODO what should stream's type be?
def convert_stream(
@@ -238,11 +266,14 @@ class MarkItDown:
fh.close()
# Use puremagic to check for more extension options
for g in self._guess_ext_magic(temp_path):
for g in self._guess_ext_magic(source=temp_path):
self._append_ext(extensions, g)
# Create the ConverterInput object
input = ConverterInput(input_type="filepath", filepath=temp_path)
# Convert
result = self._convert(temp_path, extensions, **kwargs)
result = self._convert(input, extensions, **kwargs)
# Clean up
finally:
try:
@@ -294,11 +325,14 @@ class MarkItDown:
fh.close()
# Use puremagic to check for more extension options
for g in self._guess_ext_magic(temp_path):
for g in self._guess_ext_magic(source=temp_path):
self._append_ext(extensions, g)
# Create the ConverterInput object
input = ConverterInput(input_type="filepath", filepath=temp_path)
# Convert
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
result = self._convert(input, extensions, url=response.url, **kwargs)
# Clean up
finally:
try:
@@ -310,10 +344,9 @@ class MarkItDown:
return result
def _convert(
self, local_path: str, extensions: List[Union[str, None]], **kwargs
self, input: ConverterInput, extensions: List[Union[str, None]], **kwargs
) -> DocumentConverterResult:
error_trace = ""
# Create a copy of the page_converters list, sorted by priority.
# We do this with each call to _convert because the priority of converters may change between calls.
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
@@ -348,7 +381,7 @@ class MarkItDown:
# If we hit an error log it and keep trying
try:
res = converter.convert(local_path, **_kwargs)
res = converter.convert(input, **_kwargs)
except Exception:
error_trace = ("\n\n" + traceback.format_exc()).strip()
@@ -365,12 +398,12 @@ class MarkItDown:
# If we got this far without success, report any exceptions
if len(error_trace) > 0:
raise FileConversionException(
f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
f"Could not convert '{input.filepath}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
)
# Nothing can handle it!
raise UnsupportedFormatException(
f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
f"Could not convert '{input.filepath}' to Markdown. The formats {extensions} are not supported."
)
def _append_ext(self, extensions, ext):
@@ -383,29 +416,38 @@ class MarkItDown:
# if ext not in extensions:
extensions.append(ext)
def _guess_ext_magic(self, path):
def _guess_ext_magic(self, source):
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
# Use puremagic to guess
try:
guesses = puremagic.magic_file(path)
guesses = []
# Fix for: https://github.com/microsoft/markitdown/issues/222
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).
if len(guesses) == 0:
with open(path, "rb") as file:
while True:
char = file.read(1)
if not char: # End of file
break
if not char.isspace():
file.seek(file.tell() - 1)
break
try:
guesses = puremagic.magic_stream(file)
except puremagic.main.PureError:
pass
# Guess extensions for filepaths
if isinstance(source, str):
guesses = puremagic.magic_file(source)
# Fix for: https://github.com/microsoft/markitdown/issues/222
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).
if len(guesses) == 0:
with open(source, "rb") as file:
while True:
char = file.read(1)
if not char: # End of file
break
if not char.isspace():
file.seek(file.tell() - 1)
break
try:
guesses = puremagic.magic_stream(file)
except puremagic.main.PureError:
pass
# Guess extensions for file objects. Note that puremagic's magic_stream function requires a binary (BytesIO-like) file source
# TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not work)
elif isinstance(source, BufferedIOBase):
guesses = puremagic.magic_stream(source)
extensions = list()
for g in guesses:

View File

@@ -20,6 +20,7 @@ from ._mp3_converter import Mp3Converter
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter
from ._converter_input import ConverterInput
__all__ = [
"DocumentConverter",
@@ -42,4 +43,5 @@ __all__ = [
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",
"ConverterInput",
]

View File

@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify
from ._converter_input import ConverterInput
class BingSerpConverter(DocumentConverter):
@@ -21,7 +22,9 @@ class BingSerpConverter(DocumentConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
@@ -36,8 +39,9 @@ class BingSerpConverter(DocumentConverter):
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(file_obj.read(), "html.parser")
file_obj.close()
# Clean up some formatting
for tptt in soup.find_all(class_="tptt"):

View File

@@ -0,0 +1,30 @@
from typing import Any, Union
class ConverterInput:
    """
    Wrapper for inputs to converter functions.

    An input is either a path to a local file (``input_type="filepath"``) or an
    already-open file object (``input_type="object"``). Converters call
    `read_file` to obtain a readable stream without caring which one it is.
    """

    def __init__(
        self,
        input_type: str = "filepath",
        filepath: Union[str, None] = None,
        file_object: Union[Any, None] = None,
    ):
        """
        Args:
            input_type: either "filepath" or "object".
            filepath: path of the file to open when input_type is "filepath".
            file_object: open file object to use when input_type is "object".

        Raises:
            ValueError: if input_type is not one of the supported values.
        """
        if input_type not in ["filepath", "object"]:
            raise ValueError(f"Invalid converter input type: {input_type}")

        self.input_type = input_type
        self.filepath = filepath
        self.file_object = file_object

    def read_file(
        self,
        mode: str = "rb",
        encoding: Union[str, None] = None,
    ) -> Any:
        """
        Return a readable stream for this input in the requested mode.

        For "filepath" inputs the file is opened with ``open(filepath, mode,
        encoding)``. For "object" inputs the wrapped file object is returned,
        adapted when its text/binary flavour does not match ``mode``. The
        caller is responsible for closing the returned stream.

        Raises:
            ValueError: if the path or file object required by input_type was
                never provided.
        """
        if self.input_type == "object":
            if self.file_object is None:
                raise ValueError("Converter input has no file object to read from")
            return self._adapt_file_object(mode, encoding)

        if self.filepath is None:
            raise ValueError("Converter input has no file path to read from")
        return open(self.filepath, mode=mode, encoding=encoding)

    def _adapt_file_object(self, mode: str, encoding: Union[str, None]) -> Any:
        """Return the wrapped file object, converted to match the text/binary ``mode``."""
        import io  # Local import keeps this block self-contained.

        stream = self.file_object
        wants_binary = "b" in mode

        if wants_binary and isinstance(stream, io.TextIOBase):
            # A converter asked for bytes but the caller opened the file in
            # text mode: re-encode the remaining text, preferring the encoding
            # the stream itself was decoded with.
            source_encoding = encoding or getattr(stream, "encoding", None) or "utf-8"
            return io.BytesIO(stream.read().encode(source_encoding))

        if not wants_binary and isinstance(stream, io.BufferedIOBase):
            # A converter asked for text but the caller supplied a binary
            # stream: decode lazily on top of it.
            return io.TextIOWrapper(stream, encoding=encoding)

        # Flavour already matches (or the object is of an unknown kind):
        # hand it back untouched.
        return stream

View File

@@ -11,6 +11,7 @@ from azure.ai.documentintelligence.models import (
from azure.identity import DefaultAzureCredential
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
@@ -37,10 +38,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
api_version=self.api_version,
credential=DefaultAzureCredential(),
)
self._priority = priority
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if extension is not supported by Document Intelligence
extension = kwargs.get("file_extension", "")
@@ -60,9 +60,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
if extension.lower() not in docintel_extensions:
return None
# Get the bytestring for the local path
with open(local_path, "rb") as f:
file_bytes = f.read()
# Get the bytestring from the converter input
file_obj = input.read_file(mode="rb")
file_bytes = file_obj.read()
file_obj.close()
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:

View File

@@ -8,6 +8,7 @@ from ._base import (
from ._base import DocumentConverter
from ._html_converter import HtmlConverter
from ._converter_input import ConverterInput
class DocxConverter(HtmlConverter):
@@ -20,18 +21,20 @@ class DocxConverter(HtmlConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx":
return None
result = None
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None)
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value
result = self._convert(html_content)
style_map = kwargs.get("style_map", None)
file_obj = input.read_file(mode="rb")
result = mammoth.convert_to_html(file_obj, style_map=style_map)
file_obj.close()
html_content = result.value
result = self._convert(html_content)
return result

View File

@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify
from ._converter_input import ConverterInput
class HtmlConverter(DocumentConverter):
@@ -14,7 +15,7 @@ class HtmlConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not html
extension = kwargs.get("file_extension", "")
@@ -22,8 +23,9 @@ class HtmlConverter(DocumentConverter):
return None
result = None
with open(local_path, "rt", encoding="utf-8") as fh:
result = self._convert(fh.read())
file_obj = input.read_file(mode="rt", encoding="utf-8")
result = self._convert(file_obj.read())
file_obj.close()
return result

View File

@@ -1,6 +1,7 @@
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter
from ._converter_input import ConverterInput
class ImageConverter(MediaConverter):
@@ -13,7 +14,9 @@ class ImageConverter(MediaConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not an image
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
@@ -21,8 +24,9 @@ class ImageConverter(MediaConverter):
md_content = ""
# Add metadata
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
# Add metadata if a local path is provided
if input.input_type == "filepath":
metadata = self._get_metadata(input.filepath, kwargs.get("exiftool_path"))
if metadata:
for f in [
@@ -47,7 +51,7 @@ class ImageConverter(MediaConverter):
md_content += (
"\n# Description:\n"
+ self._get_llm_description(
local_path,
input,
extension,
llm_client,
llm_model,
@@ -61,17 +65,20 @@ class ImageConverter(MediaConverter):
text_content=md_content,
)
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
def _get_llm_description(
self, input: ConverterInput, extension, client, model, prompt=None
):
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."
data_uri = ""
with open(local_path, "rb") as image_file:
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
if content_type is None:
content_type = "image/jpeg"
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
data_uri = f"data:{content_type};base64,{image_base64}"
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
if content_type is None:
content_type = "image/jpeg"
image_file = input.read_file(mode="rb")
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
image_file.close()
data_uri = f"data:{content_type};base64,{image_base64}"
messages = [
{

View File

@@ -7,6 +7,7 @@ from ._base import (
)
from .._exceptions import FileConversionException
from ._converter_input import ConverterInput
class IpynbConverter(DocumentConverter):
@@ -18,7 +19,7 @@ class IpynbConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not ipynb
extension = kwargs.get("file_extension", "")
@@ -27,9 +28,10 @@ class IpynbConverter(DocumentConverter):
# Parse and convert the notebook
result = None
with open(local_path, "rt", encoding="utf-8") as fh:
notebook_content = json.load(fh)
result = self._convert(notebook_content)
file_obj = input.read_file(mode="rt", encoding="utf-8")
notebook_content = json.load(file_obj)
file_obj.close()
result = self._convert(notebook_content)
return result

View File

@@ -1,8 +1,10 @@
import tempfile
import os
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings
from ._converter_input import ConverterInput
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
@@ -33,12 +35,19 @@ class Mp3Converter(WavConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a MP3
extension = kwargs.get("file_extension", "")
if extension.lower() != ".mp3":
return None
# Bail if a local path was not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
md_content = ""
# Add metadata

View File

@@ -1,6 +1,7 @@
import olefile
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class OutlookMsgConverter(DocumentConverter):
@@ -17,7 +18,7 @@ class OutlookMsgConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not a MSG file
extension = kwargs.get("file_extension", "")
@@ -25,7 +26,9 @@ class OutlookMsgConverter(DocumentConverter):
return None
try:
msg = olefile.OleFileIO(local_path)
file_obj = input.read_file(mode="rb")
msg = olefile.OleFileIO(file_obj)
# Extract email metadata
md_content = "# Email Message\n\n"
@@ -49,6 +52,7 @@ class OutlookMsgConverter(DocumentConverter):
md_content += body
msg.close()
file_obj.close()
return DocumentConverterResult(
title=headers.get("Subject"), text_content=md_content.strip()
@@ -56,7 +60,7 @@ class OutlookMsgConverter(DocumentConverter):
except Exception as e:
raise FileConversionException(
f"Could not convert MSG file '{local_path}': {str(e)}"
f"Could not convert MSG file '{input.filepath}': {str(e)}"
)
def _get_stream_data(

View File

@@ -1,7 +1,9 @@
import pdfminer
import pdfminer.high_level
from typing import Union
from io import StringIO
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class PdfConverter(DocumentConverter):
@@ -14,13 +16,20 @@ class PdfConverter(DocumentConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf":
return None
output = StringIO()
file_obj = input.read_file(mode="rb")
pdfminer.high_level.extract_text_to_fp(file_obj, output)
file_obj.close()
return DocumentConverterResult(
title=None,
text_content=pdfminer.high_level.extract_text(local_path),
text_content=output.getvalue(),
)

View File

@@ -1,9 +1,10 @@
import mimetypes
from charset_normalizer import from_path
from charset_normalizer import from_path, from_bytes
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class PlainTextConverter(DocumentConverter):
@@ -15,8 +16,11 @@ class PlainTextConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Read file object from input
file_obj = input.read_file(mode="rb")
# Guess the content type from any file extension that might be around
content_type, _ = mimetypes.guess_type(
"__placeholder" + kwargs.get("file_extension", "")
@@ -31,7 +35,8 @@ class PlainTextConverter(DocumentConverter):
):
return None
text_content = str(from_path(local_path).best())
text_content = str(from_bytes(file_obj.read()).best())
file_obj.close()
return DocumentConverterResult(
title=None,
text_content=text_content,

View File

@@ -7,6 +7,7 @@ from typing import Union
from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter
from ._converter_input import ConverterInput
class PptxConverter(HtmlConverter):
@@ -48,7 +49,9 @@ class PptxConverter(HtmlConverter):
)
return response.choices[0].message.content
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a PPTX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pptx":
@@ -56,7 +59,10 @@ class PptxConverter(HtmlConverter):
md_content = ""
presentation = pptx.Presentation(local_path)
file_obj = input.read_file(mode="rb")
presentation = pptx.Presentation(file_obj)
file_obj.close()
slide_num = 0
for slide in presentation.slides:
slide_num += 1

View File

@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class RssConverter(DocumentConverter):
@@ -15,16 +16,21 @@ class RssConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not RSS type
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".xml", ".rss", ".atom"]:
return None
# Read file object from input
file_obj = input.read_file(mode="rb")
try:
doc = minidom.parse(local_path)
doc = minidom.parse(file_obj)
except BaseException as _:
return None
file_obj.close()
result = None
if doc.getElementsByTagName("rss"):
# A RSS feed must have a root element of <rss>

View File

@@ -1,6 +1,7 @@
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter
from ._converter_input import ConverterInput
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
@@ -22,12 +23,19 @@ class WavConverter(MediaConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV
extension = kwargs.get("file_extension", "")
if extension.lower() != ".wav":
return None
# Bail if a local path was not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
md_content = ""
# Add metadata

View File

@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify
from ._converter_input import ConverterInput
class WikipediaConverter(DocumentConverter):
@@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not Wikipedia
extension = kwargs.get("file_extension", "")
@@ -28,8 +29,9 @@ class WikipediaConverter(DocumentConverter):
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(file_obj.read(), "html.parser")
file_obj.close()
# Remove javascript and style blocks
for script in soup(["script", "style"]):

View File

@@ -4,6 +4,7 @@ import pandas as pd
from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
from ._converter_input import ConverterInput
class XlsxConverter(HtmlConverter):
@@ -16,13 +17,18 @@ class XlsxConverter(HtmlConverter):
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".xlsx":
return None
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
file_obj = input.read_file(mode="rb")
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
file_obj.close()
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
@@ -40,13 +46,18 @@ class XlsConverter(HtmlConverter):
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a XLS
extension = kwargs.get("file_extension", "")
if extension.lower() != ".xls":
return None
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
file_obj = input.read_file(mode="rb")
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
file_obj.close()
md_content = ""
for s in sheets:
md_content += f"## {s}\n"

View File

@@ -1,10 +1,12 @@
import re
import json
from typing import Any, Union, Dict, List
from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
# Optional YouTube transcription support
@@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not YouTube
extension = kwargs.get("file_extension", "")
@@ -37,8 +39,9 @@ class YouTubeConverter(DocumentConverter):
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(file_obj.read(), "html.parser")
file_obj.close()
# Read the meta tags
assert soup.title is not None and soup.title.string is not None

View File

@@ -4,6 +4,7 @@ import shutil
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class ZipConverter(DocumentConverter):
@@ -51,13 +52,18 @@ class ZipConverter(DocumentConverter):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not a ZIP
extension = kwargs.get("file_extension", "")
if extension.lower() != ".zip":
return None
# Bail if a local path is not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
# Get parent converters list if available
parent_converters = kwargs.get("_parent_converters", [])
if not parent_converters:
@@ -111,7 +117,11 @@ class ZipConverter(DocumentConverter):
if isinstance(converter, ZipConverter):
continue
result = converter.convert(file_path, **file_kwargs)
# Create a ConverterInput for the parent converter and attempt conversion
input = ConverterInput(
input_type="filepath", filepath=file_path
)
result = converter.convert(input, **file_kwargs)
if result is not None:
md_content += f"\n## File: {relative_path}\n\n"
md_content += result.text_content + "\n\n"

View File

@@ -189,7 +189,7 @@ def test_markitdown_remote() -> None:
# assert test_string in result.text_content
def test_markitdown_local() -> None:
def test_markitdown_local_paths() -> None:
markitdown = MarkItDown()
# Test XLSX processing
@@ -272,6 +272,87 @@ def test_markitdown_local() -> None:
assert "# Test" in result.text_content
def test_markitdown_local_objects() -> None:
    """Convert each supported local test file through an open file object.

    Every file is opened by the test itself (binary or text mode, depending on
    the format) and passed to `MarkItDown.convert` together with an explicit
    `file_extension`.
    """
    markitdown = MarkItDown()

    # Test XLSX processing
    with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xlsx")
        validate_strings(result, XLSX_TEST_STRINGS)

    # Test XLS processing
    with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xls")
        # Strip the Markdown escape backslashes once, then check every string.
        text_content = result.text_content.replace("\\", "")
        for test_string in XLS_TEST_STRINGS:
            assert test_string in text_content

    # Test DOCX processing
    with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".docx")
        validate_strings(result, DOCX_TEST_STRINGS)

    # Test DOCX processing, with comments
    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
        result = markitdown.convert(
            f,
            file_extension=".docx",
            style_map="comment-reference => ",
        )
        validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
        result = markitdown_with_style_map.convert(f, file_extension=".docx")
        validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test PPTX processing
    with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".pptx")
        validate_strings(result, PPTX_TEST_STRINGS)

    # Test HTML processing
    with open(
        os.path.join(TEST_FILES_DIR, "test_blog.html"), "rt", encoding="utf-8"
    ) as f:
        result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL)
        validate_strings(result, BLOG_TEST_STRINGS)

    # Test Wikipedia processing
    with open(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rt", encoding="utf-8"
    ) as f:
        result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL)
        validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)

    # Test Bing processing
    with open(
        os.path.join(TEST_FILES_DIR, "test_serp.html"), "rt", encoding="utf-8"
    ) as f:
        result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL)
        validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)

    # Test RSS processing
    with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xml")
        text_content = result.text_content.replace("\\", "")
        for test_string in RSS_TEST_STRINGS:
            assert test_string in text_content

    # Test MSG (Outlook email) processing
    with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f:
        result = markitdown.convert(f, file_extension=".msg")
        validate_strings(result, MSG_TEST_STRINGS)

    # Test JSON processing
    with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f:
        result = markitdown.convert(f, file_extension=".json")
        validate_strings(result, JSON_TEST_STRINGS)
@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
@@ -328,7 +409,8 @@ def test_markitdown_llm() -> None:
if __name__ == "__main__":
"""Runs this file's tests from the command line."""
test_markitdown_remote()
test_markitdown_local()
test_markitdown_local_paths()
test_markitdown_local_objects()
test_markitdown_exiftool()
# test_markitdown_llm()
print("All tests passed!")