18 Commits

Author SHA1 Message Date
Kenny Zhang
4e0a10ecf3 ran unit tests locally 2025-02-27 16:44:50 -05:00
Kenny Zhang
950b135da6 formatting 2025-02-27 15:08:10 -05:00
Kenny Zhang
b671345bb9 updated readme 2025-02-27 15:07:46 -05:00
Kenny Zhang
d9a92f7f06 added file obj unit tests for rss and json 2025-02-27 15:05:29 -05:00
Kenny Zhang
db0c8acbaf added file obj support to rss and plain text converters 2025-02-27 14:55:49 -05:00
Kenny Zhang
08330c2ac3 added core unit tests for file obj support 2025-02-27 11:27:05 -05:00
Kenny Zhang
4afc1fe886 added non-binary example to README 2025-02-21 13:31:37 -05:00
Kenny Zhang
b0044720da updated docs 2025-02-20 16:56:47 -05:00
Kenny Zhang
07a28d4f00 black formatting 2025-02-20 16:49:37 -05:00
Kenny Zhang
b8b3897952 modify ext guesser 2025-02-20 16:47:37 -05:00
Kenny Zhang
395ce2d301 close file object after using 2025-02-20 13:54:51 -05:00
Kenny Zhang
808401a331 added conversion path for file object in central class 2025-02-19 17:02:51 -05:00
Kenny Zhang
e75f3f6f5b local path inputs to MarkitDown class adhere to new converterinput structure 2025-02-19 15:16:45 -05:00
Kenny Zhang
8e950325d2 refactored remaining converters 2025-02-19 14:01:43 -05:00
Kenny Zhang
096fef3d5f refactored more converters to support input class 2025-02-19 13:34:28 -05:00
Kenny Zhang
52cbff061a begin refactoring converter classes 2025-02-19 11:48:00 -05:00
Kenny Zhang
0027e6d425 added wrapper class for converter file input 2025-02-18 12:44:18 -05:00
Kenny Zhang
63a7bafadd removed redundant priority setting 2025-02-18 12:18:49 -05:00
22 changed files with 361 additions and 94 deletions

View File

@@ -97,6 +97,25 @@ result = md.convert("test.pdf")
print(result.text_content) print(result.text_content)
``` ```
MarkItDown also supports converting file objects directly:
```python
from markitdown import MarkItDown
md = MarkItDown()
# Providing the file extension when converting via file objects is recommended for the most consistent results
# Binary Mode
with open("test.docx", 'rb') as file:
result = md.convert(file, file_extension=".docx")
print(result.text_content)
# Non-Binary Mode
with open("sample.ipynb", 'rt', encoding="utf-8") as file:
result = md.convert(file, file_extension=".ipynb")
print(result.text_content)
```
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python ```python

View File

@@ -10,6 +10,7 @@ from typing import Any, List, Optional, Union
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse from urllib.parse import urlparse
from warnings import warn from warnings import warn
from io import BufferedIOBase, TextIOBase, BytesIO
# File-format detection # File-format detection
import puremagic import puremagic
@@ -36,6 +37,7 @@ from .converters import (
OutlookMsgConverter, OutlookMsgConverter,
ZipConverter, ZipConverter,
DocumentIntelligenceConverter, DocumentIntelligenceConverter,
ConverterInput,
) )
from ._exceptions import ( from ._exceptions import (
@@ -173,14 +175,15 @@ class MarkItDown:
warn("Plugins converters are already enabled.", RuntimeWarning) warn("Plugins converters are already enabled.", RuntimeWarning)
def convert( def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any self,
source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase],
**kwargs: Any,
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
""" """
Args: Args:
- source: can be a string representing a path either as string pathlib path object or url, or a requests.response object - source: can be a string representing a path either as string pathlib path object or url, a requests.response object, or a file object (TextIO or BinaryIO)
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
""" """
# Local path or url # Local path or url
if isinstance(source, str): if isinstance(source, str):
if ( if (
@@ -196,6 +199,9 @@ class MarkItDown:
return self.convert_response(source, **kwargs) return self.convert_response(source, **kwargs)
elif isinstance(source, Path): elif isinstance(source, Path):
return self.convert_local(source, **kwargs) return self.convert_local(source, **kwargs)
# File object
elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
return self.convert_file_object(source, **kwargs)
def convert_local( def convert_local(
self, path: Union[str, Path], **kwargs: Any self, path: Union[str, Path], **kwargs: Any
@@ -210,11 +216,33 @@ class MarkItDown:
base, ext = os.path.splitext(path) base, ext = os.path.splitext(path)
self._append_ext(extensions, ext) self._append_ext(extensions, ext)
for g in self._guess_ext_magic(path): for g in self._guess_ext_magic(source=path):
self._append_ext(extensions, g) self._append_ext(extensions, g)
# Create the ConverterInput object
input = ConverterInput(input_type="filepath", filepath=path)
# Convert # Convert
return self._convert(path, extensions, **kwargs) return self._convert(input, extensions, **kwargs)
def convert_file_object(
    self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
) -> DocumentConverterResult:  # TODO: deal with kwargs
    """
    Convert an already-open file object to Markdown.

    Args:
        - file_object: the binary or text file object to convert
        - kwargs: forwarded unchanged to `_convert`; the optional
          `file_extension` entry is used as the only extension to try
    """
    # Build the list of extensions to try (in order of priority)
    hinted_ext = kwargs.get("file_extension")
    extensions = []
    if hinted_ext is not None:
        extensions.append(hinted_ext)
    else:
        # TODO: Currently, there are some ongoing issues with passing direct file objects to puremagic
        # (incorrect guesses, unsupported file type errors, etc.), so puremagic is only consulted
        # as a last resort when the caller gave no extension.
        for guess in self._guess_ext_magic(source=file_object):
            self._append_ext(extensions, guess)

    # Wrap the file object so converters receive the common input type
    converter_input = ConverterInput(input_type="object", file_object=file_object)

    # Convert
    return self._convert(converter_input, extensions, **kwargs)
# TODO what should stream's type be? # TODO what should stream's type be?
def convert_stream( def convert_stream(
@@ -238,11 +266,14 @@ class MarkItDown:
fh.close() fh.close()
# Use puremagic to check for more extension options # Use puremagic to check for more extension options
for g in self._guess_ext_magic(temp_path): for g in self._guess_ext_magic(source=temp_path):
self._append_ext(extensions, g) self._append_ext(extensions, g)
# Create the ConverterInput object
input = ConverterInput(input_type="filepath", filepath=temp_path)
# Convert # Convert
result = self._convert(temp_path, extensions, **kwargs) result = self._convert(input, extensions, **kwargs)
# Clean up # Clean up
finally: finally:
try: try:
@@ -294,11 +325,14 @@ class MarkItDown:
fh.close() fh.close()
# Use puremagic to check for more extension options # Use puremagic to check for more extension options
for g in self._guess_ext_magic(temp_path): for g in self._guess_ext_magic(source=temp_path):
self._append_ext(extensions, g) self._append_ext(extensions, g)
# Create the ConverterInput object
input = ConverterInput(input_type="filepath", filepath=temp_path)
# Convert # Convert
result = self._convert(temp_path, extensions, url=response.url, **kwargs) result = self._convert(input, extensions, url=response.url, **kwargs)
# Clean up # Clean up
finally: finally:
try: try:
@@ -310,10 +344,9 @@ class MarkItDown:
return result return result
def _convert( def _convert(
self, local_path: str, extensions: List[Union[str, None]], **kwargs self, input: ConverterInput, extensions: List[Union[str, None]], **kwargs
) -> DocumentConverterResult: ) -> DocumentConverterResult:
error_trace = "" error_trace = ""
# Create a copy of the page_converters list, sorted by priority. # Create a copy of the page_converters list, sorted by priority.
# We do this with each call to _convert because the priority of converters may change between calls. # We do this with each call to _convert because the priority of converters may change between calls.
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
@@ -348,7 +381,7 @@ class MarkItDown:
# If we hit an error log it and keep trying # If we hit an error log it and keep trying
try: try:
res = converter.convert(local_path, **_kwargs) res = converter.convert(input, **_kwargs)
except Exception: except Exception:
error_trace = ("\n\n" + traceback.format_exc()).strip() error_trace = ("\n\n" + traceback.format_exc()).strip()
@@ -365,12 +398,12 @@ class MarkItDown:
# If we got this far without success, report any exceptions # If we got this far without success, report any exceptions
if len(error_trace) > 0: if len(error_trace) > 0:
raise FileConversionException( raise FileConversionException(
f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}" f"Could not convert '{input.filepath}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
) )
# Nothing can handle it! # Nothing can handle it!
raise UnsupportedFormatException( raise UnsupportedFormatException(
f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported." f"Could not convert '{input.filepath}' to Markdown. The formats {extensions} are not supported."
) )
def _append_ext(self, extensions, ext): def _append_ext(self, extensions, ext):
@@ -383,29 +416,38 @@ class MarkItDown:
# if ext not in extensions: # if ext not in extensions:
extensions.append(ext) extensions.append(ext)
def _guess_ext_magic(self, path): def _guess_ext_magic(self, source):
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
# Use puremagic to guess # Use puremagic to guess
try: try:
guesses = puremagic.magic_file(path) guesses = []
# Fix for: https://github.com/microsoft/markitdown/issues/222 # Guess extensions for filepaths
# If there are no guesses, then try again after trimming leading ASCII whitespaces. if isinstance(source, str):
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f' guesses = puremagic.magic_file(source)
# (space, tab, newline, carriage return, vertical tab, form feed).
if len(guesses) == 0: # Fix for: https://github.com/microsoft/markitdown/issues/222
with open(path, "rb") as file: # If there are no guesses, then try again after trimming leading ASCII whitespaces.
while True: # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
char = file.read(1) # (space, tab, newline, carriage return, vertical tab, form feed).
if not char: # End of file if len(guesses) == 0:
break with open(source, "rb") as file:
if not char.isspace(): while True:
file.seek(file.tell() - 1) char = file.read(1)
break if not char: # End of file
try: break
guesses = puremagic.magic_stream(file) if not char.isspace():
except puremagic.main.PureError: file.seek(file.tell() - 1)
pass break
try:
guesses = puremagic.magic_stream(file)
except puremagic.main.PureError:
pass
# Guess extensions for file objects. Note that the puremagic's magic_stream function requires a BytesIO-like file source
# TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not work)
elif isinstance(source, BufferedIOBase):
guesses = puremagic.magic_stream(source)
extensions = list() extensions = list()
for g in guesses: for g in guesses:

View File

@@ -20,6 +20,7 @@ from ._mp3_converter import Mp3Converter
from ._outlook_msg_converter import OutlookMsgConverter from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter from ._doc_intel_converter import DocumentIntelligenceConverter
from ._converter_input import ConverterInput
__all__ = [ __all__ = [
"DocumentConverter", "DocumentConverter",
@@ -42,4 +43,5 @@ __all__ = [
"OutlookMsgConverter", "OutlookMsgConverter",
"ZipConverter", "ZipConverter",
"DocumentIntelligenceConverter", "DocumentIntelligenceConverter",
"ConverterInput",
] ]

View File

@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
from ._converter_input import ConverterInput
class BingSerpConverter(DocumentConverter): class BingSerpConverter(DocumentConverter):
@@ -21,7 +22,9 @@ class BingSerpConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP # Bail if not a Bing SERP
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]: if extension.lower() not in [".html", ".htm"]:
@@ -36,8 +39,9 @@ class BingSerpConverter(DocumentConverter):
# Parse the file # Parse the file
soup = None soup = None
with open(local_path, "rt", encoding="utf-8") as fh: file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(fh.read(), "html.parser") soup = BeautifulSoup(file_obj.read(), "html.parser")
file_obj.close()
# Clean up some formatting # Clean up some formatting
for tptt in soup.find_all(class_="tptt"): for tptt in soup.find_all(class_="tptt"):

View File

@@ -0,0 +1,30 @@
from typing import Any, Union
class ConverterInput:
    """
    Wrapper for inputs to converter functions.

    An input is either a path on disk (`input_type="filepath"`) or an
    already-open file object (`input_type="object"`).
    """

    def __init__(
        self,
        input_type: str = "filepath",
        filepath: Union[str, None] = None,
        file_object: Union[Any, None] = None,
    ):
        """
        Args:
            - input_type: either "filepath" or "object"
            - filepath: path opened by `read_file` when input_type is "filepath"
            - file_object: object returned by `read_file` when input_type is "object"

        Raises:
            - ValueError: if input_type is neither "filepath" nor "object"
        """
        if input_type not in ["filepath", "object"]:
            raise ValueError(f"Invalid converter input type: {input_type}")

        self.input_type = input_type
        self.filepath = filepath
        self.file_object = file_object

    def read_file(
        self,
        mode: str = "rb",
        encoding: Union[str, None] = None,
    ) -> Any:
        """
        Return a readable handle for this input.

        For "object" inputs the wrapped file object itself is returned, so
        `mode` and `encoding` are ignored and closing the result closes the
        wrapped object. For "filepath" inputs a new handle is opened with the
        given `mode` and `encoding`; closing it is left to the caller.

        Raises:
            - ValueError: if the value matching the input type was never supplied
        """
        if self.input_type == "object":
            # Fail here with a clear message instead of handing None to a converter.
            if self.file_object is None:
                raise ValueError(
                    "Converter input of type 'object' has no file object to read"
                )
            return self.file_object

        # Without this check open(None, ...) would raise an obscure TypeError.
        if self.filepath is None:
            raise ValueError(
                "Converter input of type 'filepath' has no file path to open"
            )
        return open(self.filepath, mode=mode, encoding=encoding)

View File

@@ -11,6 +11,7 @@ from azure.ai.documentintelligence.models import (
from azure.identity import DefaultAzureCredential from azure.identity import DefaultAzureCredential
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
@@ -37,10 +38,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
api_version=self.api_version, api_version=self.api_version,
credential=DefaultAzureCredential(), credential=DefaultAzureCredential(),
) )
self._priority = priority
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if extension is not supported by Document Intelligence # Bail if extension is not supported by Document Intelligence
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -60,9 +60,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
if extension.lower() not in docintel_extensions: if extension.lower() not in docintel_extensions:
return None return None
# Get the bytestring for the local path # Get the bytestring from the converter input
with open(local_path, "rb") as f: file_obj = input.read_file(mode="rb")
file_bytes = f.read() file_bytes = file_obj.read()
file_obj.close()
# Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx) # Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]: if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:

View File

@@ -8,6 +8,7 @@ from ._base import (
from ._base import DocumentConverter from ._base import DocumentConverter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ._converter_input import ConverterInput
class DocxConverter(HtmlConverter): class DocxConverter(HtmlConverter):
@@ -20,18 +21,20 @@ class DocxConverter(HtmlConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX # Bail if not a DOCX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx": if extension.lower() != ".docx":
return None return None
result = None result = None
with open(local_path, "rb") as docx_file: style_map = kwargs.get("style_map", None)
style_map = kwargs.get("style_map", None) file_obj = input.read_file(mode="rb")
result = mammoth.convert_to_html(file_obj, style_map=style_map)
result = mammoth.convert_to_html(docx_file, style_map=style_map) file_obj.close()
html_content = result.value html_content = result.value
result = self._convert(html_content) result = self._convert(html_content)
return result return result

View File

@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
from ._converter_input import ConverterInput
class HtmlConverter(DocumentConverter): class HtmlConverter(DocumentConverter):
@@ -14,7 +15,7 @@ class HtmlConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not html # Bail if not html
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -22,8 +23,9 @@ class HtmlConverter(DocumentConverter):
return None return None
result = None result = None
with open(local_path, "rt", encoding="utf-8") as fh: file_obj = input.read_file(mode="rt", encoding="utf-8")
result = self._convert(fh.read()) result = self._convert(file_obj.read())
file_obj.close()
return result return result

View File

@@ -1,6 +1,7 @@
from typing import Union from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter from ._media_converter import MediaConverter
from ._converter_input import ConverterInput
class ImageConverter(MediaConverter): class ImageConverter(MediaConverter):
@@ -13,7 +14,9 @@ class ImageConverter(MediaConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not an image # Bail if not an image
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".jpg", ".jpeg", ".png"]: if extension.lower() not in [".jpg", ".jpeg", ".png"]:
@@ -21,8 +24,9 @@ class ImageConverter(MediaConverter):
md_content = "" md_content = ""
# Add metadata # Add metadata if a local path is provided
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path")) if input.input_type == "filepath":
metadata = self._get_metadata(input.filepath, kwargs.get("exiftool_path"))
if metadata: if metadata:
for f in [ for f in [
@@ -47,7 +51,7 @@ class ImageConverter(MediaConverter):
md_content += ( md_content += (
"\n# Description:\n" "\n# Description:\n"
+ self._get_llm_description( + self._get_llm_description(
local_path, input,
extension, extension,
llm_client, llm_client,
llm_model, llm_model,
@@ -61,17 +65,20 @@ class ImageConverter(MediaConverter):
text_content=md_content, text_content=md_content,
) )
def _get_llm_description(self, local_path, extension, client, model, prompt=None): def _get_llm_description(
self, input: ConverterInput, extension, client, model, prompt=None
):
if prompt is None or prompt.strip() == "": if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image." prompt = "Write a detailed caption for this image."
data_uri = "" data_uri = ""
with open(local_path, "rb") as image_file: content_type, encoding = mimetypes.guess_type("_dummy" + extension)
content_type, encoding = mimetypes.guess_type("_dummy" + extension) if content_type is None:
if content_type is None: content_type = "image/jpeg"
content_type = "image/jpeg" image_file = input.read_file(mode="rb")
image_base64 = base64.b64encode(image_file.read()).decode("utf-8") image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
data_uri = f"data:{content_type};base64,{image_base64}" image_file.close()
data_uri = f"data:{content_type};base64,{image_base64}"
messages = [ messages = [
{ {

View File

@@ -7,6 +7,7 @@ from ._base import (
) )
from .._exceptions import FileConversionException from .._exceptions import FileConversionException
from ._converter_input import ConverterInput
class IpynbConverter(DocumentConverter): class IpynbConverter(DocumentConverter):
@@ -18,7 +19,7 @@ class IpynbConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not ipynb # Bail if not ipynb
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -27,9 +28,10 @@ class IpynbConverter(DocumentConverter):
# Parse and convert the notebook # Parse and convert the notebook
result = None result = None
with open(local_path, "rt", encoding="utf-8") as fh: file_obj = input.read_file(mode="rt", encoding="utf-8")
notebook_content = json.load(fh) notebook_content = json.load(file_obj)
result = self._convert(notebook_content) file_obj.close()
result = self._convert(notebook_content)
return result return result

View File

@@ -1,8 +1,10 @@
import tempfile import tempfile
import os
from typing import Union from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings from warnings import resetwarnings, catch_warnings
from ._converter_input import ConverterInput
# Optional Transcription support # Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False IS_AUDIO_TRANSCRIPTION_CAPABLE = False
@@ -33,12 +35,19 @@ class Mp3Converter(WavConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a MP3 # Bail if not a MP3
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".mp3": if extension.lower() != ".mp3":
return None return None
# Bail if a local path was not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
md_content = "" md_content = ""
# Add metadata # Add metadata

View File

@@ -1,6 +1,7 @@
import olefile import olefile
from typing import Any, Union from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class OutlookMsgConverter(DocumentConverter): class OutlookMsgConverter(DocumentConverter):
@@ -17,7 +18,7 @@ class OutlookMsgConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not a MSG file # Bail if not a MSG file
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -25,7 +26,9 @@ class OutlookMsgConverter(DocumentConverter):
return None return None
try: try:
msg = olefile.OleFileIO(local_path) file_obj = input.read_file(mode="rb")
msg = olefile.OleFileIO(file_obj)
# Extract email metadata # Extract email metadata
md_content = "# Email Message\n\n" md_content = "# Email Message\n\n"
@@ -49,6 +52,7 @@ class OutlookMsgConverter(DocumentConverter):
md_content += body md_content += body
msg.close() msg.close()
file_obj.close()
return DocumentConverterResult( return DocumentConverterResult(
title=headers.get("Subject"), text_content=md_content.strip() title=headers.get("Subject"), text_content=md_content.strip()
@@ -56,7 +60,7 @@ class OutlookMsgConverter(DocumentConverter):
except Exception as e: except Exception as e:
raise FileConversionException( raise FileConversionException(
f"Could not convert MSG file '{local_path}': {str(e)}" f"Could not convert MSG file '{input.filepath}': {str(e)}"
) )
def _get_stream_data( def _get_stream_data(

View File

@@ -1,7 +1,9 @@
import pdfminer import pdfminer
import pdfminer.high_level import pdfminer.high_level
from typing import Union from typing import Union
from io import StringIO
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class PdfConverter(DocumentConverter): class PdfConverter(DocumentConverter):
@@ -14,13 +16,20 @@ class PdfConverter(DocumentConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF # Bail if not a PDF
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf": if extension.lower() != ".pdf":
return None return None
output = StringIO()
file_obj = input.read_file(mode="rb")
pdfminer.high_level.extract_text_to_fp(file_obj, output)
file_obj.close()
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=pdfminer.high_level.extract_text(local_path), text_content=output.getvalue(),
) )

View File

@@ -1,9 +1,10 @@
import mimetypes import mimetypes
from charset_normalizer import from_path from charset_normalizer import from_path, from_bytes
from typing import Any, Union from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class PlainTextConverter(DocumentConverter): class PlainTextConverter(DocumentConverter):
@@ -15,8 +16,11 @@ class PlainTextConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Read file object from input
file_obj = input.read_file(mode="rb")
# Guess the content type from any file extension that might be around # Guess the content type from any file extension that might be around
content_type, _ = mimetypes.guess_type( content_type, _ = mimetypes.guess_type(
"__placeholder" + kwargs.get("file_extension", "") "__placeholder" + kwargs.get("file_extension", "")
@@ -31,7 +35,8 @@ class PlainTextConverter(DocumentConverter):
): ):
return None return None
text_content = str(from_path(local_path).best()) text_content = str(from_bytes(file_obj.read()).best())
file_obj.close()
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=text_content, text_content=text_content,

View File

@@ -7,6 +7,7 @@ from typing import Union
from ._base import DocumentConverterResult, DocumentConverter from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ._converter_input import ConverterInput
class PptxConverter(HtmlConverter): class PptxConverter(HtmlConverter):
@@ -48,7 +49,9 @@ class PptxConverter(HtmlConverter):
) )
return response.choices[0].message.content return response.choices[0].message.content
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a PPTX # Bail if not a PPTX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".pptx": if extension.lower() != ".pptx":
@@ -56,7 +59,10 @@ class PptxConverter(HtmlConverter):
md_content = "" md_content = ""
presentation = pptx.Presentation(local_path) file_obj = input.read_file(mode="rb")
presentation = pptx.Presentation(file_obj)
file_obj.close()
slide_num = 0 slide_num = 0
for slide in presentation.slides: for slide in presentation.slides:
slide_num += 1 slide_num += 1

View File

@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class RssConverter(DocumentConverter): class RssConverter(DocumentConverter):
@@ -15,16 +16,21 @@ class RssConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not RSS type # Bail if not RSS type
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() not in [".xml", ".rss", ".atom"]: if extension.lower() not in [".xml", ".rss", ".atom"]:
return None return None
# Read file object from input
file_obj = input.read_file(mode="rb")
try: try:
doc = minidom.parse(local_path) doc = minidom.parse(file_obj)
except BaseException as _: except BaseException as _:
return None return None
file_obj.close()
result = None result = None
if doc.getElementsByTagName("rss"): if doc.getElementsByTagName("rss"):
# A RSS feed must have a root element of <rss> # A RSS feed must have a root element of <rss>

View File

@@ -1,6 +1,7 @@
from typing import Union from typing import Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter from ._media_converter import MediaConverter
from ._converter_input import ConverterInput
# Optional Transcription support # Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False IS_AUDIO_TRANSCRIPTION_CAPABLE = False
@@ -22,12 +23,19 @@ class WavConverter(MediaConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV # Bail if not a WAV
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".wav": if extension.lower() != ".wav":
return None return None
# Bail if a local path was not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
md_content = "" md_content = ""
# Add metadata # Add metadata

View File

@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify from ._markdownify import _CustomMarkdownify
from ._converter_input import ConverterInput
class WikipediaConverter(DocumentConverter): class WikipediaConverter(DocumentConverter):
@@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not Wikipedia # Bail if not Wikipedia
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -28,8 +29,9 @@ class WikipediaConverter(DocumentConverter):
# Parse the file # Parse the file
soup = None soup = None
with open(local_path, "rt", encoding="utf-8") as fh: file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(fh.read(), "html.parser") soup = BeautifulSoup(file_obj.read(), "html.parser")
file_obj.close()
# Remove javascript and style blocks # Remove javascript and style blocks
for script in soup(["script", "style"]): for script in soup(["script", "style"]):

View File

@@ -4,6 +4,7 @@ import pandas as pd
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ._converter_input import ConverterInput
class XlsxConverter(HtmlConverter): class XlsxConverter(HtmlConverter):
@@ -16,13 +17,18 @@ class XlsxConverter(HtmlConverter):
): ):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX # Bail if not a XLSX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".xlsx": if extension.lower() != ".xlsx":
return None return None
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl") file_obj = input.read_file(mode="rb")
sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
file_obj.close()
md_content = "" md_content = ""
for s in sheets: for s in sheets:
md_content += f"## {s}\n" md_content += f"## {s}\n"
@@ -40,13 +46,18 @@ class XlsConverter(HtmlConverter):
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table. Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
""" """
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(
self, input: ConverterInput, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a XLS # Bail if not a XLS
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".xls": if extension.lower() != ".xls":
return None return None
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd") file_obj = input.read_file(mode="rb")
sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
file_obj.close()
md_content = "" md_content = ""
for s in sheets: for s in sheets:
md_content += f"## {s}\n" md_content += f"## {s}\n"

View File

@@ -1,10 +1,12 @@
import re import re
import json
from typing import Any, Union, Dict, List from typing import Any, Union, Dict, List
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
# Optional YouTube transcription support # Optional YouTube transcription support
@@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not YouTube # Bail if not YouTube
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -37,8 +39,9 @@ class YouTubeConverter(DocumentConverter):
# Parse the file # Parse the file
soup = None soup = None
with open(local_path, "rt", encoding="utf-8") as fh: file_obj = input.read_file(mode="rt", encoding="utf-8")
soup = BeautifulSoup(fh.read(), "html.parser") soup = BeautifulSoup(file_obj.read(), "html.parser")
file_obj.close()
# Read the meta tags # Read the meta tags
assert soup.title is not None and soup.title.string is not None assert soup.title is not None and soup.title.string is not None

View File

@@ -4,6 +4,7 @@ import shutil
from typing import Any, Union from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._converter_input import ConverterInput
class ZipConverter(DocumentConverter): class ZipConverter(DocumentConverter):
@@ -51,13 +52,18 @@ class ZipConverter(DocumentConverter):
super().__init__(priority=priority) super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, input: ConverterInput, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:
# Bail if not a ZIP # Bail if not a ZIP
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
if extension.lower() != ".zip": if extension.lower() != ".zip":
return None return None
# Bail if a local path is not provided
if input.input_type != "filepath":
return None
local_path = input.filepath
# Get parent converters list if available # Get parent converters list if available
parent_converters = kwargs.get("_parent_converters", []) parent_converters = kwargs.get("_parent_converters", [])
if not parent_converters: if not parent_converters:
@@ -111,7 +117,11 @@ class ZipConverter(DocumentConverter):
if isinstance(converter, ZipConverter): if isinstance(converter, ZipConverter):
continue continue
result = converter.convert(file_path, **file_kwargs) # Create a ConverterInput for the parent converter and attempt conversion
input = ConverterInput(
input_type="filepath", filepath=file_path
)
result = converter.convert(input, **file_kwargs)
if result is not None: if result is not None:
md_content += f"\n## File: {relative_path}\n\n" md_content += f"\n## File: {relative_path}\n\n"
md_content += result.text_content + "\n\n" md_content += result.text_content + "\n\n"

View File

@@ -189,7 +189,7 @@ def test_markitdown_remote() -> None:
# assert test_string in result.text_content # assert test_string in result.text_content
def test_markitdown_local() -> None: def test_markitdown_local_paths() -> None:
markitdown = MarkItDown() markitdown = MarkItDown()
# Test XLSX processing # Test XLSX processing
@@ -272,6 +272,87 @@ def test_markitdown_local() -> None:
assert "# Test" in result.text_content assert "# Test" in result.text_content
def test_markitdown_local_objects() -> None:
    """Exercise ``MarkItDown.convert`` with open file objects instead of paths.

    Every fixture is opened by the test itself and the resulting handle is
    passed to ``convert`` together with an explicit ``file_extension``.
    Office documents, RSS, MSG and JSON fixtures are opened in binary mode;
    the HTML fixtures are opened in text mode with UTF-8 encoding.
    """
    markitdown = MarkItDown()

    # Test XLSX processing
    with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xlsx")
        validate_strings(result, XLSX_TEST_STRINGS)

    # Test XLS processing
    with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xls")
        # Strip Markdown escape backslashes once, then check every expected string.
        text_content = result.text_content.replace("\\", "")
        for test_string in XLS_TEST_STRINGS:
            assert test_string in text_content

    # Test DOCX processing
    with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".docx")
        validate_strings(result, DOCX_TEST_STRINGS)

    # Test DOCX processing, with comments
    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
        result = markitdown.convert(
            f,
            file_extension=".docx",
            style_map="comment-reference => ",
        )
        validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
        result = markitdown_with_style_map.convert(f, file_extension=".docx")
        validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test PPTX processing
    with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".pptx")
        validate_strings(result, PPTX_TEST_STRINGS)

    # Test HTML processing
    with open(
        os.path.join(TEST_FILES_DIR, "test_blog.html"), "rt", encoding="utf-8"
    ) as f:
        result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL)
        validate_strings(result, BLOG_TEST_STRINGS)

    # Test Wikipedia processing
    with open(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rt", encoding="utf-8"
    ) as f:
        result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL)
        validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)

    # Test Bing processing
    with open(
        os.path.join(TEST_FILES_DIR, "test_serp.html"), "rt", encoding="utf-8"
    ) as f:
        result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL)
        validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)

    # Test RSS processing
    with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xml")
        text_content = result.text_content.replace("\\", "")
        for test_string in RSS_TEST_STRINGS:
            assert test_string in text_content

    # Test MSG (Outlook email) processing
    with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f:
        result = markitdown.convert(f, file_extension=".msg")
        validate_strings(result, MSG_TEST_STRINGS)

    # Test JSON processing
    with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f:
        result = markitdown.convert(f, file_extension=".json")
        validate_strings(result, JSON_TEST_STRINGS)
@pytest.mark.skipif( @pytest.mark.skipif(
skip_exiftool, skip_exiftool,
reason="do not run if exiftool is not installed", reason="do not run if exiftool is not installed",
@@ -328,7 +409,8 @@ def test_markitdown_llm() -> None:
if __name__ == "__main__": if __name__ == "__main__":
"""Runs this file's tests from the command line.""" """Runs this file's tests from the command line."""
test_markitdown_remote() test_markitdown_remote()
test_markitdown_local() test_markitdown_local_paths()
test_markitdown_local_objects()
test_markitdown_exiftool() test_markitdown_exiftool()
# test_markitdown_llm() # test_markitdown_llm()
print("All tests passed!") print("All tests passed!")