ran unit tests locally

formatting
updated readme
2025-02-27 16:44:50 -05:00 · 2025-02-27 15:08:10 -05:00 · 2025-02-27 15:07:46 -05:00 · 2025-02-27 15:05:29 -05:00 · 2025-02-27 14:55:49 -05:00 · 2025-02-27 11:27:05 -05:00
22 changed files with 361 additions and 94 deletions
--- a/README.md
+++ b/README.md
@@ -97,6 +97,25 @@ result = md.convert("test.pdf")
 print(result.text_content)
 ```
 MarkItDown also supports converting file objects directly:
 ```python
 from markitdown import MarkItDown
 md = MarkItDown()
 # Providing the file extension when converting via file objects is recommended for the most consistent results
 # Binary Mode
 with open("test.docx", 'rb') as file:
    result = md.convert(file, file_extension=".docx")
    print(result.text_content)
 # Non-Binary Mode
 with open("sample.ipynb", 'rt', encoding="utf-8") as file:
    result = md.convert(file, file_extension=".ipynb")
    print(result.text_content)
 ```
 To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
 ```python
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -10,6 +10,7 @@ from typing import Any, List, Optional, Union
 from pathlib import Path
 from urllib.parse import urlparse
 from warnings import warn
 from io import BufferedIOBase, TextIOBase, BytesIO
 # File-format detection
 import puremagic
@@ -36,6 +37,7 @@ from .converters import (
    OutlookMsgConverter,
    ZipConverter,
    DocumentIntelligenceConverter,
    ConverterInput,
 )
 from ._exceptions import (
@@ -173,14 +175,15 @@ class MarkItDown:
            warn("Plugins converters are already enabled.", RuntimeWarning)
    def convert(
-        self, source: Union[str, requests.Response, Path], **kwargs: Any
+        self,
        source: Union[str, requests.Response, Path, BufferedIOBase, TextIOBase],
        **kwargs: Any,
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
-            - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
+            - source: a local path (as a string or pathlib Path object), a url string, a requests.response object, or a file object (TextIO or BinaryIO)
            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
        """
        # Local path or url
        if isinstance(source, str):
            if (
@@ -196,6 +199,9 @@ class MarkItDown:
            return self.convert_response(source, **kwargs)
        elif isinstance(source, Path):
            return self.convert_local(source, **kwargs)
        # File object
        elif isinstance(source, BufferedIOBase) or isinstance(source, TextIOBase):
            return self.convert_file_object(source, **kwargs)
    def convert_local(
        self, path: Union[str, Path], **kwargs: Any
@@ -210,11 +216,33 @@ class MarkItDown:
        base, ext = os.path.splitext(path)
        self._append_ext(extensions, ext)
-        for g in self._guess_ext_magic(path):
+        for g in self._guess_ext_magic(source=path):
            self._append_ext(extensions, g)
        # Create the ConverterInput object
        input = ConverterInput(input_type="filepath", filepath=path)
        # Convert
-        return self._convert(path, extensions, **kwargs)
+        return self._convert(input, extensions, **kwargs)
    def convert_file_object(
        self, file_object: Union[BufferedIOBase, TextIOBase], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Convert an already-open file object to Markdown.

        Args:
            - file_object: an open binary (BufferedIOBase) or text (TextIOBase) stream
            - file_extension (via kwargs): optional extension hint; when supplied, puremagic guessing is skipped
        """
        # Prepare a list of extensions to try (in order of priority)
        hinted_ext = kwargs.get("file_extension")
        extensions = [] if hinted_ext is None else [hinted_ext]

        # TODO: Currently, there are some ongoing issues with passing direct file objects to puremagic
        # (incorrect guesses, unsupported file type errors, etc.), so puremagic is only consulted
        # as a last resort when the caller provided no extension.
        if not extensions:
            for guess in self._guess_ext_magic(source=file_object):
                self._append_ext(extensions, guess)

        # Wrap the stream so converters receive a uniform ConverterInput, then convert
        converter_input = ConverterInput(input_type="object", file_object=file_object)
        return self._convert(converter_input, extensions, **kwargs)
    # TODO what should stream's type be?
    def convert_stream(
@@ -238,11 +266,14 @@ class MarkItDown:
            fh.close()
            # Use puremagic to check for more extension options
-            for g in self._guess_ext_magic(temp_path):
+            for g in self._guess_ext_magic(source=temp_path):
                self._append_ext(extensions, g)
            # Create the ConverterInput object
            input = ConverterInput(input_type="filepath", filepath=temp_path)
            # Convert
-            result = self._convert(temp_path, extensions, **kwargs)
+            result = self._convert(input, extensions, **kwargs)
        # Clean up
        finally:
            try:
@@ -294,11 +325,14 @@ class MarkItDown:
            fh.close()
            # Use puremagic to check for more extension options
-            for g in self._guess_ext_magic(temp_path):
+            for g in self._guess_ext_magic(source=temp_path):
                self._append_ext(extensions, g)
            # Create the ConverterInput object
            input = ConverterInput(input_type="filepath", filepath=temp_path)
            # Convert
-            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
+            result = self._convert(input, extensions, url=response.url, **kwargs)
        # Clean up
        finally:
            try:
@@ -310,10 +344,9 @@ class MarkItDown:
        return result
    def _convert(
-        self, local_path: str, extensions: List[Union[str, None]], **kwargs
+        self, input: ConverterInput, extensions: List[Union[str, None]], **kwargs
    ) -> DocumentConverterResult:
        error_trace = ""
        # Create a copy of the page_converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
@@ -348,7 +381,7 @@ class MarkItDown:
                # If we hit an error log it and keep trying
                try:
-                    res = converter.convert(local_path, **_kwargs)
+                    res = converter.convert(input, **_kwargs)
                except Exception:
                    error_trace = ("\n\n" + traceback.format_exc()).strip()
@@ -365,12 +398,12 @@ class MarkItDown:
        # If we got this far without success, report any exceptions
        if len(error_trace) > 0:
            raise FileConversionException(
-                f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
+                f"Could not convert '{input.filepath}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
            )
        # Nothing can handle it!
        raise UnsupportedFormatException(
-            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
+            f"Could not convert '{input.filepath}' to Markdown. The formats {extensions} are not supported."
        )
    def _append_ext(self, extensions, ext):
@@ -383,29 +416,38 @@ class MarkItDown:
        # if ext not in extensions:
        extensions.append(ext)
-    def _guess_ext_magic(self, path):
+    def _guess_ext_magic(self, source):
        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
        # Use puremagic to guess
        try:
-            guesses = puremagic.magic_file(path)
+            guesses = []
-            # Fix for: https://github.com/microsoft/markitdown/issues/222
+            # Guess extensions for filepaths
-            # If there are no guesses, then try again after trimming leading ASCII whitespaces.
+            if isinstance(source, str):
-            # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
+                guesses = puremagic.magic_file(source)
-            # (space, tab, newline, carriage return, vertical tab, form feed).
+
-            if len(guesses) == 0:
+                # Fix for: https://github.com/microsoft/markitdown/issues/222
-                with open(path, "rb") as file:
+                # If there are no guesses, then try again after trimming leading ASCII whitespaces.
-                    while True:
+                # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
-                        char = file.read(1)
+                # (space, tab, newline, carriage return, vertical tab, form feed).
-                        if not char:  # End of file
+                if len(guesses) == 0:
-                            break
+                    with open(source, "rb") as file:
-                        if not char.isspace():
+                        while True:
-                            file.seek(file.tell() - 1)
+                            char = file.read(1)
-                            break
+                            if not char:  # End of file
-                    try:
+                                break
-                        guesses = puremagic.magic_stream(file)
+                            if not char.isspace():
-                    except puremagic.main.PureError:
+                                file.seek(file.tell() - 1)
-                        pass
+                                break
                        try:
                            guesses = puremagic.magic_stream(file)
                        except puremagic.main.PureError:
                            pass
            # Guess extensions for file objects. Note that puremagic's magic_stream function requires a BytesIO-like file source
            # TODO: Figure out how to guess extensions for TextIO-like file sources (manually converting to BytesIO does not work)
            elif isinstance(source, BufferedIOBase):
                guesses = puremagic.magic_stream(source)
            extensions = list()
            for g in guesses:
--- a/packages/markitdown/src/markitdown/converters/init.py
+++ b/packages/markitdown/src/markitdown/converters/init.py
@@ -20,6 +20,7 @@ from ._mp3_converter import Mp3Converter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
 from ._converter_input import ConverterInput
 __all__ = [
    "DocumentConverter",
@@ -42,4 +43,5 @@ __all__ = [
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
    "ConverterInput",
 ]
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
 from ._base import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
 from ._converter_input import ConverterInput
 class BingSerpConverter(DocumentConverter):
@@ -21,7 +22,9 @@ class BingSerpConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a Bing SERP
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
@@ -36,8 +39,9 @@ class BingSerpConverter(DocumentConverter):
        # Parse the file
        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        soup = BeautifulSoup(file_obj.read(), "html.parser")
        file_obj.close()
        # Clean up some formatting
        for tptt in soup.find_all(class_="tptt"):
--- a/packages/markitdown/src/markitdown/converters/_converter_input.py
+++ b/packages/markitdown/src/markitdown/converters/_converter_input.py
@@ -0,0 +1,30 @@
 from typing import Any, Union
 class ConverterInput:
    """
    Wrapper for inputs to converter functions.

    An input is either a path on disk (input_type="filepath") or an
    already-open file object (input_type="object"). Converters call
    read_file() to obtain a readable handle without needing to know
    which of the two they were given.
    """
    def __init__(
        self,
        input_type: str = "filepath",
        filepath: Union[str, None] = None,
        file_object: Union[Any, None] = None,
    ):
        """
        Args:
            - input_type: either "filepath" or "object"
            - filepath: path to open when input_type is "filepath"
            - file_object: open stream to hand out when input_type is "object"

        Raises:
            ValueError: if input_type is not "filepath" or "object"
        """
        if input_type not in ["filepath", "object"]:
            raise ValueError(f"Invalid converter input type: {input_type}")
        self.input_type = input_type
        self.filepath = filepath
        self.file_object = file_object
    def read_file(
        self,
        mode: str = "rb",
        encoding: Union[str, None] = None,
    ) -> Any:
        """
        Return a readable handle for this input.

        For "object" inputs the stored file object is returned as-is: mode and
        encoding are ignored, and closing the returned handle closes the
        stream that was passed to the constructor. For "filepath" inputs a new
        handle is opened with the given mode and encoding, and the caller is
        responsible for closing it.

        Raises:
            ValueError: if the source matching input_type was never provided
        """
        if self.input_type == "object":
            # Fail here with a clear message instead of returning None and
            # letting the converter trip over it later.
            if self.file_object is None:
                raise ValueError(
                    "Converter input of type 'object' has no file object to read."
                )
            return self.file_object
        # Without this guard, open(None) raises an opaque TypeError.
        if self.filepath is None:
            raise ValueError(
                "Converter input of type 'filepath' has no file path to open."
            )
        return open(self.filepath, mode=mode, encoding=encoding)
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -11,6 +11,7 @@ from azure.ai.documentintelligence.models import (
 from azure.identity import DefaultAzureCredential
 from ._base import DocumentConverter, DocumentConverterResult
 from ._converter_input import ConverterInput
 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
@@ -37,10 +38,9 @@ class DocumentIntelligenceConverter(DocumentConverter):
            api_version=self.api_version,
            credential=DefaultAzureCredential(),
        )
        self._priority = priority
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if extension is not supported by Document Intelligence
        extension = kwargs.get("file_extension", "")
@@ -60,9 +60,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
        if extension.lower() not in docintel_extensions:
            return None
-        # Get the bytestring for the local path
+        # Get the bytestring from the converter input
-        with open(local_path, "rb") as f:
+        file_obj = input.read_file(mode="rb")
-            file_bytes = f.read()
+        file_bytes = file_obj.read()
        file_obj.close()
        # Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx)
        if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]:
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -8,6 +8,7 @@ from ._base import (
 from ._base import DocumentConverter
 from ._html_converter import HtmlConverter
 from ._converter_input import ConverterInput
 class DocxConverter(HtmlConverter):
@@ -20,18 +21,20 @@ class DocxConverter(HtmlConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a DOCX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".docx":
            return None
        result = None
-        with open(local_path, "rb") as docx_file:
+        style_map = kwargs.get("style_map", None)
-            style_map = kwargs.get("style_map", None)
+        file_obj = input.read_file(mode="rb")
-
+        result = mammoth.convert_to_html(file_obj, style_map=style_map)
-            result = mammoth.convert_to_html(docx_file, style_map=style_map)
+        file_obj.close()
-            html_content = result.value
+        html_content = result.value
-            result = self._convert(html_content)
+        result = self._convert(html_content)
        return result
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
 from ._base import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
 from ._converter_input import ConverterInput
 class HtmlConverter(DocumentConverter):
@@ -14,7 +15,7 @@ class HtmlConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not html
        extension = kwargs.get("file_extension", "")
@@ -22,8 +23,9 @@ class HtmlConverter(DocumentConverter):
            return None
        result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
-            result = self._convert(fh.read())
+        result = self._convert(file_obj.read())
        file_obj.close()
        return result
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -1,6 +1,7 @@
 from typing import Union
 from ._base import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter
 from ._converter_input import ConverterInput
 class ImageConverter(MediaConverter):
@@ -13,7 +14,9 @@ class ImageConverter(MediaConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not an image
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
@@ -21,8 +24,9 @@ class ImageConverter(MediaConverter):
        md_content = ""
-        # Add metadata
+        # Add metadata if a local path is provided
-        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
+        if input.input_type == "filepath":
            metadata = self._get_metadata(input.filepath, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
@@ -47,7 +51,7 @@ class ImageConverter(MediaConverter):
            md_content += (
                "\n# Description:\n"
                + self._get_llm_description(
-                    local_path,
+                    input,
                    extension,
                    llm_client,
                    llm_model,
@@ -61,17 +65,20 @@ class ImageConverter(MediaConverter):
            text_content=md_content,
        )
-    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
+    def _get_llm_description(
        self, input: ConverterInput, extension, client, model, prompt=None
    ):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."
        data_uri = ""
-        with open(local_path, "rb") as image_file:
+        content_type, encoding = mimetypes.guess_type("_dummy" + extension)
-            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
+        if content_type is None:
-            if content_type is None:
+            content_type = "image/jpeg"
-                content_type = "image/jpeg"
+        image_file = input.read_file(mode="rb")
-            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
+        image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
-            data_uri = f"data:{content_type};base64,{image_base64}"
+        image_file.close()
        data_uri = f"data:{content_type};base64,{image_base64}"
        messages = [
            {
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -7,6 +7,7 @@ from ._base import (
 )
 from .._exceptions import FileConversionException
 from ._converter_input import ConverterInput
 class IpynbConverter(DocumentConverter):
@@ -18,7 +19,7 @@ class IpynbConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not ipynb
        extension = kwargs.get("file_extension", "")
@@ -27,9 +28,10 @@ class IpynbConverter(DocumentConverter):
        # Parse and convert the notebook
        result = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
-            notebook_content = json.load(fh)
+        notebook_content = json.load(file_obj)
-            result = self._convert(notebook_content)
+        file_obj.close()
        result = self._convert(notebook_content)
        return result
--- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@@ -1,8 +1,10 @@
 import tempfile
 import os
 from typing import Union
 from ._base import DocumentConverter, DocumentConverterResult
 from ._wav_converter import WavConverter
 from warnings import resetwarnings, catch_warnings
 from ._converter_input import ConverterInput
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
@@ -33,12 +35,19 @@ class Mp3Converter(WavConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a MP3
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".mp3":
            return None
        # Bail if a local path was not provided
        if input.input_type != "filepath":
            return None
        local_path = input.filepath
        md_content = ""
        # Add metadata
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@@ -1,6 +1,7 @@
 import olefile
 from typing import Any, Union
 from ._base import DocumentConverter, DocumentConverterResult
 from ._converter_input import ConverterInput
 class OutlookMsgConverter(DocumentConverter):
@@ -17,7 +18,7 @@ class OutlookMsgConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a MSG file
        extension = kwargs.get("file_extension", "")
@@ -25,7 +26,9 @@ class OutlookMsgConverter(DocumentConverter):
            return None
        try:
-            msg = olefile.OleFileIO(local_path)
+            file_obj = input.read_file(mode="rb")
            msg = olefile.OleFileIO(file_obj)
            # Extract email metadata
            md_content = "# Email Message\n\n"
@@ -49,6 +52,7 @@ class OutlookMsgConverter(DocumentConverter):
                md_content += body
            msg.close()
            file_obj.close()
            return DocumentConverterResult(
                title=headers.get("Subject"), text_content=md_content.strip()
@@ -56,7 +60,7 @@ class OutlookMsgConverter(DocumentConverter):
        except Exception as e:
            raise FileConversionException(
-                f"Could not convert MSG file '{local_path}': {str(e)}"
+                f"Could not convert MSG file '{input.filepath}': {str(e)}"
            )
    def _get_stream_data(
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,7 +1,9 @@
 import pdfminer
 import pdfminer.high_level
 from typing import Union
 from io import StringIO
 from ._base import DocumentConverter, DocumentConverterResult
 from ._converter_input import ConverterInput
 class PdfConverter(DocumentConverter):
@@ -14,13 +16,20 @@ class PdfConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a PDF
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".pdf":
            return None
        output = StringIO()
        file_obj = input.read_file(mode="rb")
        pdfminer.high_level.extract_text_to_fp(file_obj, output)
        file_obj.close()
        return DocumentConverterResult(
            title=None,
-            text_content=pdfminer.high_level.extract_text(local_path),
+            text_content=output.getvalue(),
        )
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -1,9 +1,10 @@
 import mimetypes
-from charset_normalizer import from_path
+from charset_normalizer import from_path, from_bytes
 from typing import Any, Union
 from ._base import DocumentConverter, DocumentConverterResult
 from ._converter_input import ConverterInput
 class PlainTextConverter(DocumentConverter):
@@ -15,8 +16,11 @@ class PlainTextConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Read file object from input
        file_obj = input.read_file(mode="rb")
        # Guess the content type from any file extension that might be around
        content_type, _ = mimetypes.guess_type(
            "__placeholder" + kwargs.get("file_extension", "")
@@ -31,7 +35,8 @@ class PlainTextConverter(DocumentConverter):
        ):
            return None
-        text_content = str(from_path(local_path).best())
+        text_content = str(from_bytes(file_obj.read()).best())
        file_obj.close()
        return DocumentConverterResult(
            title=None,
            text_content=text_content,
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -7,6 +7,7 @@ from typing import Union
 from ._base import DocumentConverterResult, DocumentConverter
 from ._html_converter import HtmlConverter
 from ._converter_input import ConverterInput
 class PptxConverter(HtmlConverter):
@@ -48,7 +49,9 @@ class PptxConverter(HtmlConverter):
        )
        return response.choices[0].message.content
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a PPTX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".pptx":
@@ -56,7 +59,10 @@ class PptxConverter(HtmlConverter):
        md_content = ""
-        presentation = pptx.Presentation(local_path)
+        file_obj = input.read_file(mode="rb")
        presentation = pptx.Presentation(file_obj)
        file_obj.close()
        slide_num = 0
        for slide in presentation.slides:
            slide_num += 1
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
 from ._markdownify import _CustomMarkdownify
 from ._base import DocumentConverter, DocumentConverterResult
 from ._converter_input import ConverterInput
 class RssConverter(DocumentConverter):
@@ -15,16 +16,21 @@ class RssConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs
+        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not RSS type
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".xml", ".rss", ".atom"]:
            return None
        # Read file object from input
        file_obj = input.read_file(mode="rb")
        try:
-            doc = minidom.parse(local_path)
+            doc = minidom.parse(file_obj)
        except BaseException as _:
            return None
        file_obj.close()
        result = None
        if doc.getElementsByTagName("rss"):
            # A RSS feed must have a root element of <rss>
--- a/packages/markitdown/src/markitdown/converters/_wav_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py
@@ -1,6 +1,7 @@
 from typing import Union
 from ._base import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter
 from ._converter_input import ConverterInput
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
@@ -22,12 +23,19 @@ class WavConverter(MediaConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a WAV
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".wav":
            return None
        # Bail if a local path was not provided
        if input.input_type != "filepath":
            return None
        local_path = input.filepath
        md_content = ""
        # Add metadata
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
 from ._base import DocumentConverter, DocumentConverterResult
 from ._markdownify import _CustomMarkdownify
 from ._converter_input import ConverterInput
 class WikipediaConverter(DocumentConverter):
@@ -16,7 +17,7 @@ class WikipediaConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not Wikipedia
        extension = kwargs.get("file_extension", "")
@@ -28,8 +29,9 @@ class WikipediaConverter(DocumentConverter):
        # Parse the file
        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        soup = BeautifulSoup(file_obj.read(), "html.parser")
        file_obj.close()
        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -4,6 +4,7 @@ import pandas as pd
 from ._base import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 from ._converter_input import ConverterInput
 class XlsxConverter(HtmlConverter):
@@ -16,13 +17,18 @@ class XlsxConverter(HtmlConverter):
    ):
        super().__init__(priority=priority)
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a XLSX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".xlsx":
            return None
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+        file_obj = input.read_file(mode="rb")
        sheets = pd.read_excel(file_obj, sheet_name=None, engine="openpyxl")
        file_obj.close()
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
@@ -40,13 +46,18 @@ class XlsConverter(HtmlConverter):
    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
    """
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(
        self, input: ConverterInput, **kwargs
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a XLS
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".xls":
            return None
-        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
+        file_obj = input.read_file(mode="rb")
        sheets = pd.read_excel(file_obj, sheet_name=None, engine="xlrd")
        file_obj.close()
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -1,10 +1,12 @@
 import re
 import json
 from typing import Any, Union, Dict, List
 from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup
 from ._base import DocumentConverter, DocumentConverterResult
 from ._converter_input import ConverterInput
 # Optional YouTube transcription support
@@ -25,7 +27,7 @@ class YouTubeConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not YouTube
        extension = kwargs.get("file_extension", "")
@@ -37,8 +39,9 @@ class YouTubeConverter(DocumentConverter):
        # Parse the file
        soup = None
-        with open(local_path, "rt", encoding="utf-8") as fh:
+        file_obj = input.read_file(mode="rt", encoding="utf-8")
-            soup = BeautifulSoup(fh.read(), "html.parser")
+        soup = BeautifulSoup(file_obj.read(), "html.parser")
        file_obj.close()
        # Read the meta tags
        assert soup.title is not None and soup.title.string is not None
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@@ -4,6 +4,7 @@ import shutil
 from typing import Any, Union
 from ._base import DocumentConverter, DocumentConverterResult
 from ._converter_input import ConverterInput
 class ZipConverter(DocumentConverter):
@@ -51,13 +52,18 @@ class ZipConverter(DocumentConverter):
        super().__init__(priority=priority)
    def convert(
-        self, local_path: str, **kwargs: Any
+        self, input: ConverterInput, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        # Bail if not a ZIP
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".zip":
            return None
        # Bail if a local path is not provided
        if input.input_type != "filepath":
            return None
        local_path = input.filepath
        # Get parent converters list if available
        parent_converters = kwargs.get("_parent_converters", [])
        if not parent_converters:
@@ -111,7 +117,11 @@ class ZipConverter(DocumentConverter):
                        if isinstance(converter, ZipConverter):
                            continue
-                        result = converter.convert(file_path, **file_kwargs)
+                        # Create a ConverterInput for the parent converter and attempt conversion
                        input = ConverterInput(
                            input_type="filepath", filepath=file_path
                        )
                        result = converter.convert(input, **file_kwargs)
                        if result is not None:
                            md_content += f"\n## File: {relative_path}\n\n"
                            md_content += result.text_content + "\n\n"
--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@@ -189,7 +189,7 @@ def test_markitdown_remote() -> None:
    #     assert test_string in result.text_content
-def test_markitdown_local() -> None:
+def test_markitdown_local_paths() -> None:
    markitdown = MarkItDown()
    # Test XLSX processing
@@ -272,6 +272,87 @@ def test_markitdown_local() -> None:
    assert "# Test" in result.text_content
 def test_markitdown_local_objects() -> None:
    """Check conversion of local test files passed as open file objects.

    Each case opens a fixture from TEST_FILES_DIR, either in binary mode
    ("rb") or in UTF-8 text mode ("rt"), hands the open handle to
    ``MarkItDown.convert`` together with an explicit ``file_extension``,
    and verifies that the expected strings appear in the converted text.
    """
    markitdown = MarkItDown()

    # Test XLSX processing
    with open(os.path.join(TEST_FILES_DIR, "test.xlsx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xlsx")
        validate_strings(result, XLSX_TEST_STRINGS)

    # Test XLS processing
    with open(os.path.join(TEST_FILES_DIR, "test.xls"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xls")
        # Strip backslashes once, outside the loop, before matching.
        text_content = result.text_content.replace("\\", "")
        for test_string in XLS_TEST_STRINGS:
            assert test_string in text_content

    # Test DOCX processing
    with open(os.path.join(TEST_FILES_DIR, "test.docx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".docx")
        validate_strings(result, DOCX_TEST_STRINGS)

    # Test DOCX processing, with comments
    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
        result = markitdown.convert(
            f,
            file_extension=".docx",
            style_map="comment-reference => ",
        )
        validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test DOCX processing, with comments and setting style_map on init
    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
    with open(os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), "rb") as f:
        result = markitdown_with_style_map.convert(f, file_extension=".docx")
        validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test PPTX processing
    with open(os.path.join(TEST_FILES_DIR, "test.pptx"), "rb") as f:
        result = markitdown.convert(f, file_extension=".pptx")
        validate_strings(result, PPTX_TEST_STRINGS)

    # Test HTML processing (text-mode file object)
    with open(
        os.path.join(TEST_FILES_DIR, "test_blog.html"), "rt", encoding="utf-8"
    ) as f:
        result = markitdown.convert(f, file_extension=".html", url=BLOG_TEST_URL)
        validate_strings(result, BLOG_TEST_STRINGS)

    # Test Wikipedia processing (text-mode file object)
    # validate_strings is given the result object itself, so no separate
    # local copy of the text is needed here.
    with open(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), "rt", encoding="utf-8"
    ) as f:
        result = markitdown.convert(f, file_extension=".html", url=WIKIPEDIA_TEST_URL)
        validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)

    # Test Bing processing (text-mode file object)
    with open(
        os.path.join(TEST_FILES_DIR, "test_serp.html"), "rt", encoding="utf-8"
    ) as f:
        result = markitdown.convert(f, file_extension=".html", url=SERP_TEST_URL)
        validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)

    # Test RSS processing
    with open(os.path.join(TEST_FILES_DIR, "test_rss.xml"), "rb") as f:
        result = markitdown.convert(f, file_extension=".xml")
        text_content = result.text_content.replace("\\", "")
        for test_string in RSS_TEST_STRINGS:
            assert test_string in text_content

    # Test MSG (Outlook email) processing
    with open(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"), "rb") as f:
        result = markitdown.convert(f, file_extension=".msg")
        validate_strings(result, MSG_TEST_STRINGS)

    # Test JSON processing
    with open(os.path.join(TEST_FILES_DIR, "test.json"), "rb") as f:
        result = markitdown.convert(f, file_extension=".json")
        validate_strings(result, JSON_TEST_STRINGS)
@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
@@ -328,7 +409,8 @@ def test_markitdown_llm() -> None:
 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
    test_markitdown_remote()
-    test_markitdown_local()
+    test_markitdown_local_paths()
    test_markitdown_local_objects()
    test_markitdown_exiftool()
    # test_markitdown_llm()
    print("All tests passed!")
Author	SHA1	Message	Date
Kenny Zhang	4e0a10ecf3	ran unit tests locally	2025-02-27 16:44:50 -05:00
Kenny Zhang	950b135da6	formatting	2025-02-27 15:08:10 -05:00
Kenny Zhang	b671345bb9	updated readme	2025-02-27 15:07:46 -05:00
Kenny Zhang	d9a92f7f06	added file obj unit tests for rss and json	2025-02-27 15:05:29 -05:00
Kenny Zhang	db0c8acbaf	added file obj support to rss and plain text converters	2025-02-27 14:55:49 -05:00
Kenny Zhang	08330c2ac3	added core unit tests for file obj support	2025-02-27 11:27:05 -05:00
Kenny Zhang	4afc1fe886	added non-binary example to README	2025-02-21 13:31:37 -05:00
Kenny Zhang	b0044720da	updated docs	2025-02-20 16:56:47 -05:00
Kenny Zhang	07a28d4f00	black formatting	2025-02-20 16:49:37 -05:00
Kenny Zhang	b8b3897952	modify ext guesser	2025-02-20 16:47:37 -05:00
Kenny Zhang	395ce2d301	close file object after using	2025-02-20 13:54:51 -05:00
Kenny Zhang	808401a331	added conversion path for file object in central class	2025-02-19 17:02:51 -05:00
Kenny Zhang	e75f3f6f5b	local path inputs to MarkitDown class adhere to new converterinput structure	2025-02-19 15:16:45 -05:00
Kenny Zhang	8e950325d2	refactored remaining converters	2025-02-19 14:01:43 -05:00
Kenny Zhang	096fef3d5f	refactored more converters to support input class	2025-02-19 13:34:28 -05:00
Kenny Zhang	52cbff061a	begin refactoring converter classes	2025-02-19 11:48:00 -05:00
Kenny Zhang	0027e6d425	added wrapper class for converter file input	2025-02-18 12:44:18 -05:00
Kenny Zhang	63a7bafadd	removed redundant priority setting	2025-02-18 12:18:49 -05:00