Update converter API, user streams rather than file paths (#1088)

* Updated DocumentConverter interface * Updated all DocumentConverter classes * Added support for various new audio files. * Updated sample plugin to new DocumentConverter interface. * Updated project README with notes about changes, and use-cases. * Updated DocumentConverter documentation. * Move priority to outside DocumentConverter, allowing them to be reprioritized, and keeping the DocumentConverter interface simple. --------- Co-authored-by: Kenny Zhang <kzhang678@gmail.com>
2025-03-05 21:16:55 -08:00
parent 1d2f231146
commit e921497f79
47 changed files with 2329 additions and 1264 deletions
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -6,8 +6,10 @@ import sys
 import tempfile
 import warnings
 import traceback
+import io
+from dataclasses import dataclass
 from importlib.metadata import entry_points
-from typing import Any, List, Optional, Union
+from typing import Any, List, Optional, Union, BinaryIO
 from pathlib import Path
 from urllib.parse import urlparse
 from warnings import warn
@@ -16,9 +18,9 @@ from warnings import warn
 import puremagic
 import requests

+from ._stream_info import StreamInfo, _guess_stream_info_from_stream
+
 from .converters import (
-    DocumentConverter,
-    DocumentConverterResult,
    PlainTextConverter,
    HtmlConverter,
    RssConverter,
@@ -32,26 +34,34 @@ from .converters import (
    XlsConverter,
    PptxConverter,
    ImageConverter,
-    WavConverter,
-    Mp3Converter,
+    AudioConverter,
    OutlookMsgConverter,
    ZipConverter,
    DocumentIntelligenceConverter,
 )

+from ._base_converter import DocumentConverter, DocumentConverterResult
+
 from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
    FailedConversionAttempt,
 )

-# Override mimetype for csv to fix issue on windows
-mimetypes.add_type("text/csv", ".csv")

-_plugins: Union[None | List[Any]] = None
+# Lower priority values are tried first.
+PRIORITY_SPECIFIC_FILE_FORMAT = (
+    0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
+)
+PRIORITY_GENERIC_FILE_FORMAT = (
+    10.0  # Near catch-all converters for mimetypes like text/*, etc.
+)


-def _load_plugins() -> Union[None | List[Any]]:
+_plugins: List[Any] = []
+
+
+def _load_plugins() -> List[Any]:
    """Lazy load plugins, exiting early if already loaded."""
    global _plugins

@@ -71,6 +81,14 @@ def _load_plugins() -> Union[None | List[Any]]:
    return _plugins


+@dataclass(kw_only=True, frozen=True)
+class ConverterRegistration:
+    """A registration of a converter with its priority and other metadata."""
+
+    converter: DocumentConverter
+    priority: float
+
+
 class MarkItDown:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""
@@ -92,13 +110,13 @@ class MarkItDown:
            self._requests_session = requests_session

        # TODO - remove these (see enable_builtins)
-        self._llm_client = None
-        self._llm_model = None
-        self._exiftool_path = None
-        self._style_map = None
+        self._llm_client: Any = None
+        self._llm_model: Union[str | None] = None
+        self._exiftool_path: Union[str | None] = None
+        self._style_map: Union[str | None] = None

        # Register the converters
-        self._page_converters: List[DocumentConverter] = []
+        self._converters: List[ConverterRegistration] = []

        if (
            enable_builtins is None or enable_builtins
@@ -126,9 +144,15 @@ class MarkItDown:
            # Register converters for successful browsing operations
            # Later registrations are tried first / take higher priority than earlier registrations
            # To this end, the most specific converters should appear below the most generic converters
-            self.register_converter(PlainTextConverter())
-            self.register_converter(ZipConverter())
-            self.register_converter(HtmlConverter())
+            self.register_converter(
+                PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
+            self.register_converter(
+                HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
+            )
            self.register_converter(RssConverter())
            self.register_converter(WikipediaConverter())
            self.register_converter(YouTubeConverter())
@@ -137,8 +161,7 @@ class MarkItDown:
            self.register_converter(XlsxConverter())
            self.register_converter(XlsConverter())
            self.register_converter(PptxConverter())
-            self.register_converter(WavConverter())
-            self.register_converter(Mp3Converter())
+            self.register_converter(AudioConverter())
            self.register_converter(ImageConverter())
            self.register_converter(IpynbConverter())
            self.register_converter(PdfConverter())
@@ -174,12 +197,17 @@ class MarkItDown:
            warn("Plugins converters are already enabled.", RuntimeWarning)

    def convert(
-        self, source: Union[str, requests.Response, Path], **kwargs: Any
+        self,
+        source: Union[str, requests.Response, Path, BinaryIO],
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        **kwargs: Any,
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
-            - source: can be a string representing a path either as string pathlib path object or url, or a requests.response object
-            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
+            - source: can be a path (str or Path), url, or a requests.response object
+            - stream_info: optional stream info to use for the conversion. If None, infer from source
+            - kwargs: additional arguments to pass to the converter
        """

        # Local path or url
@@ -191,68 +219,120 @@ class MarkItDown:
            ):
                return self.convert_url(source, **kwargs)
            else:
-                return self.convert_local(source, **kwargs)
+                return self.convert_local(source, stream_info=stream_info, **kwargs)
+        # Path object
+        elif isinstance(source, Path):
+            return self.convert_local(source, stream_info=stream_info, **kwargs)
        # Request response
        elif isinstance(source, requests.Response):
            return self.convert_response(source, **kwargs)
-        elif isinstance(source, Path):
-            return self.convert_local(source, **kwargs)
+        # Binary stream
+        elif (
+            hasattr(source, "read")
+            and callable(source.read)
+            and not isinstance(source, io.TextIOBase)
+        ):
+            return self.convert_stream(source, **kwargs)
+        else:
+            raise TypeError(
+                f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
+            )

    def convert_local(
-        self, path: Union[str, Path], **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
+        self,
+        path: Union[str, Path],
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
+        url: Optional[str] = None,  # Deprecated -- use stream_info
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
        if isinstance(path, Path):
            path = str(path)
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []

-        # Get extension alternatives from the path and puremagic
-        base, ext = os.path.splitext(path)
-        self._append_ext(extensions, ext)
+        # Build a base StreamInfo object from which to start guesses
+        base_stream_info = StreamInfo(
+            local_path=path,
+            extension=os.path.splitext(path)[1],
+            filename=os.path.basename(path),
+        )

-        for g in self._guess_ext_magic(path):
-            self._append_ext(extensions, g)
+        # Extend the base_stream_info with any additional info from the arguments
+        if stream_info is not None:
+            base_stream_info = base_stream_info.copy_and_update(stream_info)

-        # Convert
-        return self._convert(path, extensions, **kwargs)
+        if file_extension is not None:
+            # Deprecated -- use stream_info
+            base_stream_info = base_stream_info.copy_and_update(
+                extension=file_extension
+            )
+
+        if url is not None:
+            # Deprecated -- use stream_info
+            base_stream_info = base_stream_info.copy_and_update(url=url)
+
+        with open(path, "rb") as fh:
+            # Prepare a list of configurations to try, starting with the base_stream_info
+            guesses: List[StreamInfo] = [base_stream_info]
+            for guess in _guess_stream_info_from_stream(
+                file_stream=fh, filename_hint=path
+            ):
+                guesses.append(base_stream_info.copy_and_update(guess))
+            return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs)

-    # TODO what should stream's type be?
    def convert_stream(
-        self, stream: Any, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: deal with kwargs
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
+        self,
+        stream: BinaryIO,
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
+        url: Optional[str] = None,  # Deprecated -- use stream_info
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        guesses: List[StreamInfo] = []

-        # Save the file locally to a temporary file. It will be deleted before this method exits
-        handle, temp_path = tempfile.mkstemp()
-        fh = os.fdopen(handle, "wb")
-        result = None
-        try:
-            # Write to the temporary file
-            content = stream.read()
-            if isinstance(content, str):
-                fh.write(content.encode("utf-8"))
+        # Do we have anything on which to base a guess?
+        base_guess = None
+        if stream_info is not None or file_extension is not None or url is not None:
+            # Start with a non-Null base guess
+            if stream_info is None:
+                base_guess = StreamInfo()
            else:
-                fh.write(content)
-            fh.close()
+                base_guess = stream_info

-            # Use puremagic to check for more extension options
-            for g in self._guess_ext_magic(temp_path):
-                self._append_ext(extensions, g)
+            if file_extension is not None:
+                # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
+                base_guess = base_guess.copy_and_update(extension=file_extension)

-            # Convert
-            result = self._convert(temp_path, extensions, **kwargs)
-        # Clean up
-        finally:
-            try:
-                fh.close()
-            except Exception:
-                pass
-            os.unlink(temp_path)
+            if url is not None:
+                # Deprecated -- use stream_info
+                assert base_guess is not None  # for mypy
+                base_guess = base_guess.copy_and_update(url=url)

-        return result
+        # Append the base guess, if it's non-trivial
+        if base_guess is not None:
+            if base_guess.mimetype is not None or base_guess.extension is not None:
+                guesses.append(base_guess)
+        else:
+            # Create a base guess with no information
+            base_guess = StreamInfo()
+
+        # Create a placeholder filename to help with guessing
+        placeholder_filename = None
+        if base_guess.filename is not None:
+            placeholder_filename = base_guess.filename
+        elif base_guess.extension is not None:
+            placeholder_filename = "placeholder" + base_guess.extension
+
+        # Add guesses based on stream content
+        for guess in _guess_stream_info_from_stream(
+            file_stream=stream, filename_hint=placeholder_filename
+        ):
+            guesses.append(base_guess.copy_and_update(guess))
+
+        # Perform the conversion
+        return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs)

    def convert_url(
        self, url: str, **kwargs: Any
@@ -263,55 +343,94 @@ class MarkItDown:
        return self.convert_response(response, **kwargs)

    def convert_response(
-        self, response: requests.Response, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO fix kwargs type
-        # Prepare a list of extensions to try (in order of priority)
-        ext = kwargs.get("file_extension")
-        extensions = [ext] if ext is not None else []
+        self,
+        response: requests.Response,
+        *,
+        stream_info: Optional[StreamInfo] = None,
+        file_extension: Optional[str] = None,  # Deprecated -- use stream_info
+        url: Optional[str] = None,  # Deprecated -- use stream_info
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # If there is a content-type header, get the mimetype and charset (if present)
+        mimetype: Optional[str] = None
+        charset: Optional[str] = None

-        # Guess from the mimetype
-        content_type = response.headers.get("content-type", "").split(";")[0]
-        self._append_ext(extensions, mimetypes.guess_extension(content_type))
+        if "content-type" in response.headers:
+            parts = response.headers["content-type"].split(";")
+            mimetype = parts.pop(0).strip()
+            for part in parts:
+                if part.strip().startswith("charset="):
+                    _charset = part.split("=")[1].strip()
+                    if len(_charset) > 0:
+                        charset = _charset

-        # Read the content disposition if there is one
-        content_disposition = response.headers.get("content-disposition", "")
-        m = re.search(r"filename=([^;]+)", content_disposition)
-        if m:
-            base, ext = os.path.splitext(m.group(1).strip("\"'"))
-            self._append_ext(extensions, ext)
+        # If there is a content-disposition header, get the filename and possibly the extension
+        filename: Optional[str] = None
+        extension: Optional[str] = None
+        if "content-disposition" in response.headers:
+            m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
+            if m:
+                filename = m.group(1).strip("\"'")
+                _, _extension = os.path.splitext(filename)
+                if len(_extension) > 0:
+                    extension = _extension

-        # Read from the extension from the path
-        base, ext = os.path.splitext(urlparse(response.url).path)
-        self._append_ext(extensions, ext)
+        # If there is still no filename, try to read it from the url
+        if filename is None:
+            parsed_url = urlparse(response.url)
+            _, _extension = os.path.splitext(parsed_url.path)
+            if len(_extension) > 0:  # Looks like this might be a file!
+                filename = os.path.basename(parsed_url.path)
+                extension = _extension

-        # Save the file locally to a temporary file. It will be deleted before this method exits
-        handle, temp_path = tempfile.mkstemp()
-        fh = os.fdopen(handle, "wb")
-        result = None
-        try:
-            # Download the file
-            for chunk in response.iter_content(chunk_size=512):
-                fh.write(chunk)
-            fh.close()
+        # Create an initial guess from all this information
+        base_guess = StreamInfo(
+            mimetype=mimetype,
+            charset=charset,
+            filename=filename,
+            extension=extension,
+            url=response.url,
+        )

-            # Use puremagic to check for more extension options
-            for g in self._guess_ext_magic(temp_path):
-                self._append_ext(extensions, g)
+        # Update with any additional info from the arguments
+        if stream_info is not None:
+            base_guess = base_guess.copy_and_update(stream_info)
+        if file_extension is not None:
+            # Deprecated -- use stream_info
+            base_guess = base_guess.copy_and_update(extension=file_extension)
+        if url is not None:
+            # Deprecated -- use stream_info
+            base_guess = base_guess.copy_and_update(url=url)

-            # Convert
-            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
-        # Clean up
-        finally:
-            try:
-                fh.close()
-            except Exception:
-                pass
-            os.unlink(temp_path)
+        # Add the guess if its non-trivial
+        guesses: List[StreamInfo] = []
+        if base_guess.mimetype is not None or base_guess.extension is not None:
+            guesses.append(base_guess)

-        return result
+        # Read into BytesIO
+        buffer = io.BytesIO()
+        for chunk in response.iter_content(chunk_size=512):
+            buffer.write(chunk)
+        buffer.seek(0)
+
+        # Create a placeholder filename to help with guessing
+        placeholder_filename = None
+        if base_guess.filename is not None:
+            placeholder_filename = base_guess.filename
+        elif base_guess.extension is not None:
+            placeholder_filename = "placeholder" + base_guess.extension
+
+        # Add guesses based on stream content
+        for guess in _guess_stream_info_from_stream(
+            file_stream=buffer, filename_hint=placeholder_filename
+        ):
+            guesses.append(base_guess.copy_and_update(guess))
+
+        # Convert
+        return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)

    def _convert(
-        self, local_path: str, extensions: List[Union[str, None]], **kwargs
+        self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
    ) -> DocumentConverterResult:
        res: Union[None, DocumentConverterResult] = None

@@ -321,19 +440,21 @@ class MarkItDown:
        # Create a copy of the page_converters list, sorted by priority.
        # We do this with each call to _convert because the priority of converters may change between calls.
        # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
-        sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
+        sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
+
+        # Remember the initial stream position so that we can return to it
+        cur_pos = file_stream.tell()
+
+        for stream_info in stream_info_guesses + [StreamInfo()]:
+            for converter_registration in sorted_registrations:
+                converter = converter_registration.converter
+                # Sanity check -- make sure the cur_pos is still the same
+                assert (
+                    cur_pos == file_stream.tell()
+                ), f"File stream position should NOT change between guess iterations"

-        for ext in extensions + [None]:  # Try last with no extension
-            for converter in sorted_converters:
                _kwargs = copy.deepcopy(kwargs)

-                # Overwrite file_extension appropriately
-                if ext is None:
-                    if "file_extension" in _kwargs:
-                        del _kwargs["file_extension"]
-                else:
-                    _kwargs.update({"file_extension": ext})
-
                # Copy any additional global options
                if "llm_client" not in _kwargs and self._llm_client is not None:
                    _kwargs["llm_client"] = self._llm_client
@@ -348,17 +469,40 @@ class MarkItDown:
                    _kwargs["exiftool_path"] = self._exiftool_path

                # Add the list of converters for nested processing
-                _kwargs["_parent_converters"] = self._page_converters
+                _kwargs["_parent_converters"] = self._converters

-                # If we hit an error log it and keep trying
+                # Add legaxy kwargs
+                if stream_info is not None:
+                    if stream_info.extension is not None:
+                        _kwargs["file_extension"] = stream_info.extension
+
+                    if stream_info.url is not None:
+                        _kwargs["url"] = stream_info.url
+
+                # Check if the converter will accept the file, and if so, try to convert it
+                _accepts = False
                try:
-                    res = converter.convert(local_path, **_kwargs)
-                except Exception:
-                    failed_attempts.append(
-                        FailedConversionAttempt(
-                            converter=converter, exc_info=sys.exc_info()
+                    _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
+                except NotImplementedError:
+                    pass
+
+                # accept() should not have changed the file stream position
+                assert (
+                    cur_pos == file_stream.tell()
+                ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
+
+                # Attempt the conversion
+                if _accepts:
+                    try:
+                        res = converter.convert(file_stream, stream_info, **_kwargs)
+                    except Exception:
+                        failed_attempts.append(
+                            FailedConversionAttempt(
+                                converter=converter, exc_info=sys.exc_info()
+                            )
                        )
-                    )
+                    finally:
+                        file_stream.seek(cur_pos)

                if res is not None:
                    # Normalize the content
@@ -366,8 +510,6 @@ class MarkItDown:
                        [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                    )
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
-
-                    # Todo
                    return res

        # If we got this far without success, report any exceptions
@@ -376,61 +518,9 @@ class MarkItDown:

        # Nothing can handle it!
        raise UnsupportedFormatException(
-            f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
+            f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
        )

-    def _append_ext(self, extensions, ext):
-        """Append a unique non-None, non-empty extension to a list of extensions."""
-        if ext is None:
-            return
-        ext = ext.strip()
-        if ext == "":
-            return
-        if ext in extensions:
-            return
-        extensions.append(ext)
-
-    def _guess_ext_magic(self, path):
-        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
-        # Use puremagic to guess
-        try:
-            guesses = puremagic.magic_file(path)
-
-            # Fix for: https://github.com/microsoft/markitdown/issues/222
-            # If there are no guesses, then try again after trimming leading ASCII whitespaces.
-            # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
-            # (space, tab, newline, carriage return, vertical tab, form feed).
-            if len(guesses) == 0:
-                with open(path, "rb") as file:
-                    while True:
-                        char = file.read(1)
-                        if not char:  # End of file
-                            break
-                        if not char.isspace():
-                            file.seek(file.tell() - 1)
-                            break
-                    try:
-                        guesses = puremagic.magic_stream(file)
-                    except puremagic.main.PureError:
-                        pass
-
-            extensions = list()
-            for g in guesses:
-                ext = g.extension.strip()
-                if len(ext) > 0:
-                    if not ext.startswith("."):
-                        ext = "." + ext
-                    if ext not in extensions:
-                        extensions.append(ext)
-            return extensions
-        except FileNotFoundError:
-            pass
-        except IsADirectoryError:
-            pass
-        except PermissionError:
-            pass
-        return []
-
    def register_page_converter(self, converter: DocumentConverter) -> None:
        """DEPRECATED: User register_converter instead."""
        warn(
@@ -439,6 +529,34 @@ class MarkItDown:
        )
        self.register_converter(converter)

-    def register_converter(self, converter: DocumentConverter) -> None:
-        """Register a page text converter."""
-        self._page_converters.insert(0, converter)
+    def register_converter(
+        self,
+        converter: DocumentConverter,
+        *,
+        priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
+    ) -> None:
+        """
+        Register a DocumentConverter with a given priority.
+
+        Priorities work as follows: By default, most converters get priority
+        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
+        is the PlainTextConverter, HtmlConverter, and ZipConverter, which get
+        priority PRIORITY_SPECIFIC_FILE_FORMAT (== 10), with lower values
+        being tried first (i.e., higher priority).
+
+        Just prior to conversion, the converters are sorted by priority, using
+        a stable sort. This means that converters with the same priority will
+        remain in the same order, with the most recently registered converters
+        appearing first.
+
+        We have tight control over the order of built-in converters, but
+        plugins can register converters in any order. The registration's priority
+        field reasserts some control over the order of converters.
+
+        Plugins can register converters with any priority, to appear before or
+        after the built-ins. For example, a plugin with priority 9 will run
+        before the PlainTextConverter, but after the built-in converters.
+        """
+        self._converters.insert(
+            0, ConverterRegistration(converter=converter, priority=priority)
+        )