Added priority argument to all converter constructors. (#324)

* Added priority argument to all converter constructors.
2025-02-11 12:36:32 -08:00
parent 5ce85c236c
commit 935da9976c
21 changed files with 135 additions and 19 deletions
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -47,10 +47,6 @@ from ._exceptions import (
 # Override mimetype for csv to fix issue on windows
 mimetypes.add_type("text/csv", ".csv")
 PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
 PRIORITY_GENERIC_FILE_FORMAT = 10.0
 _plugins: Union[None | List[Any]] = None
@@ -123,6 +119,8 @@ class MarkItDown:
            self._llm_model = kwargs.get("llm_model")
            self._exiftool_path = kwargs.get("exiftool_path")
            self._style_map = kwargs.get("style_map")
            if self._exiftool_path is None:
                self._exiftool_path = os.getenv("EXIFTOOL_PATH")
            # Register converters for successful browsing operations
            # Later registrations are tried first / take higher priority than earlier registrations
@@ -349,11 +347,10 @@ class MarkItDown:
                _kwargs["_parent_converters"] = self._page_converters
                # If we hit an error log it and keep trying
-                # try:
+                try:
                if True:
                    res = converter.convert(local_path, **_kwargs)
-                # except Exception:
+                except Exception:
-                #    error_trace = ("\n\n" + traceback.format_exc()).strip()
+                    error_trace = ("\n\n" + traceback.format_exc()).strip()
                if res is not None:
                    # Normalize the content
--- a/packages/markitdown/src/markitdown/converters/_base.py
+++ b/packages/markitdown/src/markitdown/converters/_base.py
@@ -12,7 +12,36 @@ class DocumentConverterResult:
 class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""
-    def __init__(self, priority: float = 0.0):
+    # Lower priority values are tried first.
    PRIORITY_SPECIFIC_FILE_FORMAT = (
        0.0  # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
    )
    PRIORITY_GENERIC_FILE_FORMAT = (
        10.0  # Near catch-all converters for mimetypes like text/*, etc.
    )
    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
        """
        Initialize the DocumentConverter with a given priority.

        Priorities work as follows: By default, most converters get priority
        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions
        are the near catch-all converters (e.g., the PlainTextConverter), which
        get priority DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT (== 10).
        Lower values are tried first (i.e., they have higher priority).

        Just prior to conversion, the converters are sorted by priority, using
        a stable sort. This means that converters with the same priority will
        remain in the same order, with the most recently registered converters
        appearing first.

        We have tight control over the order of built-in converters, but
        plugins can register converters in any order. A converter's priority
        field reasserts some control over the order of converters.

        Plugins can register converters with any priority, to appear before or
        after the built-ins. For example, a plugin with priority 9 will run
        before the PlainTextConverter, but after the built-in converters
        registered with priority 0.

        Args:
            priority: Sort key for this converter; lower values are tried
                first. Defaults to PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        # Stored as-is; no validation or coercion is performed here.
        self._priority = priority
    def convert(
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
    NOTE: It is better to use the Bing API
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create a BingSerpConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a Bing SERP
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):
    def __init__(
        self,
        *,
        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
    ):
        super().__init__(priority=priority)
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -6,6 +6,7 @@ from ._base import (
    DocumentConverterResult,
 )
 from ._base import DocumentConverter
 from ._html_converter import HtmlConverter
@@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create a DocxConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                (HtmlConverter) constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, overriding the
                generic default that HtmlConverter itself uses.
        """
        super().__init__(priority=priority)
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a DOCX
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -8,6 +8,11 @@ from ._markdownify import _CustomMarkdownify
 class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        """
        Create an HtmlConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -1,5 +1,5 @@
 from typing import Union
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter
@@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create an ImageConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                (MediaConverter) constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not an image
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -12,6 +12,11 @@ from .._exceptions import FileConversionException
 class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create an IpynbConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_media_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_media_converter.py
@@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
    Abstract class for multi-modal media (e.g., images and audio)
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        """
        Create a MediaConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def _get_metadata(self, local_path, exiftool_path=None):
        if not exiftool_path:
            which_exiftool = shutil.which("exiftool")
@@ -27,10 +32,10 @@ This warning will be removed in future releases.
            return None
        else:
-            try:
+            if True:
                result = subprocess.run(
                    [exiftool_path, "-json", local_path], capture_output=True, text=True
                ).stdout
                return json.loads(result)[0]
-            except Exception:
+            # except Exception:
-                return None
+            #    return None
--- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@@ -1,6 +1,6 @@
 import tempfile
 from typing import Union
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._wav_converter import WavConverter
 from warnings import resetwarnings, catch_warnings
@@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create an Mp3Converter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                (WavConverter) constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a MP3
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
    - Email body content
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create an OutlookMsgConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create a PdfConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a PDF
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
    ):
        """
        Create a PlainTextConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create a PptxConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                (HtmlConverter) constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, overriding the
                generic default that HtmlConverter itself uses.
        """
        super().__init__(priority=priority)
    def _get_llm_description(
        self, llm_client, llm_model, image_blob, content_type, prompt=None
    ):
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
 class RssConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create an RssConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(
        self, local_path: str, **kwargs
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_wav_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py
@@ -1,5 +1,5 @@
 from typing import Union
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter
 # Optional Transcription support
@@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create a WavConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                (MediaConverter) constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a WAV
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@@ -10,6 +10,11 @@ from ._markdownify import _CustomMarkdownify
 class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create a WikipediaConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -2,7 +2,7 @@ from typing import Union
 import pandas as pd
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
@@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create an XlsxConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                (HtmlConverter) constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, overriding the
                generic default that HtmlConverter itself uses.
        """
        super().__init__(priority=priority)
    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        # Bail if not a XLSX
        extension = kwargs.get("file_extension", "")
--- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -19,6 +19,11 @@ except ModuleNotFoundError:
 class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create a YouTubeConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/src/markitdown/converters/_zip_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
    - Cleans up temporary files after processing
    """
    def __init__(
        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
    ):
        """
        Create a ZipConverter.

        Args:
            priority: Converter priority, forwarded unchanged to the parent
                constructor. Defaults to
                DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT.
        """
        super().__init__(priority=priority)
    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
--- a/packages/markitdown/tests/test_markitdown.py
+++ b/packages/markitdown/tests/test_markitdown.py
@@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:
 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
-    # test_markitdown_remote()
+    test_markitdown_remote()
-    # test_markitdown_local()
+    test_markitdown_local()
    test_markitdown_exiftool()
    # test_markitdown_deprecation()
    # test_markitdown_llm()
    print("All tests passed!")