Added priority argument to all converter constructors. (#324)

* Added priority argument to all converter constructors.
This commit is contained in:
afourney
2025-02-11 12:36:32 -08:00
committed by GitHub
parent 5ce85c236c
commit 935da9976c
21 changed files with 135 additions and 19 deletions

View File

@@ -47,10 +47,6 @@ from ._exceptions import (
# Override mimetype for csv to fix issue on windows # Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv") mimetypes.add_type("text/csv", ".csv")
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
PRIORITY_GENERIC_FILE_FORMAT = 10.0
_plugins: Union[None | List[Any]] = None _plugins: Union[None | List[Any]] = None
@@ -123,6 +119,8 @@ class MarkItDown:
self._llm_model = kwargs.get("llm_model") self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path") self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map") self._style_map = kwargs.get("style_map")
if self._exiftool_path is None:
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
# Register converters for successful browsing operations # Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations # Later registrations are tried first / take higher priority than earlier registrations
@@ -349,11 +347,10 @@ class MarkItDown:
_kwargs["_parent_converters"] = self._page_converters _kwargs["_parent_converters"] = self._page_converters
# If we hit an error log it and keep trying # If we hit an error log it and keep trying
# try: try:
if True:
res = converter.convert(local_path, **_kwargs) res = converter.convert(local_path, **_kwargs)
# except Exception: except Exception:
# error_trace = ("\n\n" + traceback.format_exc()).strip() error_trace = ("\n\n" + traceback.format_exc()).strip()
if res is not None: if res is not None:
# Normalize the content # Normalize the content

View File

@@ -12,7 +12,36 @@ class DocumentConverterResult:
class DocumentConverter: class DocumentConverter:
"""Abstract superclass of all DocumentConverters.""" """Abstract superclass of all DocumentConverters."""
def __init__(self, priority: float = 0.0): # Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
)
PRIORITY_GENERIC_FILE_FORMAT = (
10.0 # Near catch-all converters for mimetypes like text/*, etc.
)
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
"""
Initialize the DocumentConverter with a given priority.
Priorities work as follows: By default, most converters get priority
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
with lower values being tried first (i.e., higher priority).
Just prior to conversion, the converters are sorted by priority, using
a stable sort. This means that converters with the same priority will
remain in the same order, with the most recently registered converters
appearing first.
We have tight control over the order of built-in converters, but
plugins can register converters in any order. A converter's priority
field reasserts some control over the order of converters.
Plugins can register converters with any priority, to appear before or
after the built-ins. For example, a plugin with priority 9 will run
before the PlainTextConverter, but after the built-in converters.
"""
self._priority = priority self._priority = priority
def convert( def convert(

View File

@@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
NOTE: It is better to use the Bing API NOTE: It is better to use the Bing API
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP # Bail if not a Bing SERP
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")

View File

@@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):
def __init__( def __init__(
self, self,
*,
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
endpoint: str, endpoint: str,
api_version: str = "2024-07-31-preview", api_version: str = "2024-07-31-preview",
): ):
super().__init__(priority=priority)
self.endpoint = endpoint self.endpoint = endpoint
self.api_version = api_version self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient( self.doc_intel_client = DocumentIntelligenceClient(

View File

@@ -6,6 +6,7 @@ from ._base import (
DocumentConverterResult, DocumentConverterResult,
) )
from ._base import DocumentConverter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
@@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible. Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX # Bail if not a DOCX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")

View File

@@ -8,6 +8,11 @@ from ._markdownify import _CustomMarkdownify
class HtmlConverter(DocumentConverter): class HtmlConverter(DocumentConverter):
"""Anything with content type text/html""" """Anything with content type text/html"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:

View File

@@ -1,5 +1,5 @@
from typing import Union from typing import Union
from ._base import DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter from ._media_converter import MediaConverter
@@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an image # Bail if not an image
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")

View File

@@ -12,6 +12,11 @@ from .._exceptions import FileConversionException
class IpynbConverter(DocumentConverter): class IpynbConverter(DocumentConverter):
"""Converts Jupyter Notebook (.ipynb) files to Markdown.""" """Converts Jupyter Notebook (.ipynb) files to Markdown."""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:

View File

@@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
Abstract class for multi-modal media (e.g., images and audio) Abstract class for multi-modal media (e.g., images and audio)
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)
def _get_metadata(self, local_path, exiftool_path=None): def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path: if not exiftool_path:
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")
@@ -27,10 +32,10 @@ This warning will be removed in future releases.
return None return None
else: else:
try: if True:
result = subprocess.run( result = subprocess.run(
[exiftool_path, "-json", local_path], capture_output=True, text=True [exiftool_path, "-json", local_path], capture_output=True, text=True
).stdout ).stdout
return json.loads(result)[0] return json.loads(result)[0]
except Exception: # except Exception:
return None # return None

View File

@@ -1,6 +1,6 @@
import tempfile import tempfile
from typing import Union from typing import Union
from ._base import DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings from warnings import resetwarnings, catch_warnings
@@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a MP3 # Bail if not a MP3
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")

View File

@@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
- Email body content - Email body content
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:

View File

@@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF # Bail if not a PDF
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")

View File

@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
class PlainTextConverter(DocumentConverter): class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain""" """Anything with content type text/plain"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:

View File

@@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
Converts PPTX files to Markdown. Supports heading, tables and images with alt text. Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def _get_llm_description( def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None self, llm_client, llm_model, image_blob, content_type, prompt=None
): ):

View File

@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
class RssConverter(DocumentConverter): class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown""" """Convert RSS / Atom type to markdown"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:

View File

@@ -1,5 +1,5 @@
from typing import Union from typing import Union
from ._base import DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter from ._media_converter import MediaConverter
# Optional Transcription support # Optional Transcription support
@@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV # Bail if not a WAV
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")

View File

@@ -10,6 +10,11 @@ from ._markdownify import _CustomMarkdownify
class WikipediaConverter(DocumentConverter): class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content.""" """Handle Wikipedia pages separately, focusing only on the main document content."""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:

View File

@@ -2,7 +2,7 @@ from typing import Union
import pandas as pd import pandas as pd
from ._base import DocumentConverterResult from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
@@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX # Bail if not a XLSX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")

View File

@@ -19,6 +19,11 @@ except ModuleNotFoundError:
class YouTubeConverter(DocumentConverter): class YouTubeConverter(DocumentConverter):
"""Handle YouTube specially, focusing on the video title, description, and transcript.""" """Handle YouTube specially, focusing on the video title, description, and transcript."""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:

View File

@@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
- Cleans up temporary files after processing - Cleans up temporary files after processing
""" """
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert( def convert(
self, local_path: str, **kwargs: Any self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]: ) -> Union[None, DocumentConverterResult]:

View File

@@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:
if __name__ == "__main__": if __name__ == "__main__":
"""Runs this file's tests from the command line.""" """Runs this file's tests from the command line."""
# test_markitdown_remote() test_markitdown_remote()
# test_markitdown_local() test_markitdown_local()
test_markitdown_exiftool() test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm() # test_markitdown_llm()
print("All tests passed!")