Added priority argument to all converter constructors. (#324)

* Added priority argument to all converter constructors.
This commit is contained in:
afourney
2025-02-11 12:36:32 -08:00
committed by GitHub
parent 5ce85c236c
commit 935da9976c
21 changed files with 135 additions and 19 deletions

View File

@@ -47,10 +47,6 @@ from ._exceptions import (
# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
PRIORITY_GENERIC_FILE_FORMAT = 10.0
_plugins: Union[None | List[Any]] = None
@@ -123,6 +119,8 @@ class MarkItDown:
self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")
if self._exiftool_path is None:
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
@@ -349,11 +347,10 @@ class MarkItDown:
_kwargs["_parent_converters"] = self._page_converters
# If we hit an error log it and keep trying
# try:
if True:
try:
res = converter.convert(local_path, **_kwargs)
# except Exception:
# error_trace = ("\n\n" + traceback.format_exc()).strip()
except Exception:
error_trace = ("\n\n" + traceback.format_exc()).strip()
if res is not None:
# Normalize the content

View File

@@ -12,7 +12,36 @@ class DocumentConverterResult:
class DocumentConverter:
"""Abstract superclass of all DocumentConverters."""
def __init__(self, priority: float = 0.0):
# Lower priority values are tried first.
PRIORITY_SPECIFIC_FILE_FORMAT = (
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
)
PRIORITY_GENERIC_FILE_FORMAT = (
10.0 # Near catch-all converters for mimetypes like text/*, etc.
)
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
"""
Initialize the DocumentConverter with a given priority.
Priorities work as follows: By default, most converters get priority
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
with lower values being tried first (i.e., higher priority).
Just prior to conversion, the converters are sorted by priority, using
a stable sort. This means that converters with the same priority will
remain in the same order, with the most recently registered converters
appearing first.
We have tight control over the order of built-in converters, but
plugins can register converters in any order. A converter's priority
field reasserts some control over the order of converters.
Plugins can register converters with any priority, to appear before or
after the built-ins. For example, a plugin with priority 9 will run
before the PlainTextConverter, but after the built-in converters.
"""
self._priority = priority
def convert(

View File

@@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
NOTE: It is better to use the Bing API
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP
extension = kwargs.get("file_extension", "")

View File

@@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):
def __init__(
self,
*,
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
endpoint: str,
api_version: str = "2024-07-31-preview",
):
super().__init__(priority=priority)
self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(

View File

@@ -6,6 +6,7 @@ from ._base import (
DocumentConverterResult,
)
from ._base import DocumentConverter
from ._html_converter import HtmlConverter
@@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")

View File

@@ -8,6 +8,11 @@ from ._markdownify import _CustomMarkdownify
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View File

@@ -1,5 +1,5 @@
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter
@@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an image
extension = kwargs.get("file_extension", "")

View File

@@ -12,6 +12,11 @@ from .._exceptions import FileConversionException
class IpynbConverter(DocumentConverter):
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View File

@@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
Abstract class for multi-modal media (e.g., images and audio)
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)
def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
@@ -27,10 +32,10 @@ This warning will be removed in future releases.
return None
else:
try:
if True:
result = subprocess.run(
[exiftool_path, "-json", local_path], capture_output=True, text=True
).stdout
return json.loads(result)[0]
except Exception:
return None
# except Exception:
# return None

View File

@@ -1,6 +1,6 @@
import tempfile
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings
@@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a MP3
extension = kwargs.get("file_extension", "")

View File

@@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
- Email body content
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View File

@@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF
extension = kwargs.get("file_extension", "")

View File

@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View File

@@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
Converts PPTX files to Markdown. Supports headings, tables and images with alt text.
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
):

View File

@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:

View File

@@ -1,5 +1,5 @@
from typing import Union
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._media_converter import MediaConverter
# Optional Transcription support
@@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV
extension = kwargs.get("file_extension", "")

View File

@@ -10,6 +10,11 @@ from ._markdownify import _CustomMarkdownify
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View File

@@ -2,7 +2,7 @@ from typing import Union
import pandas as pd
from ._base import DocumentConverterResult
from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
@@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")

View File

@@ -19,6 +19,11 @@ except ModuleNotFoundError:
class YouTubeConverter(DocumentConverter):
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View File

@@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
- Cleans up temporary files after processing
"""
def __init__(
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
):
super().__init__(priority=priority)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:

View File

@@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:
if __name__ == "__main__":
"""Runs this file's tests from the command line."""
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_remote()
test_markitdown_local()
test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm()
print("All tests passed!")