Added priority argument to all converter constructors. (#324)
* Added priority argument to all converter constructors.
This commit is contained in:
@@ -47,10 +47,6 @@ from ._exceptions import (
|
||||
# Override mimetype for csv to fix issue on windows
|
||||
mimetypes.add_type("text/csv", ".csv")
|
||||
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
|
||||
PRIORITY_GENERIC_FILE_FORMAT = 10.0
|
||||
|
||||
|
||||
_plugins: Union[None | List[Any]] = None
|
||||
|
||||
|
||||
@@ -123,6 +119,8 @@ class MarkItDown:
|
||||
self._llm_model = kwargs.get("llm_model")
|
||||
self._exiftool_path = kwargs.get("exiftool_path")
|
||||
self._style_map = kwargs.get("style_map")
|
||||
if self._exiftool_path is None:
|
||||
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
|
||||
|
||||
# Register converters for successful browsing operations
|
||||
# Later registrations are tried first / take higher priority than earlier registrations
|
||||
@@ -349,11 +347,10 @@ class MarkItDown:
|
||||
_kwargs["_parent_converters"] = self._page_converters
|
||||
|
||||
# If we hit an error log it and keep trying
|
||||
# try:
|
||||
if True:
|
||||
try:
|
||||
res = converter.convert(local_path, **_kwargs)
|
||||
# except Exception:
|
||||
# error_trace = ("\n\n" + traceback.format_exc()).strip()
|
||||
except Exception:
|
||||
error_trace = ("\n\n" + traceback.format_exc()).strip()
|
||||
|
||||
if res is not None:
|
||||
# Normalize the content
|
||||
|
||||
@@ -12,7 +12,36 @@ class DocumentConverterResult:
|
||||
class DocumentConverter:
|
||||
"""Abstract superclass of all DocumentConverters."""
|
||||
|
||||
def __init__(self, priority: float = 0.0):
|
||||
# Lower priority values are tried first.
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT = (
|
||||
0.0 # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., Wikipedia
|
||||
)
|
||||
PRIORITY_GENERIC_FILE_FORMAT = (
|
||||
10.0 # Near catch-all converters for mimetypes like text/*, etc.
|
||||
)
|
||||
|
||||
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
|
||||
"""
|
||||
Initialize the DocumentConverter with a given priority.
|
||||
|
||||
Priorities work as follows: By default, most converters get priority
|
||||
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
|
||||
is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
|
||||
with lower values being tried first (i.e., higher priority).
|
||||
|
||||
Just prior to conversion, the converters are sorted by priority, using
|
||||
a stable sort. This means that converters with the same priority will
|
||||
remain in the same order, with the most recently registered converters
|
||||
appearing first.
|
||||
|
||||
We have tight control over the order of built-in converters, but
|
||||
plugins can register converters in any order. A converter's priority
|
||||
field reasserts some control over the order of converters.
|
||||
|
||||
Plugins can register converters with any priority, to appear before or
|
||||
after the built-ins. For example, a plugin with priority 9 will run
|
||||
before the PlainTextConverter, but after the built-in converters.
|
||||
"""
|
||||
self._priority = priority
|
||||
|
||||
def convert(
|
||||
|
||||
@@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
|
||||
NOTE: It is better to use the Bing API
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a Bing SERP
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
||||
@@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||
endpoint: str,
|
||||
api_version: str = "2024-07-31-preview",
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
self.endpoint = endpoint
|
||||
self.api_version = api_version
|
||||
self.doc_intel_client = DocumentIntelligenceClient(
|
||||
|
||||
@@ -6,6 +6,7 @@ from ._base import (
|
||||
DocumentConverterResult,
|
||||
)
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
|
||||
|
||||
@@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
|
||||
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a DOCX
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
||||
@@ -8,6 +8,11 @@ from ._markdownify import _CustomMarkdownify
|
||||
class HtmlConverter(DocumentConverter):
|
||||
"""Anything with content type text/html"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from typing import Union
|
||||
from ._base import DocumentConverterResult
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._media_converter import MediaConverter
|
||||
|
||||
|
||||
@@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
|
||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not an image
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
||||
@@ -12,6 +12,11 @@ from .._exceptions import FileConversionException
|
||||
class IpynbConverter(DocumentConverter):
|
||||
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
|
||||
@@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
|
||||
Abstract class for multi-modal media (e.g., images and audio)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def _get_metadata(self, local_path, exiftool_path=None):
|
||||
if not exiftool_path:
|
||||
which_exiftool = shutil.which("exiftool")
|
||||
@@ -27,10 +32,10 @@ This warning will be removed in future releases.
|
||||
|
||||
return None
|
||||
else:
|
||||
try:
|
||||
if True:
|
||||
result = subprocess.run(
|
||||
[exiftool_path, "-json", local_path], capture_output=True, text=True
|
||||
).stdout
|
||||
return json.loads(result)[0]
|
||||
except Exception:
|
||||
return None
|
||||
# except Exception:
|
||||
# return None
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import tempfile
|
||||
from typing import Union
|
||||
from ._base import DocumentConverterResult
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._wav_converter import WavConverter
|
||||
from warnings import resetwarnings, catch_warnings
|
||||
|
||||
@@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
|
||||
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not an MP3
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
||||
@@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
- Email body content
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
|
||||
@@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
|
||||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a PDF
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
||||
@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
|
||||
class PlainTextConverter(DocumentConverter):
|
||||
"""Anything with content type text/plain"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
|
||||
@@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
|
||||
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def _get_llm_description(
|
||||
self, llm_client, llm_model, image_blob, content_type, prompt=None
|
||||
):
|
||||
|
||||
@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
|
||||
class RssConverter(DocumentConverter):
|
||||
"""Convert RSS / Atom type to markdown"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from typing import Union
|
||||
from ._base import DocumentConverterResult
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._media_converter import MediaConverter
|
||||
|
||||
# Optional Transcription support
|
||||
@@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
|
||||
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not a WAV
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
||||
@@ -10,6 +10,11 @@ from ._markdownify import _CustomMarkdownify
|
||||
class WikipediaConverter(DocumentConverter):
|
||||
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
|
||||
@@ -2,7 +2,7 @@ from typing import Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from ._base import DocumentConverterResult
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._html_converter import HtmlConverter
|
||||
|
||||
|
||||
@@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
|
||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||
# Bail if not an XLSX
|
||||
extension = kwargs.get("file_extension", "")
|
||||
|
||||
@@ -19,6 +19,11 @@ except ModuleNotFoundError:
|
||||
class YouTubeConverter(DocumentConverter):
|
||||
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
|
||||
@@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
|
||||
- Cleans up temporary files after processing
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def convert(
|
||||
self, local_path: str, **kwargs: Any
|
||||
) -> Union[None, DocumentConverterResult]:
|
||||
|
||||
@@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""Runs this file's tests from the command line."""
|
||||
# test_markitdown_remote()
|
||||
# test_markitdown_local()
|
||||
test_markitdown_remote()
|
||||
test_markitdown_local()
|
||||
test_markitdown_exiftool()
|
||||
# test_markitdown_deprecation()
|
||||
# test_markitdown_llm()
|
||||
print("All tests passed!")
|
||||
|
||||
Reference in New Issue
Block a user