Added priority argument to all converter constructors. (#324)
* Added priority argument to all converter constructors.
This commit is contained in:
@@ -47,10 +47,6 @@ from ._exceptions import (
|
|||||||
# Override mimetype for csv to fix issue on windows
|
# Override mimetype for csv to fix issue on windows
|
||||||
mimetypes.add_type("text/csv", ".csv")
|
mimetypes.add_type("text/csv", ".csv")
|
||||||
|
|
||||||
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
|
|
||||||
PRIORITY_GENERIC_FILE_FORMAT = 10.0
|
|
||||||
|
|
||||||
|
|
||||||
_plugins: Union[None | List[Any]] = None
|
_plugins: Union[None | List[Any]] = None
|
||||||
|
|
||||||
|
|
||||||
@@ -123,6 +119,8 @@ class MarkItDown:
|
|||||||
self._llm_model = kwargs.get("llm_model")
|
self._llm_model = kwargs.get("llm_model")
|
||||||
self._exiftool_path = kwargs.get("exiftool_path")
|
self._exiftool_path = kwargs.get("exiftool_path")
|
||||||
self._style_map = kwargs.get("style_map")
|
self._style_map = kwargs.get("style_map")
|
||||||
|
if self._exiftool_path is None:
|
||||||
|
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
|
||||||
|
|
||||||
# Register converters for successful browsing operations
|
# Register converters for successful browsing operations
|
||||||
# Later registrations are tried first / take higher priority than earlier registrations
|
# Later registrations are tried first / take higher priority than earlier registrations
|
||||||
@@ -349,11 +347,10 @@ class MarkItDown:
|
|||||||
_kwargs["_parent_converters"] = self._page_converters
|
_kwargs["_parent_converters"] = self._page_converters
|
||||||
|
|
||||||
# If we hit an error log it and keep trying
|
# If we hit an error log it and keep trying
|
||||||
# try:
|
try:
|
||||||
if True:
|
|
||||||
res = converter.convert(local_path, **_kwargs)
|
res = converter.convert(local_path, **_kwargs)
|
||||||
# except Exception:
|
except Exception:
|
||||||
# error_trace = ("\n\n" + traceback.format_exc()).strip()
|
error_trace = ("\n\n" + traceback.format_exc()).strip()
|
||||||
|
|
||||||
if res is not None:
|
if res is not None:
|
||||||
# Normalize the content
|
# Normalize the content
|
||||||
|
|||||||
@@ -12,7 +12,36 @@ class DocumentConverterResult:
|
|||||||
class DocumentConverter:
|
class DocumentConverter:
|
||||||
"""Abstract superclass of all DocumentConverters."""
|
"""Abstract superclass of all DocumentConverters."""
|
||||||
|
|
||||||
def __init__(self, priority: float = 0.0):
|
# Lower priority values are tried first.
|
||||||
|
PRIORITY_SPECIFIC_FILE_FORMAT = (
|
||||||
|
0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia
|
||||||
|
)
|
||||||
|
PRIORITY_GENERIC_FILE_FORMAT = (
|
||||||
|
10.0 # Near catch-all converters for mimetypes like text/*, etc.
|
||||||
|
)
|
||||||
|
|
||||||
|
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
|
||||||
|
"""
|
||||||
|
Initialize the DocumentConverter with a given priority.
|
||||||
|
|
||||||
|
Priorities work as follows: By default, most converters get priority
|
||||||
|
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
|
||||||
|
is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
|
||||||
|
with lower values being tried first (i.e., higher priority).
|
||||||
|
|
||||||
|
Just prior to conversion, the converters are sorted by priority, using
|
||||||
|
a stable sort. This means that converters with the same priority will
|
||||||
|
remain in the same order, with the most recently registered converters
|
||||||
|
appearing first.
|
||||||
|
|
||||||
|
We have tight control over the order of built-in converters, but
|
||||||
|
plugins can register converters in any order. A converter's priority
|
||||||
|
field reasserts some control over the order of converters.
|
||||||
|
|
||||||
|
Plugins can register converters with any priority, to appear before or
|
||||||
|
after the built-ins. For example, a plugin with priority 9 will run
|
||||||
|
before the PlainTextConverter, but after the built-in converters.
|
||||||
|
"""
|
||||||
self._priority = priority
|
self._priority = priority
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
|
|||||||
@@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
|
|||||||
NOTE: It is better to use the Bing API
|
NOTE: It is better to use the Bing API
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a Bing SERP
|
# Bail if not a Bing SERP
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|||||||
@@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
*,
|
||||||
|
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||||
endpoint: str,
|
endpoint: str,
|
||||||
api_version: str = "2024-07-31-preview",
|
api_version: str = "2024-07-31-preview",
|
||||||
):
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
self.endpoint = endpoint
|
self.endpoint = endpoint
|
||||||
self.api_version = api_version
|
self.api_version = api_version
|
||||||
self.doc_intel_client = DocumentIntelligenceClient(
|
self.doc_intel_client = DocumentIntelligenceClient(
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from ._base import (
|
|||||||
DocumentConverterResult,
|
DocumentConverterResult,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from ._base import DocumentConverter
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
|
||||||
|
|
||||||
@@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
|
|||||||
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a DOCX
|
# Bail if not a DOCX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|||||||
@@ -8,6 +8,11 @@ from ._markdownify import _CustomMarkdownify
|
|||||||
class HtmlConverter(DocumentConverter):
|
class HtmlConverter(DocumentConverter):
|
||||||
"""Anything with content type text/html"""
|
"""Anything with content type text/html"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
|
|
||||||
|
|
||||||
@@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
|
|||||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not an image
|
# Bail if not an image
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|||||||
@@ -12,6 +12,11 @@ from .._exceptions import FileConversionException
|
|||||||
class IpynbConverter(DocumentConverter):
|
class IpynbConverter(DocumentConverter):
|
||||||
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
|
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|||||||
@@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
|
|||||||
Abstract class for multi-modal media (e.g., images and audio)
|
Abstract class for multi-modal media (e.g., images and audio)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def _get_metadata(self, local_path, exiftool_path=None):
|
def _get_metadata(self, local_path, exiftool_path=None):
|
||||||
if not exiftool_path:
|
if not exiftool_path:
|
||||||
which_exiftool = shutil.which("exiftool")
|
which_exiftool = shutil.which("exiftool")
|
||||||
@@ -27,10 +32,10 @@ This warning will be removed in future releases.
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
try:
|
if True:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[exiftool_path, "-json", local_path], capture_output=True, text=True
|
[exiftool_path, "-json", local_path], capture_output=True, text=True
|
||||||
).stdout
|
).stdout
|
||||||
return json.loads(result)[0]
|
return json.loads(result)[0]
|
||||||
except Exception:
|
# except Exception:
|
||||||
return None
|
# return None
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import tempfile
|
import tempfile
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._wav_converter import WavConverter
|
from ._wav_converter import WavConverter
|
||||||
from warnings import resetwarnings, catch_warnings
|
from warnings import resetwarnings, catch_warnings
|
||||||
|
|
||||||
@@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
|
|||||||
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a MP3
|
# Bail if not a MP3
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|||||||
@@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
|
|||||||
- Email body content
|
- Email body content
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|||||||
@@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
|
|||||||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a PDF
|
# Bail if not a PDF
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|||||||
@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
|
|||||||
class PlainTextConverter(DocumentConverter):
|
class PlainTextConverter(DocumentConverter):
|
||||||
"""Anything with content type text/plain"""
|
"""Anything with content type text/plain"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|||||||
@@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
|
|||||||
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def _get_llm_description(
|
def _get_llm_description(
|
||||||
self, llm_client, llm_model, image_blob, content_type, prompt=None
|
self, llm_client, llm_model, image_blob, content_type, prompt=None
|
||||||
):
|
):
|
||||||
|
|||||||
@@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult
|
|||||||
class RssConverter(DocumentConverter):
|
class RssConverter(DocumentConverter):
|
||||||
"""Convert RSS / Atom type to markdown"""
|
"""Convert RSS / Atom type to markdown"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs
|
self, local_path: str, **kwargs
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
from ._base import DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._media_converter import MediaConverter
|
from ._media_converter import MediaConverter
|
||||||
|
|
||||||
# Optional Transcription support
|
# Optional Transcription support
|
||||||
@@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
|
|||||||
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a WAV
|
# Bail if not a WAV
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|||||||
@@ -10,6 +10,11 @@ from ._markdownify import _CustomMarkdownify
|
|||||||
class WikipediaConverter(DocumentConverter):
|
class WikipediaConverter(DocumentConverter):
|
||||||
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ from typing import Union
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from ._base import DocumentConverterResult
|
from ._base import DocumentConverter, DocumentConverterResult
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
|
|
||||||
|
|
||||||
@@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
|
|||||||
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
# Bail if not a XLSX
|
# Bail if not a XLSX
|
||||||
extension = kwargs.get("file_extension", "")
|
extension = kwargs.get("file_extension", "")
|
||||||
|
|||||||
@@ -19,6 +19,11 @@ except ModuleNotFoundError:
|
|||||||
class YouTubeConverter(DocumentConverter):
|
class YouTubeConverter(DocumentConverter):
|
||||||
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|||||||
@@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
|
|||||||
- Cleans up temporary files after processing
|
- Cleans up temporary files after processing
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||||
|
):
|
||||||
|
super().__init__(priority=priority)
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, local_path: str, **kwargs: Any
|
self, local_path: str, **kwargs: Any
|
||||||
) -> Union[None, DocumentConverterResult]:
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
|||||||
@@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
# test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
# test_markitdown_local()
|
test_markitdown_local()
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
# test_markitdown_deprecation()
|
|
||||||
# test_markitdown_llm()
|
# test_markitdown_llm()
|
||||||
|
print("All tests passed!")
|
||||||
|
|||||||
Reference in New Issue
Block a user