diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index b7ac5bc..297f554 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -47,10 +47,6 @@ from ._exceptions import ( # Override mimetype for csv to fix issue on windows mimetypes.add_type("text/csv", ".csv") -PRIORITY_SPECIFIC_FILE_FORMAT = 0.0 -PRIORITY_GENERIC_FILE_FORMAT = 10.0 - - _plugins: Union[None | List[Any]] = None @@ -123,6 +119,8 @@ class MarkItDown: self._llm_model = kwargs.get("llm_model") self._exiftool_path = kwargs.get("exiftool_path") self._style_map = kwargs.get("style_map") + if self._exiftool_path is None: + self._exiftool_path = os.getenv("EXIFTOOL_PATH") # Register converters for successful browsing operations # Later registrations are tried first / take higher priority than earlier registrations @@ -349,11 +347,10 @@ class MarkItDown: _kwargs["_parent_converters"] = self._page_converters # If we hit an error log it and keep trying - # try: - if True: + try: res = converter.convert(local_path, **_kwargs) - # except Exception: - # error_trace = ("\n\n" + traceback.format_exc()).strip() + except Exception: + error_trace = ("\n\n" + traceback.format_exc()).strip() if res is not None: # Normalize the content diff --git a/packages/markitdown/src/markitdown/converters/_base.py b/packages/markitdown/src/markitdown/converters/_base.py index 6d0a5a4..3947797 100644 --- a/packages/markitdown/src/markitdown/converters/_base.py +++ b/packages/markitdown/src/markitdown/converters/_base.py @@ -12,7 +12,36 @@ class DocumentConverterResult: class DocumentConverter: """Abstract superclass of all DocumentConverters.""" - def __init__(self, priority: float = 0.0): + # Lower priority values are tried first. 
+ PRIORITY_SPECIFIC_FILE_FORMAT = ( + 0.0 # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., wikipedia + ) + PRIORITY_GENERIC_FILE_FORMAT = ( + 10.0 # Near catch-all converters for mimetypes like text/*, etc. + ) + + def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT): + """ + Initialize the DocumentConverter with a given priority. + + Priorities work as follows: By default, most converters get priority + DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception + is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10), + with lower values being tried first (i.e., higher priority). + + Just prior to conversion, the converters are sorted by priority, using + a stable sort. This means that converters with the same priority will + remain in the same order, with the most recently registered converters + appearing first. + + We have tight control over the order of built-in converters, but + plugins can register converters in any order. A converter's priority + field reasserts some control over the order of converters. + + Plugins can register converters with any priority, to appear before or + after the built-ins. For example, a plugin with priority 9 will run + before the PlainTextConverter, but after the built-in converters. 
+ """ self._priority = priority def convert( diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index b903724..d1b11a6 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter): NOTE: It is better to use the Bing API """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a Bing SERP extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 94acc9f..835345a 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter): def __init__( self, + *, + priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT, endpoint: str, api_version: str = "2024-07-31-preview", ): + super().__init__(priority=priority) + self.endpoint = endpoint self.api_version = api_version self.doc_intel_client = DocumentIntelligenceClient( diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index fb61cca..8515f6d 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -6,6 +6,7 @@ from ._base import ( DocumentConverterResult, ) +from ._base import DocumentConverter from ._html_converter import HtmlConverter @@ -14,6 +15,11 @@ class 
DocxConverter(HtmlConverter): Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a DOCX extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index ae7259e..68c2536 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -8,6 +8,11 @@ from ._markdownify import _CustomMarkdownify class HtmlConverter(DocumentConverter): """Anything with content type text/html""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py index f3dee6b..a46b67c 100644 --- a/packages/markitdown/src/markitdown/converters/_image_converter.py +++ b/packages/markitdown/src/markitdown/converters/_image_converter.py @@ -1,5 +1,5 @@ from typing import Union -from ._base import DocumentConverterResult +from ._base import DocumentConverter, DocumentConverterResult from ._media_converter import MediaConverter @@ -8,6 +8,11 @@ class ImageConverter(MediaConverter): Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). 
""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not an image extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py index cdeb478..b487f41 100644 --- a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py +++ b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py @@ -12,6 +12,11 @@ from .._exceptions import FileConversionException class IpynbConverter(DocumentConverter): """Converts Jupyter Notebook (.ipynb) files to Markdown.""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_media_converter.py b/packages/markitdown/src/markitdown/converters/_media_converter.py index 07d2bde..5c7d82b 100644 --- a/packages/markitdown/src/markitdown/converters/_media_converter.py +++ b/packages/markitdown/src/markitdown/converters/_media_converter.py @@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter): Abstract class for multi-modal media (e.g., images and audio) """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def _get_metadata(self, local_path, exiftool_path=None): if not exiftool_path: which_exiftool = shutil.which("exiftool") @@ -27,10 +32,10 @@ This warning will be removed in future releases. 
return None else: - try: + if True: result = subprocess.run( [exiftool_path, "-json", local_path], capture_output=True, text=True ).stdout return json.loads(result)[0] - except Exception: - return None + # except Exception: + # return None diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py index 6b2786b..91fd270 100644 --- a/packages/markitdown/src/markitdown/converters/_mp3_converter.py +++ b/packages/markitdown/src/markitdown/converters/_mp3_converter.py @@ -1,6 +1,6 @@ import tempfile from typing import Union -from ._base import DocumentConverterResult +from ._base import DocumentConverter, DocumentConverterResult from ._wav_converter import WavConverter from warnings import resetwarnings, catch_warnings @@ -28,6 +28,11 @@ class Mp3Converter(WavConverter): Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). 
""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a MP3 extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py index e83001c..6764fc5 100644 --- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py +++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py @@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter): - Email body content """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index dcffc62..3a2b671 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter): Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. 
""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a PDF extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py index 2912d24..75f74a8 100644 --- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py +++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py @@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult class PlainTextConverter(DocumentConverter): """Anything with content type text/plain""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index a48880a..afb37a0 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter): Converts PPTX files to Markdown. Supports heading, tables and images with alt text. 
""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def _get_llm_description( self, llm_client, llm_model, image_blob, content_type, prompt=None ): diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index eb2f09c..b279c85 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -9,6 +9,11 @@ from ._base import DocumentConverter, DocumentConverterResult class RssConverter(DocumentConverter): """Convert RSS / Atom type to markdown""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py b/packages/markitdown/src/markitdown/converters/_wav_converter.py index 6fc8932..3c8d842 100644 --- a/packages/markitdown/src/markitdown/converters/_wav_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wav_converter.py @@ -1,5 +1,5 @@ from typing import Union -from ._base import DocumentConverterResult +from ._base import DocumentConverter, DocumentConverterResult from ._media_converter import MediaConverter # Optional Transcription support @@ -17,6 +17,11 @@ class WavConverter(MediaConverter): Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). 
""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a WAV extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index 4097ef0..f27fe23 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -10,6 +10,11 @@ from ._markdownify import _CustomMarkdownify class WikipediaConverter(DocumentConverter): """Handle Wikipedia pages separately, focusing only on the main document content.""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 683349c..2bdfd5d 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -2,7 +2,7 @@ from typing import Union import pandas as pd -from ._base import DocumentConverterResult +from ._base import DocumentConverter, DocumentConverterResult from ._html_converter import HtmlConverter @@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter): Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. 
""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Bail if not a XLSX extension = kwargs.get("file_extension", "") diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py index fe198e8..b961b88 100644 --- a/packages/markitdown/src/markitdown/converters/_youtube_converter.py +++ b/packages/markitdown/src/markitdown/converters/_youtube_converter.py @@ -19,6 +19,11 @@ except ModuleNotFoundError: class YouTubeConverter(DocumentConverter): """Handle YouTube specially, focusing on the video title, description, and transcript.""" + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index 918c357..026900d 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter): - Cleans up temporary files after processing """ + def __init__( + self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT + ): + super().__init__(priority=priority) + def convert( self, local_path: str, **kwargs: Any ) -> Union[None, DocumentConverterResult]: diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py index be71722..efd45ac 100644 --- a/packages/markitdown/tests/test_markitdown.py +++ b/packages/markitdown/tests/test_markitdown.py @@ -327,8 +327,8 @@ def test_markitdown_llm() -> None: if __name__ == "__main__": """Runs 
this file's tests from the command line.""" - # test_markitdown_remote() - # test_markitdown_local() + test_markitdown_remote() + test_markitdown_local() test_markitdown_exiftool() - # test_markitdown_deprecation() # test_markitdown_llm() + print("All tests passed!")