Exploring ways to allow Optional dependencies (#1079)

* Enable optional dependencies. Starting with pptx.
* Fix CLI tests... have them install [all]
* Added .docx to optional dependencies
* Reuse error messages for missing dependencies.
* Added xlsx and xls
* Added pdfs
* Added Ole files.
* Updated READMEs, and finished remaining feature-categories.
* Move OpenAI to hatch-test environment.
2025-03-03 09:06:19 -08:00
parent f01c6c5277
commit c5cd659f63
14 changed files with 254 additions and 45 deletions
--- a/packages/markitdown/README.md
+++ b/packages/markitdown/README.md
@@ -10,7 +10,7 @@
 From PyPI:

 ```bash
-pip install markitdown
+pip install markitdown[all]
 ```

 From source:
@@ -18,7 +18,7 @@ From source:
 ```bash
 git clone git@github.com:microsoft/markitdown.git
 cd markitdown
-pip install -e packages/markitdown
+pip install -e packages/markitdown[all]
 ```

 ## Usage
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -26,25 +26,36 @@ classifiers = [
 dependencies = [
  "beautifulsoup4",
  "requests",
-  "mammoth",
  "markdownify~=0.14.1",
-  "numpy",
+  "puremagic",
+  "pathvalidate",
+  "charset-normalizer",
+]
+
+[project.optional-dependencies]
+all = [
  "python-pptx",
+  "mammoth",
  "pandas",
  "openpyxl",
  "xlrd",
  "pdfminer.six",
-  "puremagic",
-  "pydub",
  "olefile",
-  "youtube-transcript-api",
+  "pydub",
  "SpeechRecognition",
-  "pathvalidate",
-  "charset-normalizer",
-  "openai",
+  "youtube-transcript-api",
  "azure-ai-documentintelligence",
  "azure-identity"
 ]
+pptx = ["python-pptx"]
+docx = ["mammoth"]
+xlsx = ["pandas", "openpyxl"]
+xls = ["pandas", "xlrd"]
+pdf = ["pdfminer.six"]
+outlook = ["olefile"]
+audio-transcription = ["pydub", "SpeechRecognition"]
+youtube-transcription = ["youtube-transcript-api"]
+az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]

 [project.urls]
 Documentation = "https://github.com/microsoft/markitdown#readme"
@@ -57,6 +68,15 @@ path = "src/markitdown/__about__.py"
 [project.scripts]
 markitdown = "markitdown.__main__:main"

+[tool.hatch.envs.default]
+features = ["all"]
+
+[tool.hatch.envs.hatch-test]
+features = ["all"]
+extra-dependencies = [
+  "openai",
+]
+
 [tool.hatch.envs.types]
 extra-dependencies = [
  "mypy>=1.0.0",
--- a/packages/markitdown/src/markitdown/__init__.py
+++ b/packages/markitdown/src/markitdown/__init__.py
@@ -6,7 +6,7 @@ from .__about__ import __version__
 from ._markitdown import MarkItDown
 from ._exceptions import (
    MarkItDownException,
-    ConverterPrerequisiteException,
+    MissingDependencyException,
    FailedConversionAttempt,
    FileConversionException,
    UnsupportedFormatException,
@@ -19,7 +19,7 @@ __all__ = [
    "DocumentConverter",
    "DocumentConverterResult",
    "MarkItDownException",
-    "ConverterPrerequisiteException",
+    "MissingDependencyException",
    "FailedConversionAttempt",
    "FileConversionException",
    "UnsupportedFormatException",
--- a/packages/markitdown/src/markitdown/_exceptions.py
+++ b/packages/markitdown/src/markitdown/_exceptions.py
@@ -1,5 +1,12 @@
 from typing import Optional, List, Any

+MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential {extension} file, but the dependencies needed to read {extension} files have not been installed. To resolve this error, include the optional dependency [{feature}] or [all] when installing MarkItDown. For example:
+
+* pip install markitdown[{feature}]
+* pip install markitdown[all]
+* pip install markitdown[{feature}, ...]
+* etc."""
+

 class MarkItDownException(Exception):
    """
@@ -9,15 +16,16 @@ class MarkItDownException(Exception):
    pass


-class ConverterPrerequisiteException(MarkItDownException):
+class MissingDependencyException(MarkItDownException):
    """
-    Thrown when instantiating a DocumentConverter in cases where
-    a required library or dependency is not installed, an API key
-    is not set, or some other prerequisite is not met.
+    Converters shipped with MarkItDown may depend on optional
+    dependencies. This exception is thrown when a converter's
+    convert() method is called, but the required dependency is not
+    installed. This is not necessarily a fatal error, as the converter
+    will simply be skipped (an error will bubble up only if no other
+    suitable converter is found).

-    This is not necessarily a fatal error. If thrown during
-    MarkItDown's plugin loading phase, the converter will simply be
-    skipped, and a warning will be issued.
+    Error messages should clearly indicate which dependency is missing.
    """

    pass
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -42,7 +42,6 @@ from .converters import (
 from ._exceptions import (
    FileConversionException,
    UnsupportedFormatException,
-    ConverterPrerequisiteException,
    FailedConversionAttempt,
 )

--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,16 +1,24 @@
 from typing import Any, Union
 import re
-
-# Azure imports
-from azure.ai.documentintelligence import DocumentIntelligenceClient
-from azure.ai.documentintelligence.models import (
-    AnalyzeDocumentRequest,
-    AnalyzeResult,
-    DocumentAnalysisFeature,
-)
-from azure.identity import DefaultAzureCredential
+import sys

 from ._base import DocumentConverter, DocumentConverterResult
+from .._exceptions import MissingDependencyException
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_dependency_exc_info = None
+try:
+    from azure.ai.documentintelligence import DocumentIntelligenceClient
+    from azure.ai.documentintelligence.models import (
+        AnalyzeDocumentRequest,
+        AnalyzeResult,
+        DocumentAnalysisFeature,
+    )
+    from azure.identity import DefaultAzureCredential
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()


 # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
@@ -30,6 +38,16 @@ class DocumentIntelligenceConverter(DocumentConverter):
    ):
        super().__init__(priority=priority)

+        # Raise an error if the dependencies are not available.
+        # This is different than other converters since this one isn't even instantiated
+        # unless explicitly requested.
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
+            ) from _dependency_exc_info[1].with_traceback(
+                _dependency_exc_info[2]
+            )  # Restore the original traceback
+
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -1,6 +1,6 @@
-from typing import Union
+import sys

-import mammoth
+from typing import Union

 from ._base import (
    DocumentConverterResult,
@@ -8,6 +8,16 @@ from ._base import (

 from ._base import DocumentConverter
 from ._html_converter import HtmlConverter
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_dependency_exc_info = None
+try:
+    import mammoth
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()


 class DocxConverter(HtmlConverter):
@@ -26,6 +36,18 @@ class DocxConverter(HtmlConverter):
        if extension.lower() != ".docx":
            return None

+        # Check the dependencies
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".docx",
+                    feature="docx",
+                )
+            ) from _dependency_exc_info[1].with_traceback(
+                _dependency_exc_info[2]
+            )  # Restore the original traceback
+
        result = None
        with open(local_path, "rb") as docx_file:
            style_map = kwargs.get("style_map", None)
--- a/packages/markitdown/src/markitdown/converters/_image_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -7,7 +7,7 @@ import mimetypes

 class ImageConverter(MediaConverter):
    """
-    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """

    def __init__(
--- a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@@ -1,6 +1,16 @@
-import olefile
+import sys
 from typing import Any, Union
 from ._base import DocumentConverter, DocumentConverterResult
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_dependency_exc_info = None
+try:
+    import olefile
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()


 class OutlookMsgConverter(DocumentConverter):
@@ -24,6 +34,18 @@ class OutlookMsgConverter(DocumentConverter):
        if extension.lower() != ".msg":
            return None

+        # Check the dependencies
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".msg",
+                    feature="outlook",
+                )
+            ) from _dependency_exc_info[1].with_traceback(
+                _dependency_exc_info[2]
+            )  # Restore the original traceback
+
        try:
            msg = olefile.OleFileIO(local_path)
            # Extract email metadata
@@ -59,10 +81,12 @@ class OutlookMsgConverter(DocumentConverter):
                f"Could not convert MSG file '{local_path}': {str(e)}"
            )

-    def _get_stream_data(
-        self, msg: olefile.OleFileIO, stream_path: str
-    ) -> Union[str, None]:
+    def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
        """Helper to safely extract and decode stream data from the MSG file."""
+        assert isinstance(
+            msg, olefile.OleFileIO
+        )  # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
+
        try:
            if msg.exists(stream_path):
                data = msg.openstream(stream_path).read()
--- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -1,7 +1,17 @@
-import pdfminer
-import pdfminer.high_level
+import sys
 from typing import Union
 from ._base import DocumentConverter, DocumentConverterResult
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_dependency_exc_info = None
+try:
+    import pdfminer
+    import pdfminer.high_level
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()


 class PdfConverter(DocumentConverter):
@@ -20,6 +30,18 @@ class PdfConverter(DocumentConverter):
        if extension.lower() != ".pdf":
            return None

+        # Check the dependencies
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".pdf",
+                    feature="pdf",
+                )
+            ) from _dependency_exc_info[1].with_traceback(
+                _dependency_exc_info[2]
+            )  # Restore the original traceback
+
        return DocumentConverterResult(
            title=None,
            text_content=pdfminer.high_level.extract_text(local_path),
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -6,6 +6,13 @@ from typing import Any, Union
 from ._base import DocumentConverter, DocumentConverterResult


+# Mimetypes to ignore (commonly confused extensions)
+IGNORE_MIMETYPES = [
+    "text/vnd.in3d.spot",  # .spo which is confused with xls, doc, etc.
+    "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
+]
+
+
 class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

@@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter):
            "__placeholder" + kwargs.get("file_extension", "")
        )

+        # Ignore common false positives
+        if content_type in IGNORE_MIMETYPES:
+            content_type = None
+
        # Only accept text files
        if content_type is None:
            return None
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -1,12 +1,22 @@
 import base64
-import pptx
 import re
 import html
+import sys

 from typing import Union

 from ._base import DocumentConverterResult, DocumentConverter
 from ._html_converter import HtmlConverter
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_dependency_exc_info = None
+try:
+    import pptx
+except ImportError:
+    # Preserve the error and stack trace for later
+    _dependency_exc_info = sys.exc_info()


 class PptxConverter(HtmlConverter):
@@ -54,9 +64,20 @@ class PptxConverter(HtmlConverter):
        if extension.lower() != ".pptx":
            return None

-        md_content = ""
+        # Check the dependencies
+        if _dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".pptx",
+                    feature="pptx",
+                )
+            ) from _dependency_exc_info[1].with_traceback(
+                _dependency_exc_info[2]
+            )  # Restore the original traceback

        presentation = pptx.Presentation(local_path)
+        md_content = ""
        slide_num = 0
        for slide in presentation.slides:
            slide_num += 1
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -1,9 +1,26 @@
-from typing import Union
+import sys

-import pandas as pd
+from typing import Union

 from ._base import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
+from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
+
+# Try loading optional (but in this case, required) dependencies
+# Save reporting of any exceptions for later
+_xlsx_dependency_exc_info = None
+try:
+    import pandas as pd
+    import openpyxl
+except ImportError:
+    _xlsx_dependency_exc_info = sys.exc_info()
+
+_xls_dependency_exc_info = None
+try:
+    import pandas as pd
+    import xlrd
+except ImportError:
+    _xls_dependency_exc_info = sys.exc_info()


 class XlsxConverter(HtmlConverter):
@@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter):
        if extension.lower() != ".xlsx":
            return None

+        # Check the dependencies
+        if _xlsx_dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".xlsx",
+                    feature="xlsx",
+                )
+            ) from _xlsx_dependency_exc_info[1].with_traceback(
+                _xlsx_dependency_exc_info[2]
+            )  # Restore the original traceback
+
        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
        md_content = ""
        for s in sheets:
@@ -46,6 +75,18 @@ class XlsConverter(HtmlConverter):
        if extension.lower() != ".xls":
            return None

+        # Check the dependencies
+        if _xls_dependency_exc_info is not None:
+            raise MissingDependencyException(
+                MISSING_DEPENDENCY_MESSAGE.format(
+                    converter=type(self).__name__,
+                    extension=".xls",
+                    feature="xls",
+                )
+            ) from _xls_dependency_exc_info[1].with_traceback(
+                _xls_dependency_exc_info[2]
+            )  # Restore the original traceback
+
        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
        md_content = ""
        for s in sheets: