Exploring ways to allow Optional dependencies (#1079)

* Enable optional dependencies. Starting with pptx.
* Fix CLI tests.... have them install [all]
* Added .docx to optional dependencies
* Reuse error messages for missing dependencies.
* Added xlsx and xls
* Added pdfs
* Added Ole files.
* Updated READMEs, and finished remaining feature-categories.
* Move OpenAI to hatch-test environment.
This commit is contained in:
afourney
2025-03-03 09:06:19 -08:00
committed by GitHub
parent f01c6c5277
commit c5cd659f63
14 changed files with 254 additions and 45 deletions

View File

@@ -10,7 +10,7 @@
From PyPI:
```bash
pip install markitdown
pip install 'markitdown[all]'
```
From source:
@@ -18,7 +18,7 @@ From source:
```bash
git clone git@github.com:microsoft/markitdown.git
cd markitdown
pip install -e packages/markitdown
pip install -e 'packages/markitdown[all]'
```
## Usage

View File

@@ -26,25 +26,36 @@ classifiers = [
dependencies = [
"beautifulsoup4",
"requests",
"mammoth",
"markdownify~=0.14.1",
"numpy",
"puremagic",
"pathvalidate",
"charset-normalizer",
]
[project.optional-dependencies]
all = [
"python-pptx",
"mammoth",
"pandas",
"openpyxl",
"xlrd",
"pdfminer.six",
"puremagic",
"pydub",
"olefile",
"youtube-transcript-api",
"pydub",
"SpeechRecognition",
"pathvalidate",
"charset-normalizer",
"openai",
"youtube-transcript-api",
"azure-ai-documentintelligence",
"azure-identity"
]
pptx = ["python-pptx"]
docx = ["mammoth"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
outlook = ["olefile"]
audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"]
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"
@@ -57,6 +68,15 @@ path = "src/markitdown/__about__.py"
[project.scripts]
markitdown = "markitdown.__main__:main"
[tool.hatch.envs.default]
features = ["all"]
[tool.hatch.envs.hatch-test]
features = ["all"]
extra-dependencies = [
"openai",
]
[tool.hatch.envs.types]
extra-dependencies = [
"mypy>=1.0.0",

View File

@@ -6,7 +6,7 @@ from .__about__ import __version__
from ._markitdown import MarkItDown
from ._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
MissingDependencyException,
FailedConversionAttempt,
FileConversionException,
UnsupportedFormatException,
@@ -19,7 +19,7 @@ __all__ = [
"DocumentConverter",
"DocumentConverterResult",
"MarkItDownException",
"ConverterPrerequisiteException",
"MissingDependencyException",
"FailedConversionAttempt",
"FileConversionException",
"UnsupportedFormatException",

View File

@@ -1,5 +1,12 @@
from typing import Optional, List, Any
# Shared template for the message carried by MissingDependencyException.
# Callers fill it in with str.format(), supplying three named fields:
#   converter - name of the converter class that recognized the input
#   extension - the file extension that was recognized (e.g. ".docx")
#   feature   - the optional-dependency extra to install (e.g. "docx")
MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential {extension} file, but the dependencies needed to read {extension} files have not been installed. To resolve this error, include the optional dependency [{feature}] or [all] when installing MarkItDown. For example:
* pip install markitdown[{feature}]
* pip install markitdown[all]
* pip install markitdown[{feature}, ...]
* etc."""
class MarkItDownException(Exception):
"""
@@ -9,15 +16,16 @@ class MarkItDownException(Exception):
pass
class ConverterPrerequisiteException(MarkItDownException):
class MissingDependencyException(MarkItDownException):
"""
Thrown when instantiating a DocumentConverter in cases where
a required library or dependency is not installed, an API key
is not set, or some other prerequisite is not met.
Converters shipped with MarkItDown may depend on optional
dependencies. This exception is thrown when a converter's
convert() method is called, but the required dependency is not
installed. This is not necessarily a fatal error, as the converter
will simply be skipped (an error will bubble up only if no other
suitable converter is found).
This is not necessarily a fatal error. If thrown during
MarkItDown's plugin loading phase, the converter will simply be
skipped, and a warning will be issued.
Error messages should clearly indicate which dependency is missing.
"""
pass

View File

@@ -42,7 +42,6 @@ from .converters import (
from ._exceptions import (
FileConversionException,
UnsupportedFormatException,
ConverterPrerequisiteException,
FailedConversionAttempt,
)

View File

@@ -1,16 +1,24 @@
from typing import Any, Union
import re
# Azure imports
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest,
AnalyzeResult,
DocumentAnalysisFeature,
)
from azure.identity import DefaultAzureCredential
import sys
from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest,
AnalyzeResult,
DocumentAnalysisFeature,
)
from azure.identity import DefaultAzureCredential
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
@@ -30,6 +38,16 @@ class DocumentIntelligenceConverter(DocumentConverter):
):
super().__init__(priority=priority)
# Raise an error if the dependencies are not available.
# This differs from the other converters because this one is not even
# instantiated unless explicitly requested.
if _dependency_exc_info is not None:
raise MissingDependencyException(
"DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(

View File

@@ -1,6 +1,6 @@
from typing import Union
import sys
import mammoth
from typing import Union
from ._base import (
DocumentConverterResult,
@@ -8,6 +8,16 @@ from ._base import (
from ._base import DocumentConverter
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import mammoth
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class DocxConverter(HtmlConverter):
@@ -26,6 +36,18 @@ class DocxConverter(HtmlConverter):
if extension.lower() != ".docx":
return None
# Check: the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".docx",
feature="docx",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
result = None
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None)

View File

@@ -7,7 +7,7 @@ import mimetypes
class ImageConverter(MediaConverter):
"""
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""
def __init__(

View File

@@ -1,6 +1,16 @@
import olefile
import sys
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import olefile
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class OutlookMsgConverter(DocumentConverter):
@@ -24,6 +34,18 @@ class OutlookMsgConverter(DocumentConverter):
if extension.lower() != ".msg":
return None
# Check: the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".msg",
feature="outlook",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
try:
msg = olefile.OleFileIO(local_path)
# Extract email metadata
@@ -59,10 +81,12 @@ class OutlookMsgConverter(DocumentConverter):
f"Could not convert MSG file '{local_path}': {str(e)}"
)
def _get_stream_data(
self, msg: olefile.OleFileIO, stream_path: str
) -> Union[str, None]:
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
"""Helper to safely extract and decode stream data from the MSG file."""
assert isinstance(
msg, olefile.OleFileIO
) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
try:
if msg.exists(stream_path):
data = msg.openstream(stream_path).read()

View File

@@ -1,7 +1,17 @@
import pdfminer
import pdfminer.high_level
import sys
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import pdfminer
import pdfminer.high_level
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class PdfConverter(DocumentConverter):
@@ -20,6 +30,18 @@ class PdfConverter(DocumentConverter):
if extension.lower() != ".pdf":
return None
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pdf",
feature="pdf",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
return DocumentConverterResult(
title=None,
text_content=pdfminer.high_level.extract_text(local_path),

View File

@@ -6,6 +6,13 @@ from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
# Mimetypes to ignore (commonly confused extensions).
# A guessed content type found in this list is discarded before the
# "only accept text files" check.
IGNORE_MIMETYPES = [
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
]
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""
@@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter):
"__placeholder" + kwargs.get("file_extension", "")
)
# Ignore common false positives
if content_type in IGNORE_MIMETYPES:
content_type = None
# Only accept text files
if content_type is None:
return None

View File

@@ -1,12 +1,22 @@
import base64
import pptx
import re
import html
import sys
from typing import Union
from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import pptx
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
class PptxConverter(HtmlConverter):
@@ -54,9 +64,20 @@ class PptxConverter(HtmlConverter):
if extension.lower() != ".pptx":
return None
md_content = ""
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pptx",
feature="pptx",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # Restore the original traceback
presentation = pptx.Presentation(local_path)
md_content = ""
slide_num = 0
for slide in presentation.slides:
slide_num += 1

View File

@@ -1,9 +1,26 @@
from typing import Union
import sys
import pandas as pd
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
from ._html_converter import HtmlConverter
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()
_xls_dependency_exc_info = None
try:
import pandas as pd
import xlrd
except ImportError:
_xls_dependency_exc_info = sys.exc_info()
class XlsxConverter(HtmlConverter):
@@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter):
if extension.lower() != ".xlsx":
return None
# Check the dependencies
if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xlsx",
feature="xlsx",
)
) from _xlsx_dependency_exc_info[1].with_traceback(
_xlsx_dependency_exc_info[2]
) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
md_content = ""
for s in sheets:
@@ -46,6 +75,18 @@ class XlsConverter(HtmlConverter):
if extension.lower() != ".xls":
return None
# Check the dependencies
if _xls_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xls",
feature="xls",
)
) from _xls_dependency_exc_info[1].with_traceback(
_xls_dependency_exc_info[2]
) # Restore the original traceback
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
md_content = ""
for s in sheets: