Exploring ways to allow Optional dependencies (#1079)
* Enable optional dependencies. Starting with pptx. * Fix CLI tests.... have them install [all] * Added .docx to optional dependencies * Reuse error messages for missing dependencies. * Added xlsx and xls * Added pdfs * Added Ole files. * Updated READMEs, and finished remaining feature-categories. * Move OpenAI to hatch-test environment.
This commit is contained in:
@@ -10,7 +10,7 @@
|
||||
From PyPI:
|
||||
|
||||
```bash
|
||||
pip install markitdown
|
||||
pip install markitdown[all]
|
||||
```
|
||||
|
||||
From source:
|
||||
@@ -18,7 +18,7 @@ From source:
|
||||
```bash
|
||||
git clone git@github.com:microsoft/markitdown.git
|
||||
cd markitdown
|
||||
pip install -e packages/markitdown
|
||||
pip install -e packages/markitdown[all]
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
@@ -26,25 +26,36 @@ classifiers = [
|
||||
dependencies = [
|
||||
"beautifulsoup4",
|
||||
"requests",
|
||||
"mammoth",
|
||||
"markdownify~=0.14.1",
|
||||
"numpy",
|
||||
"puremagic",
|
||||
"pathvalidate",
|
||||
"charset-normalizer",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
all = [
|
||||
"python-pptx",
|
||||
"mammoth",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd",
|
||||
"pdfminer.six",
|
||||
"puremagic",
|
||||
"pydub",
|
||||
"olefile",
|
||||
"youtube-transcript-api",
|
||||
"pydub",
|
||||
"SpeechRecognition",
|
||||
"pathvalidate",
|
||||
"charset-normalizer",
|
||||
"openai",
|
||||
"youtube-transcript-api",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth"]
|
||||
xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six"]
|
||||
outlook = ["olefile"]
|
||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||
youtube-transcription = ["youtube-transcript-api"]
|
||||
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
@@ -57,6 +68,15 @@ path = "src/markitdown/__about__.py"
|
||||
[project.scripts]
|
||||
markitdown = "markitdown.__main__:main"
|
||||
|
||||
[tool.hatch.envs.default]
|
||||
features = ["all"]
|
||||
|
||||
[tool.hatch.envs.hatch-test]
|
||||
features = ["all"]
|
||||
extra-dependencies = [
|
||||
"openai",
|
||||
]
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
extra-dependencies = [
|
||||
"mypy>=1.0.0",
|
||||
|
||||
@@ -6,7 +6,7 @@ from .__about__ import __version__
|
||||
from ._markitdown import MarkItDown
|
||||
from ._exceptions import (
|
||||
MarkItDownException,
|
||||
ConverterPrerequisiteException,
|
||||
MissingDependencyException,
|
||||
FailedConversionAttempt,
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
@@ -19,7 +19,7 @@ __all__ = [
|
||||
"DocumentConverter",
|
||||
"DocumentConverterResult",
|
||||
"MarkItDownException",
|
||||
"ConverterPrerequisiteException",
|
||||
"MissingDependencyException",
|
||||
"FailedConversionAttempt",
|
||||
"FileConversionException",
|
||||
"UnsupportedFormatException",
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
from typing import Optional, List, Any
|
||||
|
||||
MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential {extension} file, but the dependencies needed to read {extension} files have not been installed. To resolve this error, include the optional dependency [{feature}] or [all] when installing MarkItDown. For example:
|
||||
|
||||
* pip install markitdown[{feature}]
|
||||
* pip install markitdown[all]
|
||||
* pip install markitdown[{feature}, ...]
|
||||
* etc."""
|
||||
|
||||
|
||||
class MarkItDownException(Exception):
|
||||
"""
|
||||
@@ -9,15 +16,16 @@ class MarkItDownException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class ConverterPrerequisiteException(MarkItDownException):
|
||||
class MissingDependencyException(MarkItDownException):
|
||||
"""
|
||||
Thrown when instantiating a DocumentConverter in cases where
|
||||
a required library or dependency is not installed, an API key
|
||||
is not set, or some other prerequisite is not met.
|
||||
Converters shipped with MarkItDown may depend on optional
|
||||
dependencies. This exception is thrown when a converter's
|
||||
convert() method is called, but the required dependency is not
|
||||
installed. This is not necessarily a fatal error, as the converter
|
||||
will simply be skipped (an error will bubble up only if no other
|
||||
suitable converter is found).
|
||||
|
||||
This is not necessarily a fatal error. If thrown during
|
||||
MarkItDown's plugin loading phase, the converter will simply be
|
||||
skipped, and a warning will be issued.
|
||||
Error messages should clearly indicate which dependency is missing.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
@@ -42,7 +42,6 @@ from .converters import (
|
||||
from ._exceptions import (
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
ConverterPrerequisiteException,
|
||||
FailedConversionAttempt,
|
||||
)
|
||||
|
||||
|
||||
@@ -1,16 +1,24 @@
|
||||
from typing import Any, Union
|
||||
import re
|
||||
|
||||
# Azure imports
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import (
|
||||
AnalyzeDocumentRequest,
|
||||
AnalyzeResult,
|
||||
DocumentAnalysisFeature,
|
||||
)
|
||||
from azure.identity import DefaultAzureCredential
|
||||
import sys
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import (
|
||||
AnalyzeDocumentRequest,
|
||||
AnalyzeResult,
|
||||
DocumentAnalysisFeature,
|
||||
)
|
||||
from azure.identity import DefaultAzureCredential
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||
@@ -30,6 +38,16 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
# Raise an error if the dependencies are not available.
|
||||
# This is different than other converters since this one isn't even instantiated
|
||||
# unless explicitly requested.
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
"DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
self.endpoint = endpoint
|
||||
self.api_version = api_version
|
||||
self.doc_intel_client = DocumentIntelligenceClient(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from typing import Union
|
||||
import sys
|
||||
|
||||
import mammoth
|
||||
from typing import Union
|
||||
|
||||
from ._base import (
|
||||
DocumentConverterResult,
|
||||
@@ -8,6 +8,16 @@ from ._base import (
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import mammoth
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class DocxConverter(HtmlConverter):
|
||||
@@ -26,6 +36,18 @@ class DocxConverter(HtmlConverter):
|
||||
if extension.lower() != ".docx":
|
||||
return None
|
||||
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".docx",
|
||||
feature="docx",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
result = None
|
||||
with open(local_path, "rb") as docx_file:
|
||||
style_map = kwargs.get("style_map", None)
|
||||
|
||||
@@ -7,7 +7,7 @@ import mimetypes
|
||||
|
||||
class ImageConverter(MediaConverter):
|
||||
"""
|
||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -1,6 +1,16 @@
|
||||
import olefile
|
||||
import sys
|
||||
from typing import Any, Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import olefile
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class OutlookMsgConverter(DocumentConverter):
|
||||
@@ -24,6 +34,18 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
if extension.lower() != ".msg":
|
||||
return None
|
||||
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".msg",
|
||||
feature="outlook",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
try:
|
||||
msg = olefile.OleFileIO(local_path)
|
||||
# Extract email metadata
|
||||
@@ -59,10 +81,12 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
f"Could not convert MSG file '{local_path}': {str(e)}"
|
||||
)
|
||||
|
||||
def _get_stream_data(
|
||||
self, msg: olefile.OleFileIO, stream_path: str
|
||||
) -> Union[str, None]:
|
||||
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
|
||||
"""Helper to safely extract and decode stream data from the MSG file."""
|
||||
assert isinstance(
|
||||
msg, olefile.OleFileIO
|
||||
) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
|
||||
|
||||
try:
|
||||
if msg.exists(stream_path):
|
||||
data = msg.openstream(stream_path).read()
|
||||
|
||||
@@ -1,7 +1,17 @@
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import sys
|
||||
from typing import Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class PdfConverter(DocumentConverter):
|
||||
@@ -20,6 +30,18 @@ class PdfConverter(DocumentConverter):
|
||||
if extension.lower() != ".pdf":
|
||||
return None
|
||||
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".pdf",
|
||||
feature="pdf",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
return DocumentConverterResult(
|
||||
title=None,
|
||||
text_content=pdfminer.high_level.extract_text(local_path),
|
||||
|
||||
@@ -6,6 +6,13 @@ from typing import Any, Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
|
||||
|
||||
# Mimetypes to ignore (commonly confused extensions)
|
||||
IGNORE_MIMETYPES = [
|
||||
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
|
||||
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
|
||||
]
|
||||
|
||||
|
||||
class PlainTextConverter(DocumentConverter):
|
||||
"""Anything with content type text/plain"""
|
||||
|
||||
@@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter):
|
||||
"__placeholder" + kwargs.get("file_extension", "")
|
||||
)
|
||||
|
||||
# Ignore common false positives
|
||||
if content_type in IGNORE_MIMETYPES:
|
||||
content_type = None
|
||||
|
||||
# Only accept text files
|
||||
if content_type is None:
|
||||
return None
|
||||
|
||||
@@ -1,12 +1,22 @@
|
||||
import base64
|
||||
import pptx
|
||||
import re
|
||||
import html
|
||||
import sys
|
||||
|
||||
from typing import Union
|
||||
|
||||
from ._base import DocumentConverterResult, DocumentConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pptx
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class PptxConverter(HtmlConverter):
|
||||
@@ -54,9 +64,20 @@ class PptxConverter(HtmlConverter):
|
||||
if extension.lower() != ".pptx":
|
||||
return None
|
||||
|
||||
md_content = ""
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".pptx",
|
||||
feature="pptx",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
presentation = pptx.Presentation(local_path)
|
||||
md_content = ""
|
||||
slide_num = 0
|
||||
for slide in presentation.slides:
|
||||
slide_num += 1
|
||||
|
||||
@@ -1,9 +1,26 @@
|
||||
from typing import Union
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
from typing import Union
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_xlsx_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
import openpyxl
|
||||
except ImportError:
|
||||
_xlsx_dependency_exc_info = sys.exc_info()
|
||||
|
||||
_xls_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
import xlrd
|
||||
except ImportError:
|
||||
_xls_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class XlsxConverter(HtmlConverter):
|
||||
@@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter):
|
||||
if extension.lower() != ".xlsx":
|
||||
return None
|
||||
|
||||
# Check the dependencies
|
||||
if _xlsx_dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".xlsx",
|
||||
feature="xlsx",
|
||||
)
|
||||
) from _xlsx_dependency_exc_info[1].with_traceback(
|
||||
_xlsx_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
@@ -46,6 +75,18 @@ class XlsConverter(HtmlConverter):
|
||||
if extension.lower() != ".xls":
|
||||
return None
|
||||
|
||||
# Check the dependencies
|
||||
if _xls_dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".xls",
|
||||
feature="xls",
|
||||
)
|
||||
) from _xls_dependency_exc_info[1].with_traceback(
|
||||
_xls_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
|
||||
Reference in New Issue
Block a user