Exploring ways to allow Optional dependencies (#1079)
* Enable optional dependencies. Starting with pptx. * Fix CLI tests.... have them install [all] * Added .docx to optional dependencies * Reuse error messages for missing dependencies. * Added xlsx and xls * Added pdfs * Added Ole files. * Updated READMEs, and finished remaining feature-categories. * Move OpenAI to hatch-test environment.
This commit is contained in:
29
README.md
29
README.md
@@ -5,7 +5,8 @@
|
||||
[](https://github.com/microsoft/autogen)
|
||||
|
||||
> [!IMPORTANT]
|
||||
> MarkItDown 0.0.2 alpha 1 (0.0.2a1) introduces a plugin-based architecture. As much as was possible, command-line and Python interfaces have remained the same as 0.0.1a3 to support backward compatibility. Please report any issues you encounter. Some interface changes may yet occur as we continue to refine MarkItDown to a first non-alpha release.
|
||||
> Breaking changes between 0.0.1 and 0.0.2:
|
||||
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior.
|
||||
|
||||
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
||||
It supports:
|
||||
@@ -22,12 +23,12 @@ It supports:
|
||||
- YouTube URLs
|
||||
- ... and more!
|
||||
|
||||
To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source:
|
||||
To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:
|
||||
|
||||
```bash
|
||||
git clone git@github.com:microsoft/markitdown.git
|
||||
cd markitdown
|
||||
pip install -e packages/markitdown
|
||||
pip install -e packages/markitdown[all]
|
||||
```
|
||||
|
||||
## Usage
|
||||
@@ -50,6 +51,28 @@ You can also pipe content:
|
||||
cat path-to-file.pdf | markitdown
|
||||
```
|
||||
|
||||
### Optional Dependencies
|
||||
MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
|
||||
|
||||
```bash
|
||||
pip install 'markitdown[pdf,docx,pptx]'
|
||||
```
|
||||
|
||||
will install only the dependencies for PDF, DOCX, and PPTX files.
|
||||
|
||||
At the moment, the following optional dependencies are available:
|
||||
|
||||
* `[all]` Installs all optional dependencies
|
||||
* `[pptx]` Installs dependencies for PowerPoint files
|
||||
* `[docx]` Installs dependencies for Word files
|
||||
* `[xlsx]` Installs dependencies for Excel files
|
||||
* `[xls]` Installs dependencies for older Excel files
|
||||
* `[pdf]` Installs dependencies for PDF files
|
||||
* `[outlook]` Installs dependencies for Outlook messages
|
||||
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
|
||||
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
|
||||
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
|
||||
|
||||
### Plugins
|
||||
|
||||
MarkItDown also supports 3rd-party plugins. Plugins are disabled by default. To list installed plugins:
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
From PyPI:
|
||||
|
||||
```bash
|
||||
pip install markitdown
|
||||
pip install markitdown[all]
|
||||
```
|
||||
|
||||
From source:
|
||||
@@ -18,7 +18,7 @@ From source:
|
||||
```bash
|
||||
git clone git@github.com:microsoft/markitdown.git
|
||||
cd markitdown
|
||||
pip install -e packages/markitdown
|
||||
pip install -e packages/markitdown[all]
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
@@ -26,25 +26,36 @@ classifiers = [
|
||||
dependencies = [
|
||||
"beautifulsoup4",
|
||||
"requests",
|
||||
"mammoth",
|
||||
"markdownify~=0.14.1",
|
||||
"numpy",
|
||||
"puremagic",
|
||||
"pathvalidate",
|
||||
"charset-normalizer",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
all = [
|
||||
"python-pptx",
|
||||
"mammoth",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd",
|
||||
"pdfminer.six",
|
||||
"puremagic",
|
||||
"pydub",
|
||||
"olefile",
|
||||
"youtube-transcript-api",
|
||||
"pydub",
|
||||
"SpeechRecognition",
|
||||
"pathvalidate",
|
||||
"charset-normalizer",
|
||||
"openai",
|
||||
"youtube-transcript-api",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth"]
|
||||
xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six"]
|
||||
outlook = ["olefile"]
|
||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||
youtube-transcription = ["youtube-transcript-api"]
|
||||
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
@@ -57,6 +68,15 @@ path = "src/markitdown/__about__.py"
|
||||
[project.scripts]
|
||||
markitdown = "markitdown.__main__:main"
|
||||
|
||||
[tool.hatch.envs.default]
|
||||
features = ["all"]
|
||||
|
||||
[tool.hatch.envs.hatch-test]
|
||||
features = ["all"]
|
||||
extra-dependencies = [
|
||||
"openai",
|
||||
]
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
extra-dependencies = [
|
||||
"mypy>=1.0.0",
|
||||
|
||||
@@ -6,7 +6,7 @@ from .__about__ import __version__
|
||||
from ._markitdown import MarkItDown
|
||||
from ._exceptions import (
|
||||
MarkItDownException,
|
||||
ConverterPrerequisiteException,
|
||||
MissingDependencyException,
|
||||
FailedConversionAttempt,
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
@@ -19,7 +19,7 @@ __all__ = [
|
||||
"DocumentConverter",
|
||||
"DocumentConverterResult",
|
||||
"MarkItDownException",
|
||||
"ConverterPrerequisiteException",
|
||||
"MissingDependencyException",
|
||||
"FailedConversionAttempt",
|
||||
"FileConversionException",
|
||||
"UnsupportedFormatException",
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
from typing import Optional, List, Any
|
||||
|
||||
MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential {extension} file, but the dependencies needed to read {extension} files have not been installed. To resolve this error, include the optional dependency [{feature}] or [all] when installing MarkItDown. For example:
|
||||
|
||||
* pip install markitdown[{feature}]
|
||||
* pip install markitdown[all]
|
||||
* pip install markitdown[{feature}, ...]
|
||||
* etc."""
|
||||
|
||||
|
||||
class MarkItDownException(Exception):
|
||||
"""
|
||||
@@ -9,15 +16,16 @@ class MarkItDownException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class ConverterPrerequisiteException(MarkItDownException):
|
||||
class MissingDependencyException(MarkItDownException):
|
||||
"""
|
||||
Thrown when instantiating a DocumentConverter in cases where
|
||||
a required library or dependency is not installed, an API key
|
||||
is not set, or some other prerequisite is not met.
|
||||
Converters shipped with MarkItDown may depend on optional
|
||||
dependencies. This exception is thrown when a converter's
|
||||
convert() method is called, but the required dependency is not
|
||||
installed. This is not necessarily a fatal error, as the converter
|
||||
will simply be skipped (an error will bubble up only if no other
|
||||
suitable converter is found).
|
||||
|
||||
This is not necessarily a fatal error. If thrown during
|
||||
MarkItDown's plugin loading phase, the converter will simply be
|
||||
skipped, and a warning will be issued.
|
||||
Error messages should clearly indicate which dependency is missing.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
@@ -42,7 +42,6 @@ from .converters import (
|
||||
from ._exceptions import (
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
ConverterPrerequisiteException,
|
||||
FailedConversionAttempt,
|
||||
)
|
||||
|
||||
|
||||
@@ -1,16 +1,24 @@
|
||||
from typing import Any, Union
|
||||
import re
|
||||
|
||||
# Azure imports
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import (
|
||||
AnalyzeDocumentRequest,
|
||||
AnalyzeResult,
|
||||
DocumentAnalysisFeature,
|
||||
)
|
||||
from azure.identity import DefaultAzureCredential
|
||||
import sys
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import (
|
||||
AnalyzeDocumentRequest,
|
||||
AnalyzeResult,
|
||||
DocumentAnalysisFeature,
|
||||
)
|
||||
from azure.identity import DefaultAzureCredential
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
|
||||
@@ -30,6 +38,16 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
# Raise an error if the dependencies are not available.
|
||||
# This is different than other converters since this one isn't even instantiated
|
||||
# unless explicitly requested.
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
"DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
self.endpoint = endpoint
|
||||
self.api_version = api_version
|
||||
self.doc_intel_client = DocumentIntelligenceClient(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from typing import Union
|
||||
import sys
|
||||
|
||||
import mammoth
|
||||
from typing import Union
|
||||
|
||||
from ._base import (
|
||||
DocumentConverterResult,
|
||||
@@ -8,6 +8,16 @@ from ._base import (
|
||||
|
||||
from ._base import DocumentConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import mammoth
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class DocxConverter(HtmlConverter):
|
||||
@@ -26,6 +36,18 @@ class DocxConverter(HtmlConverter):
|
||||
if extension.lower() != ".docx":
|
||||
return None
|
||||
|
||||
# Check: the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".docx",
|
||||
feature="docx",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
result = None
|
||||
with open(local_path, "rb") as docx_file:
|
||||
style_map = kwargs.get("style_map", None)
|
||||
|
||||
@@ -7,7 +7,7 @@ import mimetypes
|
||||
|
||||
class ImageConverter(MediaConverter):
|
||||
"""
|
||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
|
||||
@@ -1,6 +1,16 @@
|
||||
import olefile
|
||||
import sys
|
||||
from typing import Any, Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import olefile
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class OutlookMsgConverter(DocumentConverter):
|
||||
@@ -24,6 +34,18 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
if extension.lower() != ".msg":
|
||||
return None
|
||||
|
||||
# Check: the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".msg",
|
||||
feature="outlook",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
try:
|
||||
msg = olefile.OleFileIO(local_path)
|
||||
# Extract email metadata
|
||||
@@ -59,10 +81,12 @@ class OutlookMsgConverter(DocumentConverter):
|
||||
f"Could not convert MSG file '{local_path}': {str(e)}"
|
||||
)
|
||||
|
||||
def _get_stream_data(
|
||||
self, msg: olefile.OleFileIO, stream_path: str
|
||||
) -> Union[str, None]:
|
||||
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
|
||||
"""Helper to safely extract and decode stream data from the MSG file."""
|
||||
assert isinstance(
|
||||
msg, olefile.OleFileIO
|
||||
) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
|
||||
|
||||
try:
|
||||
if msg.exists(stream_path):
|
||||
data = msg.openstream(stream_path).read()
|
||||
|
||||
@@ -1,7 +1,17 @@
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import sys
|
||||
from typing import Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class PdfConverter(DocumentConverter):
|
||||
@@ -20,6 +30,18 @@ class PdfConverter(DocumentConverter):
|
||||
if extension.lower() != ".pdf":
|
||||
return None
|
||||
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".pdf",
|
||||
feature="pdf",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
return DocumentConverterResult(
|
||||
title=None,
|
||||
text_content=pdfminer.high_level.extract_text(local_path),
|
||||
|
||||
@@ -6,6 +6,13 @@ from typing import Any, Union
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
|
||||
|
||||
# Mimetypes to ignore (commonly confused extensions)
|
||||
IGNORE_MIMETYPES = [
|
||||
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
|
||||
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
|
||||
]
|
||||
|
||||
|
||||
class PlainTextConverter(DocumentConverter):
|
||||
"""Anything with content type text/plain"""
|
||||
|
||||
@@ -22,6 +29,10 @@ class PlainTextConverter(DocumentConverter):
|
||||
"__placeholder" + kwargs.get("file_extension", "")
|
||||
)
|
||||
|
||||
# Ignore common false positives
|
||||
if content_type in IGNORE_MIMETYPES:
|
||||
content_type = None
|
||||
|
||||
# Only accept text files
|
||||
if content_type is None:
|
||||
return None
|
||||
|
||||
@@ -1,12 +1,22 @@
|
||||
import base64
|
||||
import pptx
|
||||
import re
|
||||
import html
|
||||
import sys
|
||||
|
||||
from typing import Union
|
||||
|
||||
from ._base import DocumentConverterResult, DocumentConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pptx
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class PptxConverter(HtmlConverter):
|
||||
@@ -54,9 +64,20 @@ class PptxConverter(HtmlConverter):
|
||||
if extension.lower() != ".pptx":
|
||||
return None
|
||||
|
||||
md_content = ""
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".pptx",
|
||||
feature="pptx",
|
||||
)
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
presentation = pptx.Presentation(local_path)
|
||||
md_content = ""
|
||||
slide_num = 0
|
||||
for slide in presentation.slides:
|
||||
slide_num += 1
|
||||
|
||||
@@ -1,9 +1,26 @@
|
||||
from typing import Union
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
from typing import Union
|
||||
|
||||
from ._base import DocumentConverter, DocumentConverterResult
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
_xlsx_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
import openpyxl
|
||||
except ImportError:
|
||||
_xlsx_dependency_exc_info = sys.exc_info()
|
||||
|
||||
_xls_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
import xlrd
|
||||
except ImportError:
|
||||
_xls_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class XlsxConverter(HtmlConverter):
|
||||
@@ -22,6 +39,18 @@ class XlsxConverter(HtmlConverter):
|
||||
if extension.lower() != ".xlsx":
|
||||
return None
|
||||
|
||||
# Check the dependencies
|
||||
if _xlsx_dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".xlsx",
|
||||
feature="xlsx",
|
||||
)
|
||||
) from _xlsx_dependency_exc_info[1].with_traceback(
|
||||
_xlsx_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
@@ -46,6 +75,18 @@ class XlsConverter(HtmlConverter):
|
||||
if extension.lower() != ".xls":
|
||||
return None
|
||||
|
||||
# Check the dependencies
|
||||
if _xls_dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
converter=type(self).__name__,
|
||||
extension=".xls",
|
||||
feature="xls",
|
||||
)
|
||||
) from _xls_dependency_exc_info[1].with_traceback(
|
||||
_xls_dependency_exc_info[2]
|
||||
) # Restore the original traceback
|
||||
|
||||
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
|
||||
md_content = ""
|
||||
for s in sheets:
|
||||
|
||||
Reference in New Issue
Block a user