Make it easier to use AzureKeyCredentials with Azure Doc Intelligence (#1151)

* Make it easier to use AzureKeyCredentials with Azure Doc Intelligence
* Fixed mypy type error.
* Added finer-grained control over which file types the converter accepts.
* Pass doc intel options further up the stack.
This commit is contained in:
afourney
2025-03-26 10:44:11 -07:00
committed by GitHub
parent 9a951055f0
commit 9e067c42b6
3 changed files with 127 additions and 37 deletions

View File

@@ -10,7 +10,7 @@ import traceback
import io import io
from dataclasses import dataclass from dataclasses import dataclass
from importlib.metadata import entry_points from importlib.metadata import entry_points
from typing import Any, List, Optional, Union, BinaryIO from typing import Any, List, Dict, Optional, Union, BinaryIO
from pathlib import Path from pathlib import Path
from urllib.parse import urlparse from urllib.parse import urlparse
from warnings import warn from warnings import warn
@@ -198,8 +198,19 @@ class MarkItDown:
# Register Document Intelligence converter at the top of the stack if endpoint is provided # Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint") docintel_endpoint = kwargs.get("docintel_endpoint")
if docintel_endpoint is not None: if docintel_endpoint is not None:
docintel_args: Dict[str, Any] = {}
docintel_args["endpoint"] = docintel_endpoint
docintel_credential = kwargs.get("docintel_credential")
if docintel_credential is not None:
docintel_args["credential"] = docintel_credential
docintel_types = kwargs.get("docintel_file_types")
if docintel_types is not None:
docintel_args["file_types"] = docintel_types
self.register_converter( self.register_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint) DocumentIntelligenceConverter(**docintel_args),
) )
self._builtins_enabled = True self._builtins_enabled = True

View File

@@ -17,7 +17,10 @@ from ._image_converter import ImageConverter
from ._audio_converter import AudioConverter from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter from ._doc_intel_converter import (
DocumentIntelligenceConverter,
DocumentIntelligenceFileType,
)
from ._epub_converter import EpubConverter from ._epub_converter import EpubConverter
__all__ = [ __all__ = [
@@ -38,5 +41,6 @@ __all__ = [
"OutlookMsgConverter", "OutlookMsgConverter",
"ZipConverter", "ZipConverter",
"DocumentIntelligenceConverter", "DocumentIntelligenceConverter",
"DocumentIntelligenceFileType",
"EpubConverter", "EpubConverter",
] ]

View File

@@ -1,7 +1,9 @@
import sys import sys
import re import re
import os
from typing import BinaryIO, Any, List from typing import BinaryIO, Any, List
from enum import Enum
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult from .._base_converter import DocumentConverter, DocumentConverterResult
@@ -18,6 +20,7 @@ try:
AnalyzeResult, AnalyzeResult,
DocumentAnalysisFeature, DocumentAnalysisFeature,
) )
from azure.core.credentials import AzureKeyCredential, TokenCredential
from azure.identity import DefaultAzureCredential from azure.identity import DefaultAzureCredential
except ImportError: except ImportError:
# Preserve the error and stack trace for later # Preserve the error and stack trace for later
@@ -29,38 +32,74 @@ except ImportError:
CONTENT_FORMAT = "markdown" CONTENT_FORMAT = "markdown"
class DocumentIntelligenceFileType(str, Enum):
    """Enum of file types supported by the Document Intelligence Converter.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value (e.g. ``DocumentIntelligenceFileType.PDF == "pdf"``).
    """

    # No OCR
    DOCX = "docx"
    PPTX = "pptx"
    XLSX = "xlsx"
    HTML = "html"
    # OCR
    PDF = "pdf"
    JPEG = "jpeg"
    PNG = "png"
    BMP = "bmp"
    TIFF = "tiff"


def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Get the MIME type prefixes for the given file types.

    Args:
        types: The file types to translate, in the order they should appear
            in the result.

    Returns:
        A flat list of MIME type prefixes. A type may contribute more than one
        prefix; unrecognized entries contribute nothing.
    """
    prefixes: List[str] = []
    for type_ in types:
        if type_ == DocumentIntelligenceFileType.DOCX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            )
        elif type_ == DocumentIntelligenceFileType.PPTX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.presentationml"
            )
        elif type_ == DocumentIntelligenceFileType.XLSX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        elif type_ == DocumentIntelligenceFileType.HTML:
            # HTML is a declared member, so it must map to something;
            # otherwise requesting it is silently ignored.
            prefixes.append("text/html")
            prefixes.append("application/xhtml")
        elif type_ == DocumentIntelligenceFileType.PDF:
            prefixes.append("application/pdf")
            prefixes.append("application/x-pdf")
        elif type_ == DocumentIntelligenceFileType.JPEG:
            prefixes.append("image/jpeg")
        elif type_ == DocumentIntelligenceFileType.PNG:
            prefixes.append("image/png")
        elif type_ == DocumentIntelligenceFileType.BMP:
            prefixes.append("image/bmp")
        elif type_ == DocumentIntelligenceFileType.TIFF:
            prefixes.append("image/tiff")
    return prefixes


def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Get the file extensions for the given file types.

    Args:
        types: The file types to translate, in the order they should appear
            in the result.

    Returns:
        A flat list of lowercase extensions, each with a leading dot. A type
        may contribute more than one extension; unrecognized entries
        contribute nothing.
    """
    extensions: List[str] = []
    for type_ in types:
        if type_ == DocumentIntelligenceFileType.DOCX:
            extensions.append(".docx")
        elif type_ == DocumentIntelligenceFileType.PPTX:
            extensions.append(".pptx")
        elif type_ == DocumentIntelligenceFileType.XLSX:
            extensions.append(".xlsx")
        elif type_ == DocumentIntelligenceFileType.HTML:
            # Keep this in step with the HTML branch of
            # _get_mime_type_prefixes so both lookups agree.
            extensions.append(".html")
            extensions.append(".htm")
        elif type_ == DocumentIntelligenceFileType.PDF:
            extensions.append(".pdf")
        elif type_ == DocumentIntelligenceFileType.JPEG:
            extensions.append(".jpg")
            extensions.append(".jpeg")
        elif type_ == DocumentIntelligenceFileType.PNG:
            extensions.append(".png")
        elif type_ == DocumentIntelligenceFileType.BMP:
            extensions.append(".bmp")
        elif type_ == DocumentIntelligenceFileType.TIFF:
            extensions.append(".tiff")
    return extensions
class DocumentIntelligenceConverter(DocumentConverter): class DocumentIntelligenceConverter(DocumentConverter):
@@ -71,8 +110,30 @@ class DocumentIntelligenceConverter(DocumentConverter):
*, *,
endpoint: str, endpoint: str,
api_version: str = "2024-07-31-preview", api_version: str = "2024-07-31-preview",
credential: AzureKeyCredential | TokenCredential | None = None,
file_types: List[DocumentIntelligenceFileType] = [
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
DocumentIntelligenceFileType.XLSX,
DocumentIntelligenceFileType.PDF,
DocumentIntelligenceFileType.JPEG,
DocumentIntelligenceFileType.PNG,
DocumentIntelligenceFileType.BMP,
DocumentIntelligenceFileType.TIFF,
],
): ):
"""
Initialize the DocumentIntelligenceConverter.
Args:
endpoint (str): The endpoint for the Document Intelligence service.
api_version (str): The API version to use. Defaults to "2024-07-31-preview".
credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
"""
super().__init__() super().__init__()
self._file_types = file_types
# Raise an error if the dependencies are not available. # Raise an error if the dependencies are not available.
# This is different than other converters since this one isn't even instantiated # This is different than other converters since this one isn't even instantiated
@@ -86,12 +147,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
_dependency_exc_info[2] _dependency_exc_info[2]
) )
if credential is None:
if os.environ.get("AZURE_API_KEY") is None:
credential = DefaultAzureCredential()
else:
credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])
self.endpoint = endpoint self.endpoint = endpoint
self.api_version = api_version self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient( self.doc_intel_client = DocumentIntelligenceClient(
endpoint=self.endpoint, endpoint=self.endpoint,
api_version=self.api_version, api_version=self.api_version,
credential=DefaultAzureCredential(), credential=credential,
) )
def accepts( def accepts(
@@ -103,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower() mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower() extension = (stream_info.extension or "").lower()
if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS: if extension in _get_file_extensions(self._file_types):
return True return True
for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES: for prefix in _get_mime_type_prefixes(self._file_types):
if mimetype.startswith(prefix): if mimetype.startswith(prefix):
return True return True
@@ -121,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower() mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower() extension = (stream_info.extension or "").lower()
if extension in OFFICE_FILE_EXTENSIONS: # Types that don't support ocr
no_ocr_types = [
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
DocumentIntelligenceFileType.XLSX,
DocumentIntelligenceFileType.HTML,
]
if extension in _get_file_extensions(no_ocr_types):
return [] return []
for prefix in OFFICE_MIME_TYPE_PREFIXES: for prefix in _get_mime_type_prefixes(no_ocr_types):
if mimetype.startswith(prefix): if mimetype.startswith(prefix):
return [] return []