Make it easier to use AzureKeyCredentials with Azure Doc Intelligence (#1151)

* Make it easier to use AzureKeyCredentials with Azure Doc Intelligence
* Fixed mypy type error.
* Added more fine-grained options over types.
* Pass doc intel options further up the stack.
This commit is contained in:
afourney
2025-03-26 10:44:11 -07:00
committed by GitHub
parent 9a951055f0
commit 9e067c42b6
3 changed files with 127 additions and 37 deletions

View File

@@ -10,7 +10,7 @@ import traceback
import io
from dataclasses import dataclass
from importlib.metadata import entry_points
from typing import Any, List, Optional, Union, BinaryIO
from typing import Any, List, Dict, Optional, Union, BinaryIO
from pathlib import Path
from urllib.parse import urlparse
from warnings import warn
@@ -198,8 +198,19 @@ class MarkItDown:
# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
if docintel_endpoint is not None:
docintel_args: Dict[str, Any] = {}
docintel_args["endpoint"] = docintel_endpoint
docintel_credential = kwargs.get("docintel_credential")
if docintel_credential is not None:
docintel_args["credential"] = docintel_credential
docintel_types = kwargs.get("docintel_file_types")
if docintel_types is not None:
docintel_args["file_types"] = docintel_types
self.register_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
DocumentIntelligenceConverter(**docintel_args),
)
self._builtins_enabled = True

View File

@@ -17,7 +17,10 @@ from ._image_converter import ImageConverter
from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter
from ._doc_intel_converter import (
DocumentIntelligenceConverter,
DocumentIntelligenceFileType,
)
from ._epub_converter import EpubConverter
__all__ = [
@@ -38,5 +41,6 @@ __all__ = [
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",
"DocumentIntelligenceFileType",
"EpubConverter",
]

View File

@@ -1,7 +1,9 @@
import sys
import re
import os
from typing import BinaryIO, Any, List
from enum import Enum
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
@@ -18,6 +20,7 @@ try:
AnalyzeResult,
DocumentAnalysisFeature,
)
from azure.core.credentials import AzureKeyCredential, TokenCredential
from azure.identity import DefaultAzureCredential
except ImportError:
# Preserve the error and stack trace for later
@@ -29,38 +32,74 @@ except ImportError:
CONTENT_FORMAT = "markdown"
OFFICE_MIME_TYPE_PREFIXES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.presentationml",
"application/xhtml",
"text/html",
]
class DocumentIntelligenceFileType(str, Enum):
"""Enum of file types supported by the Document Intelligence Converter."""
OTHER_MIME_TYPE_PREFIXES = [
"application/pdf",
"application/x-pdf",
"text/html",
"image/",
]
# No OCR
DOCX = "docx"
PPTX = "pptx"
XLSX = "xlsx"
HTML = "html"
# OCR
PDF = "pdf"
JPEG = "jpeg"
PNG = "png"
BMP = "bmp"
TIFF = "tiff"
OFFICE_FILE_EXTENSIONS = [
".docx",
".xlsx",
".pptx",
".html",
".htm",
]
OTHER_FILE_EXTENSIONS = [
".pdf",
".jpeg",
".jpg",
".png",
".bmp",
".tiff",
".heif",
]
def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
"""Get the MIME type prefixes for the given file types."""
prefixes: List[str] = []
for type_ in types:
if type_ == DocumentIntelligenceFileType.DOCX:
prefixes.append(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
elif type_ == DocumentIntelligenceFileType.PPTX:
prefixes.append(
"application/vnd.openxmlformats-officedocument.presentationml"
)
elif type_ == DocumentIntelligenceFileType.XLSX:
prefixes.append(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
elif type_ == DocumentIntelligenceFileType.PDF:
prefixes.append("application/pdf")
prefixes.append("application/x-pdf")
elif type_ == DocumentIntelligenceFileType.JPEG:
prefixes.append("image/jpeg")
elif type_ == DocumentIntelligenceFileType.PNG:
prefixes.append("image/png")
elif type_ == DocumentIntelligenceFileType.BMP:
prefixes.append("image/bmp")
elif type_ == DocumentIntelligenceFileType.TIFF:
prefixes.append("image/tiff")
return prefixes
def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
"""Get the file extensions for the given file types."""
extensions: List[str] = []
for type_ in types:
if type_ == DocumentIntelligenceFileType.DOCX:
extensions.append(".docx")
elif type_ == DocumentIntelligenceFileType.PPTX:
extensions.append(".pptx")
elif type_ == DocumentIntelligenceFileType.XLSX:
extensions.append(".xlsx")
elif type_ == DocumentIntelligenceFileType.PDF:
extensions.append(".pdf")
elif type_ == DocumentIntelligenceFileType.JPEG:
extensions.append(".jpg")
extensions.append(".jpeg")
elif type_ == DocumentIntelligenceFileType.PNG:
extensions.append(".png")
elif type_ == DocumentIntelligenceFileType.BMP:
extensions.append(".bmp")
elif type_ == DocumentIntelligenceFileType.TIFF:
extensions.append(".tiff")
return extensions
class DocumentIntelligenceConverter(DocumentConverter):
@@ -71,8 +110,30 @@ class DocumentIntelligenceConverter(DocumentConverter):
*,
endpoint: str,
api_version: str = "2024-07-31-preview",
credential: AzureKeyCredential | TokenCredential | None = None,
file_types: List[DocumentIntelligenceFileType] = [
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
DocumentIntelligenceFileType.XLSX,
DocumentIntelligenceFileType.PDF,
DocumentIntelligenceFileType.JPEG,
DocumentIntelligenceFileType.PNG,
DocumentIntelligenceFileType.BMP,
DocumentIntelligenceFileType.TIFF,
],
):
"""
Initialize the DocumentIntelligenceConverter.
Args:
endpoint (str): The endpoint for the Document Intelligence service.
api_version (str): The API version to use. Defaults to "2024-07-31-preview".
credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
"""
super().__init__()
self._file_types = file_types
# Raise an error if the dependencies are not available.
# This is different than other converters since this one isn't even instantiated
@@ -86,12 +147,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
_dependency_exc_info[2]
)
if credential is None:
if os.environ.get("AZURE_API_KEY") is None:
credential = DefaultAzureCredential()
else:
credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])
self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(
endpoint=self.endpoint,
api_version=self.api_version,
credential=DefaultAzureCredential(),
credential=credential,
)
def accepts(
@@ -103,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
if extension in _get_file_extensions(self._file_types):
return True
for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
for prefix in _get_mime_type_prefixes(self._file_types):
if mimetype.startswith(prefix):
return True
@@ -121,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in OFFICE_FILE_EXTENSIONS:
# Types that don't support ocr
no_ocr_types = [
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
DocumentIntelligenceFileType.XLSX,
DocumentIntelligenceFileType.HTML,
]
if extension in _get_file_extensions(no_ocr_types):
return []
for prefix in OFFICE_MIME_TYPE_PREFIXES:
for prefix in _get_mime_type_prefixes(no_ocr_types):
if mimetype.startswith(prefix):
return []