Make it easier to use AzureKeyCredentials with Azure Doc Intelligence (#1151)

* Make it easier to use AzureKeyCredentials with Azure Doc Intelligence * Fixed mypy type error. * Added more fine-grained options over types. * Pass doc intel options further up the stack.
2025-03-26 10:44:11 -07:00
parent 9a951055f0
commit 9e067c42b6
3 changed files with 127 additions and 37 deletions
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -10,7 +10,7 @@ import traceback
 import io
 from dataclasses import dataclass
 from importlib.metadata import entry_points
-from typing import Any, List, Optional, Union, BinaryIO
+from typing import Any, List, Dict, Optional, Union, BinaryIO
 from pathlib import Path
 from urllib.parse import urlparse
 from warnings import warn
@@ -198,8 +198,19 @@ class MarkItDown:
            # Register Document Intelligence converter at the top of the stack if endpoint is provided
            docintel_endpoint = kwargs.get("docintel_endpoint")
            if docintel_endpoint is not None:
                docintel_args: Dict[str, Any] = {}
                docintel_args["endpoint"] = docintel_endpoint
                docintel_credential = kwargs.get("docintel_credential")
                if docintel_credential is not None:
                    docintel_args["credential"] = docintel_credential
                docintel_types = kwargs.get("docintel_file_types")
                if docintel_types is not None:
                    docintel_args["file_types"] = docintel_types
                self.register_converter(
-                    DocumentIntelligenceConverter(endpoint=docintel_endpoint)
+                    DocumentIntelligenceConverter(**docintel_args),
                )
            self._builtins_enabled = True
--- a/packages/markitdown/src/markitdown/converters/init.py
+++ b/packages/markitdown/src/markitdown/converters/init.py
@@ -17,7 +17,10 @@ from ._image_converter import ImageConverter
 from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
-from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._doc_intel_converter import (
    DocumentIntelligenceConverter,
    DocumentIntelligenceFileType,
 )
 from ._epub_converter import EpubConverter
 __all__ = [
@@ -38,5 +41,6 @@ __all__ = [
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
    "DocumentIntelligenceFileType",
    "EpubConverter",
 ]
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,7 +1,9 @@
 import sys
 import re
 import os
 from typing import BinaryIO, Any, List
 from enum import Enum
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
@@ -18,6 +20,7 @@ try:
        AnalyzeResult,
        DocumentAnalysisFeature,
    )
    from azure.core.credentials import AzureKeyCredential, TokenCredential
    from azure.identity import DefaultAzureCredential
 except ImportError:
    # Preserve the error and stack trace for later
@@ -29,38 +32,74 @@ except ImportError:
 CONTENT_FORMAT = "markdown"
-OFFICE_MIME_TYPE_PREFIXES = [
+class DocumentIntelligenceFileType(str, Enum):
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    """Enum of file types supported by the Document Intelligence Converter."""
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument.presentationml",
    "application/xhtml",
    "text/html",
 ]
-OTHER_MIME_TYPE_PREFIXES = [
+    # No OCR
-    "application/pdf",
+    DOCX = "docx"
-    "application/x-pdf",
+    PPTX = "pptx"
-    "text/html",
+    XLSX = "xlsx"
-    "image/",
+    HTML = "html"
-]
+    # OCR
    PDF = "pdf"
    JPEG = "jpeg"
    PNG = "png"
    BMP = "bmp"
    TIFF = "tiff"
 OFFICE_FILE_EXTENSIONS = [
    ".docx",
    ".xlsx",
    ".pptx",
    ".html",
    ".htm",
 ]
-OTHER_FILE_EXTENSIONS = [
+def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
-    ".pdf",
+    """Get the MIME type prefixes for the given file types."""
-    ".jpeg",
+    prefixes: List[str] = []
-    ".jpg",
+    for type_ in types:
-    ".png",
+        if type_ == DocumentIntelligenceFileType.DOCX:
-    ".bmp",
+            prefixes.append(
-    ".tiff",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-    ".heif",
+            )
-]
+        elif type_ == DocumentIntelligenceFileType.PPTX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.presentationml"
            )
        elif type_ == DocumentIntelligenceFileType.XLSX:
            prefixes.append(
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        elif type_ == DocumentIntelligenceFileType.PDF:
            prefixes.append("application/pdf")
            prefixes.append("application/x-pdf")
        elif type_ == DocumentIntelligenceFileType.JPEG:
            prefixes.append("image/jpeg")
        elif type_ == DocumentIntelligenceFileType.PNG:
            prefixes.append("image/png")
        elif type_ == DocumentIntelligenceFileType.BMP:
            prefixes.append("image/bmp")
        elif type_ == DocumentIntelligenceFileType.TIFF:
            prefixes.append("image/tiff")
    return prefixes
 def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Get the file extensions for the given file types."""
    extensions: List[str] = []
    for type_ in types:
        if type_ == DocumentIntelligenceFileType.DOCX:
            extensions.append(".docx")
        elif type_ == DocumentIntelligenceFileType.PPTX:
            extensions.append(".pptx")
        elif type_ == DocumentIntelligenceFileType.XLSX:
            extensions.append(".xlsx")
        elif type_ == DocumentIntelligenceFileType.PDF:
            extensions.append(".pdf")
        elif type_ == DocumentIntelligenceFileType.JPEG:
            extensions.append(".jpg")
            extensions.append(".jpeg")
        elif type_ == DocumentIntelligenceFileType.PNG:
            extensions.append(".png")
        elif type_ == DocumentIntelligenceFileType.BMP:
            extensions.append(".bmp")
        elif type_ == DocumentIntelligenceFileType.TIFF:
            extensions.append(".tiff")
    return extensions
 class DocumentIntelligenceConverter(DocumentConverter):
@@ -71,8 +110,30 @@ class DocumentIntelligenceConverter(DocumentConverter):
        *,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
        credential: AzureKeyCredential | TokenCredential | None = None,
        file_types: List[DocumentIntelligenceFileType] = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.PDF,
            DocumentIntelligenceFileType.JPEG,
            DocumentIntelligenceFileType.PNG,
            DocumentIntelligenceFileType.BMP,
            DocumentIntelligenceFileType.TIFF,
        ],
    ):
        """
        Initialize the DocumentIntelligenceConverter.
        Args:
            endpoint (str): The endpoint for the Document Intelligence service.
            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
            credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
            file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
        """
        super().__init__()
        self._file_types = file_types
        # Raise an error if the dependencies are not available.
        # This is different than other converters since this one isn't even instantiated
@@ -86,12 +147,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
                _dependency_exc_info[2]
            )
        if credential is None:
            if os.environ.get("AZURE_API_KEY") is None:
                credential = DefaultAzureCredential()
            else:
                credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
-            credential=DefaultAzureCredential(),
+            credential=credential,
        )
    def accepts(
@@ -103,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
-        if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
+        if extension in _get_file_extensions(self._file_types):
            return True
-        for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
+        for prefix in _get_mime_type_prefixes(self._file_types):
            if mimetype.startswith(prefix):
                return True
@@ -121,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
-        if extension in OFFICE_FILE_EXTENSIONS:
+        # Types that don't support ocr
        no_ocr_types = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.HTML,
        ]
        if extension in _get_file_extensions(no_ocr_types):
            return []
-        for prefix in OFFICE_MIME_TYPE_PREFIXES:
+        for prefix in _get_mime_type_prefixes(no_ocr_types):
            if mimetype.startswith(prefix):
                return []