Make it easier to use AzureKeyCredentials with Azure Doc Intelligence (#1151)

* Make it easier to use AzureKeyCredentials with Azure Doc Intelligence * Fixed mypy type error. * Added more fine-grained options over types. * Pass doc intel options further up the stack.
2025-03-26 10:44:11 -07:00
parent 9a951055f0
commit 9e067c42b6
3 changed files with 127 additions and 37 deletions
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -10,7 +10,7 @@ import traceback
 import io
 from dataclasses import dataclass
 from importlib.metadata import entry_points
-from typing import Any, List, Optional, Union, BinaryIO
+from typing import Any, List, Dict, Optional, Union, BinaryIO
 from pathlib import Path
 from urllib.parse import urlparse
 from warnings import warn
@@ -198,8 +198,19 @@ class MarkItDown:
            # Register Document Intelligence converter at the top of the stack if endpoint is provided
            docintel_endpoint = kwargs.get("docintel_endpoint")
            if docintel_endpoint is not None:
+                docintel_args: Dict[str, Any] = {}
+                docintel_args["endpoint"] = docintel_endpoint
+
+                docintel_credential = kwargs.get("docintel_credential")
+                if docintel_credential is not None:
+                    docintel_args["credential"] = docintel_credential
+
+                docintel_types = kwargs.get("docintel_file_types")
+                if docintel_types is not None:
+                    docintel_args["file_types"] = docintel_types
+
                self.register_converter(
-                    DocumentIntelligenceConverter(endpoint=docintel_endpoint)
+                    DocumentIntelligenceConverter(**docintel_args),
                )

            self._builtins_enabled = True
--- a/packages/markitdown/src/markitdown/converters/init.py
+++ b/packages/markitdown/src/markitdown/converters/init.py
@@ -17,7 +17,10 @@ from ._image_converter import ImageConverter
 from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
-from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._doc_intel_converter import (
+    DocumentIntelligenceConverter,
+    DocumentIntelligenceFileType,
+)
 from ._epub_converter import EpubConverter

 __all__ = [
@@ -38,5 +41,6 @@ __all__ = [
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
+    "DocumentIntelligenceFileType",
    "EpubConverter",
 ]
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,7 +1,9 @@
 import sys
 import re
+import os

 from typing import BinaryIO, Any, List
+from enum import Enum

 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
@@ -18,6 +20,7 @@ try:
        AnalyzeResult,
        DocumentAnalysisFeature,
    )
+    from azure.core.credentials import AzureKeyCredential, TokenCredential
    from azure.identity import DefaultAzureCredential
 except ImportError:
    # Preserve the error and stack trace for later
@@ -29,38 +32,74 @@ except ImportError:
 CONTENT_FORMAT = "markdown"


-OFFICE_MIME_TYPE_PREFIXES = [
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-    "application/vnd.openxmlformats-officedocument.presentationml",
-    "application/xhtml",
-    "text/html",
-]
+class DocumentIntelligenceFileType(str, Enum):
+    """Enum of file types supported by the Document Intelligence Converter."""

-OTHER_MIME_TYPE_PREFIXES = [
-    "application/pdf",
-    "application/x-pdf",
-    "text/html",
-    "image/",
-]
+    # No OCR
+    DOCX = "docx"
+    PPTX = "pptx"
+    XLSX = "xlsx"
+    HTML = "html"
+    # OCR
+    PDF = "pdf"
+    JPEG = "jpeg"
+    PNG = "png"
+    BMP = "bmp"
+    TIFF = "tiff"

-OFFICE_FILE_EXTENSIONS = [
-    ".docx",
-    ".xlsx",
-    ".pptx",
-    ".html",
-    ".htm",
-]

-OTHER_FILE_EXTENSIONS = [
-    ".pdf",
-    ".jpeg",
-    ".jpg",
-    ".png",
-    ".bmp",
-    ".tiff",
-    ".heif",
-]
+def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
+    """Get the MIME type prefixes for the given file types."""
+    prefixes: List[str] = []
+    for type_ in types:
+        if type_ == DocumentIntelligenceFileType.DOCX:
+            prefixes.append(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+            )
+        elif type_ == DocumentIntelligenceFileType.PPTX:
+            prefixes.append(
+                "application/vnd.openxmlformats-officedocument.presentationml"
+            )
+        elif type_ == DocumentIntelligenceFileType.XLSX:
+            prefixes.append(
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            )
+        elif type_ == DocumentIntelligenceFileType.PDF:
+            prefixes.append("application/pdf")
+            prefixes.append("application/x-pdf")
+        elif type_ == DocumentIntelligenceFileType.JPEG:
+            prefixes.append("image/jpeg")
+        elif type_ == DocumentIntelligenceFileType.PNG:
+            prefixes.append("image/png")
+        elif type_ == DocumentIntelligenceFileType.BMP:
+            prefixes.append("image/bmp")
+        elif type_ == DocumentIntelligenceFileType.TIFF:
+            prefixes.append("image/tiff")
+    return prefixes
+
+
+def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
+    """Get the file extensions for the given file types."""
+    extensions: List[str] = []
+    for type_ in types:
+        if type_ == DocumentIntelligenceFileType.DOCX:
+            extensions.append(".docx")
+        elif type_ == DocumentIntelligenceFileType.PPTX:
+            extensions.append(".pptx")
+        elif type_ == DocumentIntelligenceFileType.XLSX:
+            extensions.append(".xlsx")
+        elif type_ == DocumentIntelligenceFileType.PDF:
+            extensions.append(".pdf")
+        elif type_ == DocumentIntelligenceFileType.JPEG:
+            extensions.append(".jpg")
+            extensions.append(".jpeg")
+        elif type_ == DocumentIntelligenceFileType.PNG:
+            extensions.append(".png")
+        elif type_ == DocumentIntelligenceFileType.BMP:
+            extensions.append(".bmp")
+        elif type_ == DocumentIntelligenceFileType.TIFF:
+            extensions.append(".tiff")
+    return extensions


 class DocumentIntelligenceConverter(DocumentConverter):
@@ -71,8 +110,30 @@ class DocumentIntelligenceConverter(DocumentConverter):
        *,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
+        credential: AzureKeyCredential | TokenCredential | None = None,
+        file_types: List[DocumentIntelligenceFileType] = [
+            DocumentIntelligenceFileType.DOCX,
+            DocumentIntelligenceFileType.PPTX,
+            DocumentIntelligenceFileType.XLSX,
+            DocumentIntelligenceFileType.PDF,
+            DocumentIntelligenceFileType.JPEG,
+            DocumentIntelligenceFileType.PNG,
+            DocumentIntelligenceFileType.BMP,
+            DocumentIntelligenceFileType.TIFF,
+        ],
    ):
+        """
+        Initialize the DocumentIntelligenceConverter.
+
+        Args:
+            endpoint (str): The endpoint for the Document Intelligence service.
+            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
+            credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
+            file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
+        """
+
        super().__init__()
+        self._file_types = file_types

        # Raise an error if the dependencies are not available.
        # This is different than other converters since this one isn't even instantiated
@@ -86,12 +147,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
                _dependency_exc_info[2]
            )

+        if credential is None:
+            if os.environ.get("AZURE_API_KEY") is None:
+                credential = DefaultAzureCredential()
+            else:
+                credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])
+
        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
-            credential=DefaultAzureCredential(),
+            credential=credential,
        )

    def accepts(
@@ -103,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

-        if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
+        if extension in _get_file_extensions(self._file_types):
            return True

-        for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
+        for prefix in _get_mime_type_prefixes(self._file_types):
            if mimetype.startswith(prefix):
                return True

@@ -121,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

-        if extension in OFFICE_FILE_EXTENSIONS:
+        # Types that don't support ocr
+        no_ocr_types = [
+            DocumentIntelligenceFileType.DOCX,
+            DocumentIntelligenceFileType.PPTX,
+            DocumentIntelligenceFileType.XLSX,
+            DocumentIntelligenceFileType.HTML,
+        ]
+
+        if extension in _get_file_extensions(no_ocr_types):
            return []

-        for prefix in OFFICE_MIME_TYPE_PREFIXES:
+        for prefix in _get_mime_type_prefixes(no_ocr_types):
            if mimetype.startswith(prefix):
                return []