From 9e067c42b647eaf14e842e70e47540b36c0c4a08 Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Wed, 26 Mar 2025 10:44:11 -0700
Subject: [PATCH] Make it easier to use AzureKeyCredentials with Azure Doc
 Intelligence (#1151)

* Make it easier to use AzureKeyCredential with Azure Document Intelligence.
* Fix a mypy type error.
* Add finer-grained control over which file types are handled.
* Pass the Document Intelligence options further up the stack (MarkItDown kwargs).
---
 .../markitdown/src/markitdown/_markitdown.py  |  15 +-
 .../src/markitdown/converters/__init__.py     |   6 +-
 .../converters/_doc_intel_converter.py        | 143 +++++++++++++-----
 3 files changed, 127 insertions(+), 37 deletions(-)

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index 8f58db4..54a0dc8 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -10,7 +10,7 @@ import traceback
 import io
 from dataclasses import dataclass
 from importlib.metadata import entry_points
-from typing import Any, List, Optional, Union, BinaryIO
+from typing import Any, List, Dict, Optional, Union, BinaryIO
 from pathlib import Path
 from urllib.parse import urlparse
 from warnings import warn
@@ -198,8 +198,19 @@ class MarkItDown:
             # Register Document Intelligence converter at the top of the stack if endpoint is provided
             docintel_endpoint = kwargs.get("docintel_endpoint")
             if docintel_endpoint is not None:
+                docintel_args: Dict[str, Any] = {}
+                docintel_args["endpoint"] = docintel_endpoint
+
+                docintel_credential = kwargs.get("docintel_credential")
+                if docintel_credential is not None:
+                    docintel_args["credential"] = docintel_credential
+
+                docintel_types = kwargs.get("docintel_file_types")
+                if docintel_types is not None:
+                    docintel_args["file_types"] = docintel_types
+
                 self.register_converter(
-                    DocumentIntelligenceConverter(endpoint=docintel_endpoint)
+                    DocumentIntelligenceConverter(**docintel_args),
                 )
 
             self._builtins_enabled = True
diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py
index 09e3cb1..c68d0c3 100644
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -17,7 +17,10 @@ from ._image_converter import ImageConverter
 from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
-from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._doc_intel_converter import (
+    DocumentIntelligenceConverter,
+    DocumentIntelligenceFileType,
+)
 from ._epub_converter import EpubConverter
 
 __all__ = [
@@ -38,5 +41,6 @@ __all__ = [
     "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
+    "DocumentIntelligenceFileType",
     "EpubConverter",
 ]
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
index 2f116d0..5f4069b 100644
--- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -1,7 +1,9 @@
 import sys
 import re
+import os
 
 from typing import BinaryIO, Any, List
+from enum import Enum
 
 from ._html_converter import HtmlConverter
 from .._base_converter import DocumentConverter, DocumentConverterResult
@@ -18,6 +20,7 @@ try:
         AnalyzeResult,
         DocumentAnalysisFeature,
     )
+    from azure.core.credentials import AzureKeyCredential, TokenCredential
     from azure.identity import DefaultAzureCredential
 except ImportError:
     # Preserve the error and stack trace for later
@@ -29,38 +32,74 @@ except ImportError:
 CONTENT_FORMAT = "markdown"
 
 
-OFFICE_MIME_TYPE_PREFIXES = [
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-    "application/vnd.openxmlformats-officedocument.presentationml",
-    "application/xhtml",
-    "text/html",
-]
+class DocumentIntelligenceFileType(str, Enum):
+    """Enum of file types supported by the Document Intelligence Converter."""
 
-OTHER_MIME_TYPE_PREFIXES = [
-    "application/pdf",
-    "application/x-pdf",
-    "text/html",
-    "image/",
-]
+    # No OCR
+    DOCX = "docx"
+    PPTX = "pptx"
+    XLSX = "xlsx"
+    HTML = "html"
+    # OCR
+    PDF = "pdf"
+    JPEG = "jpeg"
+    PNG = "png"
+    BMP = "bmp"
+    TIFF = "tiff"
 
-OFFICE_FILE_EXTENSIONS = [
-    ".docx",
-    ".xlsx",
-    ".pptx",
-    ".html",
-    ".htm",
-]
 
-OTHER_FILE_EXTENSIONS = [
-    ".pdf",
-    ".jpeg",
-    ".jpg",
-    ".png",
-    ".bmp",
-    ".tiff",
-    ".heif",
-]
+def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
+    """Get the MIME type prefixes for the given file types (HTML has no mapping here and yields none)."""
+    prefixes: List[str] = []
+    for type_ in types:
+        if type_ == DocumentIntelligenceFileType.DOCX:
+            prefixes.append(
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+            )
+        elif type_ == DocumentIntelligenceFileType.PPTX:
+            prefixes.append(
+                "application/vnd.openxmlformats-officedocument.presentationml"
+            )
+        elif type_ == DocumentIntelligenceFileType.XLSX:
+            prefixes.append(
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            )
+        elif type_ == DocumentIntelligenceFileType.PDF:
+            prefixes.append("application/pdf")
+            prefixes.append("application/x-pdf")
+        elif type_ == DocumentIntelligenceFileType.JPEG:
+            prefixes.append("image/jpeg")
+        elif type_ == DocumentIntelligenceFileType.PNG:
+            prefixes.append("image/png")
+        elif type_ == DocumentIntelligenceFileType.BMP:
+            prefixes.append("image/bmp")
+        elif type_ == DocumentIntelligenceFileType.TIFF:
+            prefixes.append("image/tiff")
+    return prefixes
+
+
+def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
+    """Get the file extensions for the given file types (HTML has no mapping here and yields none)."""
+    extensions: List[str] = []
+    for type_ in types:
+        if type_ == DocumentIntelligenceFileType.DOCX:
+            extensions.append(".docx")
+        elif type_ == DocumentIntelligenceFileType.PPTX:
+            extensions.append(".pptx")
+        elif type_ == DocumentIntelligenceFileType.XLSX:
+            extensions.append(".xlsx")
+        elif type_ == DocumentIntelligenceFileType.PDF:
+            extensions.append(".pdf")
+        elif type_ == DocumentIntelligenceFileType.JPEG:
+            extensions.append(".jpg")
+            extensions.append(".jpeg")
+        elif type_ == DocumentIntelligenceFileType.PNG:
+            extensions.append(".png")
+        elif type_ == DocumentIntelligenceFileType.BMP:
+            extensions.append(".bmp")
+        elif type_ == DocumentIntelligenceFileType.TIFF:
+            extensions.append(".tiff")
+    return extensions
 
 
 class DocumentIntelligenceConverter(DocumentConverter):
@@ -71,8 +110,30 @@ class DocumentIntelligenceConverter(DocumentConverter):
         *,
         endpoint: str,
         api_version: str = "2024-07-31-preview",
+        credential: AzureKeyCredential | TokenCredential | None = None,
+        file_types: List[DocumentIntelligenceFileType] = [
+            DocumentIntelligenceFileType.DOCX,
+            DocumentIntelligenceFileType.PPTX,
+            DocumentIntelligenceFileType.XLSX,
+            DocumentIntelligenceFileType.PDF,
+            DocumentIntelligenceFileType.JPEG,
+            DocumentIntelligenceFileType.PNG,
+            DocumentIntelligenceFileType.BMP,
+            DocumentIntelligenceFileType.TIFF,
+        ],
     ):
+        """
+        Initialize the DocumentIntelligenceConverter.
+
+        Args:
+            endpoint (str): The endpoint for the Document Intelligence service.
+            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
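+            credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication. If None, an AzureKeyCredential is built from the AZURE_API_KEY environment variable when it is set; otherwise DefaultAzureCredential is used.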
+            file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to DOCX, PPTX, XLSX, PDF, JPEG, PNG, BMP, and TIFF (HTML is not part of the default).
+        """
+
         super().__init__()
+        self._file_types = file_types
 
         # Raise an error if the dependencies are not available.
         # This is different than other converters since this one isn't even instantiated
@@ -86,12 +147,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
                 _dependency_exc_info[2]
             )
 
+        if credential is None:
+            if os.environ.get("AZURE_API_KEY") is None:
+                credential = DefaultAzureCredential()
+            else:
+                credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])
+
         self.endpoint = endpoint
         self.api_version = api_version
         self.doc_intel_client = DocumentIntelligenceClient(
             endpoint=self.endpoint,
             api_version=self.api_version,
-            credential=DefaultAzureCredential(),
+            credential=credential,
         )
 
     def accepts(
@@ -103,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
 
-        if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
+        if extension in _get_file_extensions(self._file_types):
             return True
 
-        for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
+        for prefix in _get_mime_type_prefixes(self._file_types):
             if mimetype.startswith(prefix):
                 return True
 
@@ -121,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
 
-        if extension in OFFICE_FILE_EXTENSIONS:
+        # Types that don't support OCR (an empty list is returned for these)
+        no_ocr_types = [
+            DocumentIntelligenceFileType.DOCX,
+            DocumentIntelligenceFileType.PPTX,
+            DocumentIntelligenceFileType.XLSX,
+            DocumentIntelligenceFileType.HTML,
+        ]
+
+        if extension in _get_file_extensions(no_ocr_types):
             return []
 
-        for prefix in OFFICE_MIME_TYPE_PREFIXES:
+        for prefix in _get_mime_type_prefixes(no_ocr_types):
             if mimetype.startswith(prefix):
                 return []