From 9e067c42b647eaf14e842e70e47540b36c0c4a08 Mon Sep 17 00:00:00 2001 From: afourney Date: Wed, 26 Mar 2025 10:44:11 -0700 Subject: [PATCH] Make it easier to use AzureKeyCredentials with Azure Doc Intelligence (#1151) * Make it easier to use AzureKeyCredentials with Azure Doc Intelligence * Fixed mypy type error. * Added more fine-grained options over types. * Pass doc intel options further up the stack. --- .../markitdown/src/markitdown/_markitdown.py | 15 +- .../src/markitdown/converters/__init__.py | 6 +- .../converters/_doc_intel_converter.py | 143 +++++++++++++----- 3 files changed, 127 insertions(+), 37 deletions(-) diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 8f58db4..54a0dc8 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -10,7 +10,7 @@ import traceback import io from dataclasses import dataclass from importlib.metadata import entry_points -from typing import Any, List, Optional, Union, BinaryIO +from typing import Any, List, Dict, Optional, Union, BinaryIO from pathlib import Path from urllib.parse import urlparse from warnings import warn @@ -198,8 +198,19 @@ class MarkItDown: # Register Document Intelligence converter at the top of the stack if endpoint is provided docintel_endpoint = kwargs.get("docintel_endpoint") if docintel_endpoint is not None: + docintel_args: Dict[str, Any] = {} + docintel_args["endpoint"] = docintel_endpoint + + docintel_credential = kwargs.get("docintel_credential") + if docintel_credential is not None: + docintel_args["credential"] = docintel_credential + + docintel_types = kwargs.get("docintel_file_types") + if docintel_types is not None: + docintel_args["file_types"] = docintel_types + self.register_converter( - DocumentIntelligenceConverter(endpoint=docintel_endpoint) + DocumentIntelligenceConverter(**docintel_args), ) self._builtins_enabled = True diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index 09e3cb1..c68d0c3 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -17,7 +17,10 @@ from ._image_converter import ImageConverter from ._audio_converter import AudioConverter from ._outlook_msg_converter import OutlookMsgConverter from ._zip_converter import ZipConverter -from ._doc_intel_converter import DocumentIntelligenceConverter +from ._doc_intel_converter import ( + DocumentIntelligenceConverter, + DocumentIntelligenceFileType, +) from ._epub_converter import EpubConverter __all__ = [ @@ -38,5 +41,6 @@ __all__ = [ "OutlookMsgConverter", "ZipConverter", "DocumentIntelligenceConverter", + "DocumentIntelligenceFileType", "EpubConverter", ] diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 2f116d0..5f4069b 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -1,7 +1,9 @@ import sys import re +import os from typing import BinaryIO, Any, List +from enum import Enum from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult @@ -18,6 +20,7 @@ try: AnalyzeResult, DocumentAnalysisFeature, ) + from azure.core.credentials import AzureKeyCredential, TokenCredential from azure.identity import DefaultAzureCredential except ImportError: # Preserve the error and stack trace for later @@ -29,38 +32,74 @@ except ImportError: CONTENT_FORMAT = "markdown" -OFFICE_MIME_TYPE_PREFIXES = [ - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "application/vnd.openxmlformats-officedocument.presentationml", - "application/xhtml", - "text/html", -] +class DocumentIntelligenceFileType(str, Enum): + """Enum of file types supported by the Document Intelligence Converter.""" -OTHER_MIME_TYPE_PREFIXES = [ - "application/pdf", - "application/x-pdf", - "text/html", - "image/", -] + # No OCR + DOCX = "docx" + PPTX = "pptx" + XLSX = "xlsx" + HTML = "html" + # OCR + PDF = "pdf" + JPEG = "jpeg" + PNG = "png" + BMP = "bmp" + TIFF = "tiff" -OFFICE_FILE_EXTENSIONS = [ - ".docx", - ".xlsx", - ".pptx", - ".html", - ".htm", -] -OTHER_FILE_EXTENSIONS = [ - ".pdf", - ".jpeg", - ".jpg", - ".png", - ".bmp", - ".tiff", - ".heif", -] +def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]: + """Get the MIME type prefixes for the given file types.""" + prefixes: List[str] = [] + for type_ in types: + if type_ == DocumentIntelligenceFileType.DOCX: + prefixes.append( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + elif type_ == DocumentIntelligenceFileType.PPTX: + prefixes.append( + "application/vnd.openxmlformats-officedocument.presentationml" + ) + elif type_ == DocumentIntelligenceFileType.XLSX: + prefixes.append( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + elif type_ == DocumentIntelligenceFileType.PDF: + prefixes.append("application/pdf") + prefixes.append("application/x-pdf") + elif type_ == DocumentIntelligenceFileType.JPEG: + prefixes.append("image/jpeg") + elif type_ == DocumentIntelligenceFileType.PNG: + prefixes.append("image/png") + elif type_ == DocumentIntelligenceFileType.BMP: + prefixes.append("image/bmp") + elif type_ == DocumentIntelligenceFileType.TIFF: + prefixes.append("image/tiff") + return prefixes + + +def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]: + """Get the file extensions for the given file types.""" + extensions: List[str] = [] + for type_ in types: + if type_ == DocumentIntelligenceFileType.DOCX: + extensions.append(".docx") + elif type_ == DocumentIntelligenceFileType.PPTX: + extensions.append(".pptx") + elif type_ == DocumentIntelligenceFileType.XLSX: + extensions.append(".xlsx") + elif type_ == DocumentIntelligenceFileType.PDF: + extensions.append(".pdf") + elif type_ == DocumentIntelligenceFileType.JPEG: + extensions.append(".jpg") + extensions.append(".jpeg") + elif type_ == DocumentIntelligenceFileType.PNG: + extensions.append(".png") + elif type_ == DocumentIntelligenceFileType.BMP: + extensions.append(".bmp") + elif type_ == DocumentIntelligenceFileType.TIFF: + extensions.append(".tiff") + return extensions class DocumentIntelligenceConverter(DocumentConverter): @@ -71,8 +110,30 @@ class DocumentIntelligenceConverter(DocumentConverter): *, endpoint: str, api_version: str = "2024-07-31-preview", + credential: AzureKeyCredential | TokenCredential | None = None, + file_types: List[DocumentIntelligenceFileType] = [ + DocumentIntelligenceFileType.DOCX, + DocumentIntelligenceFileType.PPTX, + DocumentIntelligenceFileType.XLSX, + DocumentIntelligenceFileType.PDF, + DocumentIntelligenceFileType.JPEG, + DocumentIntelligenceFileType.PNG, + DocumentIntelligenceFileType.BMP, + DocumentIntelligenceFileType.TIFF, + ], ): + """ + Initialize the DocumentIntelligenceConverter. + + Args: + endpoint (str): The endpoint for the Document Intelligence service. + api_version (str): The API version to use. Defaults to "2024-07-31-preview". + credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication. + file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types. + """ + super().__init__() + self._file_types = file_types # Raise an error if the dependencies are not available. # This is different than other converters since this one isn't even instantiated @@ -86,12 +147,18 @@ class DocumentIntelligenceConverter(DocumentConverter): _dependency_exc_info[2] ) + if credential is None: + if os.environ.get("AZURE_API_KEY") is None: + credential = DefaultAzureCredential() + else: + credential = AzureKeyCredential(os.environ["AZURE_API_KEY"]) + self.endpoint = endpoint self.api_version = api_version self.doc_intel_client = DocumentIntelligenceClient( endpoint=self.endpoint, api_version=self.api_version, - credential=DefaultAzureCredential(), + credential=credential, ) def accepts( @@ -103,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter): mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() - if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS: + if extension in _get_file_extensions(self._file_types): return True - for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES: + for prefix in _get_mime_type_prefixes(self._file_types): if mimetype.startswith(prefix): return True @@ -121,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter): mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() - if extension in OFFICE_FILE_EXTENSIONS: + # Types that don't support ocr + no_ocr_types = [ + DocumentIntelligenceFileType.DOCX, + DocumentIntelligenceFileType.PPTX, + DocumentIntelligenceFileType.XLSX, + DocumentIntelligenceFileType.HTML, + ] + + if extension in _get_file_extensions(no_ocr_types): return [] - for prefix in OFFICE_MIME_TYPE_PREFIXES: + for prefix in _get_mime_type_prefixes(no_ocr_types): if mimetype.startswith(prefix): return []