Make it easier to use AzureKeyCredentials with Azure Doc Intelligence (#1151)
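* Make it easier to use AzureKeyCredentials with Azure Doc Intelligence.
* Fixed mypy type error.
* Added more fine-grained control over which file types the Document Intelligence converter accepts.
* Pass the Document Intelligence options (credential, file types) further up the stack to the MarkItDown constructor.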
This commit is contained in:
@@ -10,7 +10,7 @@ import traceback
|
|||||||
import io
|
import io
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from importlib.metadata import entry_points
|
from importlib.metadata import entry_points
|
||||||
from typing import Any, List, Optional, Union, BinaryIO
|
from typing import Any, List, Dict, Optional, Union, BinaryIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
@@ -198,8 +198,19 @@ class MarkItDown:
|
|||||||
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
# Register Document Intelligence converter at the top of the stack if endpoint is provided
|
||||||
docintel_endpoint = kwargs.get("docintel_endpoint")
|
docintel_endpoint = kwargs.get("docintel_endpoint")
|
||||||
if docintel_endpoint is not None:
|
if docintel_endpoint is not None:
|
||||||
|
docintel_args: Dict[str, Any] = {}
|
||||||
|
docintel_args["endpoint"] = docintel_endpoint
|
||||||
|
|
||||||
|
docintel_credential = kwargs.get("docintel_credential")
|
||||||
|
if docintel_credential is not None:
|
||||||
|
docintel_args["credential"] = docintel_credential
|
||||||
|
|
||||||
|
docintel_types = kwargs.get("docintel_file_types")
|
||||||
|
if docintel_types is not None:
|
||||||
|
docintel_args["file_types"] = docintel_types
|
||||||
|
|
||||||
self.register_converter(
|
self.register_converter(
|
||||||
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
|
DocumentIntelligenceConverter(**docintel_args),
|
||||||
)
|
)
|
||||||
|
|
||||||
self._builtins_enabled = True
|
self._builtins_enabled = True
|
||||||
|
|||||||
@@ -17,7 +17,10 @@ from ._image_converter import ImageConverter
|
|||||||
from ._audio_converter import AudioConverter
|
from ._audio_converter import AudioConverter
|
||||||
from ._outlook_msg_converter import OutlookMsgConverter
|
from ._outlook_msg_converter import OutlookMsgConverter
|
||||||
from ._zip_converter import ZipConverter
|
from ._zip_converter import ZipConverter
|
||||||
from ._doc_intel_converter import DocumentIntelligenceConverter
|
from ._doc_intel_converter import (
|
||||||
|
DocumentIntelligenceConverter,
|
||||||
|
DocumentIntelligenceFileType,
|
||||||
|
)
|
||||||
from ._epub_converter import EpubConverter
|
from ._epub_converter import EpubConverter
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -38,5 +41,6 @@ __all__ = [
|
|||||||
"OutlookMsgConverter",
|
"OutlookMsgConverter",
|
||||||
"ZipConverter",
|
"ZipConverter",
|
||||||
"DocumentIntelligenceConverter",
|
"DocumentIntelligenceConverter",
|
||||||
|
"DocumentIntelligenceFileType",
|
||||||
"EpubConverter",
|
"EpubConverter",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
from typing import BinaryIO, Any, List
|
from typing import BinaryIO, Any, List
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
from ._html_converter import HtmlConverter
|
from ._html_converter import HtmlConverter
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
@@ -18,6 +20,7 @@ try:
|
|||||||
AnalyzeResult,
|
AnalyzeResult,
|
||||||
DocumentAnalysisFeature,
|
DocumentAnalysisFeature,
|
||||||
)
|
)
|
||||||
|
from azure.core.credentials import AzureKeyCredential, TokenCredential
|
||||||
from azure.identity import DefaultAzureCredential
|
from azure.identity import DefaultAzureCredential
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# Preserve the error and stack trace for later
|
# Preserve the error and stack trace for later
|
||||||
@@ -29,38 +32,74 @@ except ImportError:
|
|||||||
CONTENT_FORMAT = "markdown"
|
CONTENT_FORMAT = "markdown"
|
||||||
|
|
||||||
|
|
||||||
OFFICE_MIME_TYPE_PREFIXES = [
|
class DocumentIntelligenceFileType(str, Enum):
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
"""Enum of file types supported by the Document Intelligence Converter."""
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
||||||
"application/vnd.openxmlformats-officedocument.presentationml",
|
|
||||||
"application/xhtml",
|
|
||||||
"text/html",
|
|
||||||
]
|
|
||||||
|
|
||||||
OTHER_MIME_TYPE_PREFIXES = [
|
# No OCR
|
||||||
"application/pdf",
|
DOCX = "docx"
|
||||||
"application/x-pdf",
|
PPTX = "pptx"
|
||||||
"text/html",
|
XLSX = "xlsx"
|
||||||
"image/",
|
HTML = "html"
|
||||||
]
|
# OCR
|
||||||
|
PDF = "pdf"
|
||||||
|
JPEG = "jpeg"
|
||||||
|
PNG = "png"
|
||||||
|
BMP = "bmp"
|
||||||
|
TIFF = "tiff"
|
||||||
|
|
||||||
OFFICE_FILE_EXTENSIONS = [
|
|
||||||
".docx",
|
|
||||||
".xlsx",
|
|
||||||
".pptx",
|
|
||||||
".html",
|
|
||||||
".htm",
|
|
||||||
]
|
|
||||||
|
|
||||||
OTHER_FILE_EXTENSIONS = [
|
def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
|
||||||
".pdf",
|
"""Get the MIME type prefixes for the given file types."""
|
||||||
".jpeg",
|
prefixes: List[str] = []
|
||||||
".jpg",
|
for type_ in types:
|
||||||
".png",
|
if type_ == DocumentIntelligenceFileType.DOCX:
|
||||||
".bmp",
|
prefixes.append(
|
||||||
".tiff",
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
".heif",
|
)
|
||||||
]
|
elif type_ == DocumentIntelligenceFileType.PPTX:
|
||||||
|
prefixes.append(
|
||||||
|
"application/vnd.openxmlformats-officedocument.presentationml"
|
||||||
|
)
|
||||||
|
elif type_ == DocumentIntelligenceFileType.XLSX:
|
||||||
|
prefixes.append(
|
||||||
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
|
)
|
||||||
|
elif type_ == DocumentIntelligenceFileType.PDF:
|
||||||
|
prefixes.append("application/pdf")
|
||||||
|
prefixes.append("application/x-pdf")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.JPEG:
|
||||||
|
prefixes.append("image/jpeg")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.PNG:
|
||||||
|
prefixes.append("image/png")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.BMP:
|
||||||
|
prefixes.append("image/bmp")
|
||||||
|
elif type_ == DocumentIntelligenceFileType.TIFF:
|
||||||
|
prefixes.append("image/tiff")
|
||||||
|
return prefixes
|
||||||
|
|
||||||
|
|
||||||
|
def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Get the file extensions for the given file types.

    Args:
        types: The file types to translate. Extensions are emitted in the
            same order as the types appear here.

    Returns:
        A list of lower-case, dot-prefixed file extensions. A type with no
        known extensions contributes nothing to the result.
    """
    # Built at call time so the enum is only resolved when the helper runs.
    extensions_by_type = {
        DocumentIntelligenceFileType.DOCX: [".docx"],
        DocumentIntelligenceFileType.PPTX: [".pptx"],
        DocumentIntelligenceFileType.XLSX: [".xlsx"],
        # HTML is a member of the enum and is passed to this helper as one of
        # the no-OCR types, so it needs a mapping; cover both common suffixes.
        DocumentIntelligenceFileType.HTML: [".html", ".htm"],
        DocumentIntelligenceFileType.PDF: [".pdf"],
        DocumentIntelligenceFileType.JPEG: [".jpg", ".jpeg"],
        DocumentIntelligenceFileType.PNG: [".png"],
        DocumentIntelligenceFileType.BMP: [".bmp"],
        DocumentIntelligenceFileType.TIFF: [".tiff"],
    }

    extensions: List[str] = []
    for type_ in types:
        extensions.extend(extensions_by_type.get(type_, []))
    return extensions
|
||||||
|
|
||||||
|
|
||||||
class DocumentIntelligenceConverter(DocumentConverter):
|
class DocumentIntelligenceConverter(DocumentConverter):
|
||||||
@@ -71,8 +110,30 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
*,
|
*,
|
||||||
endpoint: str,
|
endpoint: str,
|
||||||
api_version: str = "2024-07-31-preview",
|
api_version: str = "2024-07-31-preview",
|
||||||
|
credential: AzureKeyCredential | TokenCredential | None = None,
|
||||||
|
file_types: List[DocumentIntelligenceFileType] = [
|
||||||
|
DocumentIntelligenceFileType.DOCX,
|
||||||
|
DocumentIntelligenceFileType.PPTX,
|
||||||
|
DocumentIntelligenceFileType.XLSX,
|
||||||
|
DocumentIntelligenceFileType.PDF,
|
||||||
|
DocumentIntelligenceFileType.JPEG,
|
||||||
|
DocumentIntelligenceFileType.PNG,
|
||||||
|
DocumentIntelligenceFileType.BMP,
|
||||||
|
DocumentIntelligenceFileType.TIFF,
|
||||||
|
],
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Initialize the DocumentIntelligenceConverter.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
endpoint (str): The endpoint for the Document Intelligence service.
|
||||||
|
api_version (str): The API version to use. Defaults to "2024-07-31-preview".
|
||||||
|
credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
|
||||||
|
file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
|
||||||
|
"""
|
||||||
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self._file_types = file_types
|
||||||
|
|
||||||
# Raise an error if the dependencies are not available.
|
# Raise an error if the dependencies are not available.
|
||||||
# This is different than other converters since this one isn't even instantiated
|
# This is different than other converters since this one isn't even instantiated
|
||||||
@@ -86,12 +147,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
_dependency_exc_info[2]
|
_dependency_exc_info[2]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if credential is None:
|
||||||
|
if os.environ.get("AZURE_API_KEY") is None:
|
||||||
|
credential = DefaultAzureCredential()
|
||||||
|
else:
|
||||||
|
credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])
|
||||||
|
|
||||||
self.endpoint = endpoint
|
self.endpoint = endpoint
|
||||||
self.api_version = api_version
|
self.api_version = api_version
|
||||||
self.doc_intel_client = DocumentIntelligenceClient(
|
self.doc_intel_client = DocumentIntelligenceClient(
|
||||||
endpoint=self.endpoint,
|
endpoint=self.endpoint,
|
||||||
api_version=self.api_version,
|
api_version=self.api_version,
|
||||||
credential=DefaultAzureCredential(),
|
credential=credential,
|
||||||
)
|
)
|
||||||
|
|
||||||
def accepts(
|
def accepts(
|
||||||
@@ -103,10 +170,10 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
mimetype = (stream_info.mimetype or "").lower()
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
extension = (stream_info.extension or "").lower()
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
if extension in OFFICE_FILE_EXTENSIONS + OTHER_FILE_EXTENSIONS:
|
if extension in _get_file_extensions(self._file_types):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
for prefix in OFFICE_MIME_TYPE_PREFIXES + OTHER_MIME_TYPE_PREFIXES:
|
for prefix in _get_mime_type_prefixes(self._file_types):
|
||||||
if mimetype.startswith(prefix):
|
if mimetype.startswith(prefix):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -121,10 +188,18 @@ class DocumentIntelligenceConverter(DocumentConverter):
|
|||||||
mimetype = (stream_info.mimetype or "").lower()
|
mimetype = (stream_info.mimetype or "").lower()
|
||||||
extension = (stream_info.extension or "").lower()
|
extension = (stream_info.extension or "").lower()
|
||||||
|
|
||||||
if extension in OFFICE_FILE_EXTENSIONS:
|
# Types that don't support ocr
|
||||||
|
no_ocr_types = [
|
||||||
|
DocumentIntelligenceFileType.DOCX,
|
||||||
|
DocumentIntelligenceFileType.PPTX,
|
||||||
|
DocumentIntelligenceFileType.XLSX,
|
||||||
|
DocumentIntelligenceFileType.HTML,
|
||||||
|
]
|
||||||
|
|
||||||
|
if extension in _get_file_extensions(no_ocr_types):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
for prefix in OFFICE_MIME_TYPE_PREFIXES:
|
for prefix in _get_mime_type_prefixes(no_ocr_types):
|
||||||
if mimetype.startswith(prefix):
|
if mimetype.startswith(prefix):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user