Add support for conversion via Document Intelligence (#303)

* added cli params for doc intel
* added DocumentIntelligenceConverter class implementation
* initialized doc intel client instance field
* added isolated doc_intel main conversion function
* temp fix for ContentFormat import bug
* ran tests for docintel and offline for many filetypes
* push doc intel converter to the top of the stack
* formatting changes
* modified project toml file
2025-01-24 17:09:32 -05:00
parent f58a864951
commit bfde857420
3 changed files with 118 additions and 7 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,8 @@ dependencies = [
  "pathvalidate",
  "charset-normalizer",
  "openai",
+  "azure-ai-documentintelligence",
+  "azure-identity"
 ]

 [project.urls]
--- a/src/markitdown/main.py
+++ b/src/markitdown/main.py
@@ -4,8 +4,8 @@
 import argparse
 import sys
 from textwrap import dedent
-from .__about__ import __version__
-from ._markitdown import MarkItDown, DocumentConverterResult
+from __about__ import __version__
+from _markitdown import MarkItDown, DocumentConverterResult


 def main():
@@ -57,16 +57,37 @@ def main():
        "--output",
        help="Output file name. If not provided, output is written to stdout.",
    )
+    parser.add_argument(
+        "-d",
+        "--use-docintel",
+        action="store_true",
+        help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
+    )
+    parser.add_argument(
+        "-e",
+        "--endpoint",
+        type=str,
+        help="Document Intelligence Endpoint. Required if using Document Intelligence.",
+    )
    args = parser.parse_args()

-    if args.filename is None:
-        markitdown = MarkItDown()
-        result = markitdown.convert_stream(sys.stdin.buffer)
-        _handle_output(args, result)
+    if args.use_docintel:
+        if args.endpoint is None:
+            raise ValueError(
+                "Document Intelligence Endpoint is required when using Document Intelligence."
+            )
+        elif args.filename is None:
+            raise ValueError("Filename is required when using Document Intelligence.")
+        markitdown = MarkItDown(docintel_endpoint=args.endpoint)
    else:
        markitdown = MarkItDown()
+
+    if args.filename is None:
+        result = markitdown.convert_stream(sys.stdin.buffer)
+    else:
        result = markitdown.convert(args.filename)
-        _handle_output(args, result)
+
+    _handle_output(args, result)


 def _handle_output(args, result: DocumentConverterResult):
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -33,6 +33,19 @@ import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_path

+# Azure imports
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import (
+    AnalyzeDocumentRequest,
+    AnalyzeResult,
+    DocumentAnalysisFeature,
+)
+from azure.identity import DefaultAzureCredential
+
+# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
+# This constant is a temporary fix until the bug is resolved.
+CONTENT_FORMAT = "markdown"
+
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
@@ -1318,6 +1331,74 @@ class ZipConverter(DocumentConverter):
            )


+class DocumentIntelligenceConverter(DocumentConverter):
+    """Convert documents to Markdown with the Azure Document Intelligence service.
+
+    File bytes are sent to the configured endpoint and analyzed with the
+    "prebuilt-layout" model; the client authenticates with DefaultAzureCredential.
+    """
+
+    # Extensions handed to Document Intelligence; convert() returns None for all others.
+    _SUPPORTED_EXTENSIONS = frozenset(
+        (".pdf", ".docx", ".xlsx", ".pptx", ".html")
+        + (".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif")
+    )
+    # Certain document analysis features are not available for these filetypes.
+    _NO_FEATURE_EXTENSIONS = frozenset((".xlsx", ".pptx", ".html"))
+
+    def __init__(
+        self,
+        endpoint: str,
+        api_version: str = "2024-07-31-preview",
+    ):
+        """Build the service client for `endpoint`, pinned to `api_version`."""
+        self.endpoint = endpoint
+        self.api_version = api_version
+        self.doc_intel_client = DocumentIntelligenceClient(
+            endpoint=self.endpoint,
+            api_version=self.api_version,
+            credential=DefaultAzureCredential(),
+        )
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Return the Markdown for `local_path`, or None for unsupported extensions.
+
+        The extension is taken from the `file_extension` keyword argument.
+        """
+        # Normalize once; `or ""` also tolerates an explicit file_extension=None.
+        extension = (kwargs.get("file_extension") or "").lower()
+        if extension not in self._SUPPORTED_EXTENSIONS:
+            return None
+
+        # Get the bytestring for the local path
+        with open(local_path, "rb") as f:
+            file_bytes = f.read()
+
+        if extension in self._NO_FEATURE_EXTENSIONS:
+            analysis_features = []
+        else:
+            analysis_features = [
+                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
+                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
+                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
+            ]
+
+        # Extract the text using Azure Document Intelligence
+        poller = self.doc_intel_client.begin_analyze_document(
+            model_id="prebuilt-layout",
+            body=AnalyzeDocumentRequest(bytes_source=file_bytes),
+            features=analysis_features,
+            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
+        )
+        result: AnalyzeResult = poller.result()
+
+        # Strip the HTML comments Document Intelligence embeds in its Markdown output
+        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
+        return DocumentConverterResult(title=None, text_content=markdown_text)
+
+
 class FileConversionException(BaseException):
    pass

@@ -1337,6 +1418,7 @@ class MarkItDown:
        llm_model: Optional[str] = None,
        style_map: Optional[str] = None,
        exiftool_path: Optional[str] = None,
+        docintel_endpoint: Optional[str] = None,
        # Deprecated
        mlm_client: Optional[Any] = None,
        mlm_model: Optional[str] = None,
@@ -1406,6 +1488,12 @@ class MarkItDown:
        self.register_page_converter(ZipConverter())
        self.register_page_converter(OutlookMsgConverter())

+        # Register Document Intelligence converter at the top of the stack if endpoint is provided
+        if docintel_endpoint is not None:
+            self.register_page_converter(
+                DocumentIntelligenceConverter(endpoint=docintel_endpoint)
+            )
+
    def convert(
        self, source: Union[str, requests.Response, Path], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs