diff --git a/pyproject.toml b/pyproject.toml index 9c113ad..2a4e203 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,8 @@ dependencies = [ "pathvalidate", "charset-normalizer", "openai", + "azure-ai-documentintelligence", + "azure-identity" ] [project.urls] diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index b6cf963..69e8f0e 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -4,8 +4,8 @@ import argparse import sys from textwrap import dedent -from .__about__ import __version__ -from ._markitdown import MarkItDown, DocumentConverterResult +from __about__ import __version__ +from _markitdown import MarkItDown, DocumentConverterResult def main(): @@ -57,16 +57,37 @@ def main(): "--output", help="Output file name. If not provided, output is written to stdout.", ) + parser.add_argument( + "-d", + "--use-docintel", + action="store_true", + help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.", + ) + parser.add_argument( + "-e", + "--endpoint", + type=str, + help="Document Intelligence Endpoint. Required if using Document Intelligence.", + ) args = parser.parse_args() - if args.filename is None: - markitdown = MarkItDown() - result = markitdown.convert_stream(sys.stdin.buffer) - _handle_output(args, result) + if args.use_docintel: + if args.endpoint is None: + raise ValueError( + "Document Intelligence Endpoint is required when using Document Intelligence." 
+ ) + elif args.filename is None: + raise ValueError("Filename is required when using Document Intelligence.") + markitdown = MarkItDown(docintel_endpoint=args.endpoint) else: markitdown = MarkItDown() + + if args.filename is None: + result = markitdown.convert_stream(sys.stdin.buffer) + else: result = markitdown.convert(args.filename) - _handle_output(args, result) + + _handle_output(args, result) def _handle_output(args, result: DocumentConverterResult): diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 33806e1..ae6a7b4 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -33,6 +33,19 @@ import requests from bs4 import BeautifulSoup from charset_normalizer import from_path +# Azure imports +from azure.ai.documentintelligence import DocumentIntelligenceClient +from azure.ai.documentintelligence.models import ( + AnalyzeDocumentRequest, + AnalyzeResult, + DocumentAnalysisFeature, +) +from azure.identity import DefaultAzureCredential + +# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. +# This constant is a temporary fix until the bug is resolved. 
+CONTENT_FORMAT = "markdown" + # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False try: @@ -1318,6 +1331,74 @@ class ZipConverter(DocumentConverter): ) +class DocumentIntelligenceConverter(DocumentConverter): + """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" + + def __init__( + self, + endpoint: str, + api_version: str = "2024-07-31-preview", + ): + self.endpoint = endpoint + self.api_version = api_version + self.doc_intel_client = DocumentIntelligenceClient( + endpoint=self.endpoint, + api_version=self.api_version, + credential=DefaultAzureCredential(), + ) + + def convert( + self, local_path: str, **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + # Bail if extension is not supported by Document Intelligence + extension = kwargs.get("file_extension", "") + docintel_extensions = [ + ".pdf", + ".docx", + ".xlsx", + ".pptx", + ".html", + ".jpeg", + ".jpg", + ".png", + ".bmp", + ".tiff", + ".heif", + ] + if extension.lower() not in docintel_extensions: + return None + + # Get the bytestring for the local path + with open(local_path, "rb") as f: + file_bytes = f.read() + + # Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html) + if extension.lower() in [".xlsx", ".pptx", ".html"]: + analysis_features = [] + else: + analysis_features = [ + DocumentAnalysisFeature.FORMULAS, # enable formula extraction + DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR + DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction + ] + + # Extract the text using Azure Document Intelligence + poller = self.doc_intel_client.begin_analyze_document( + model_id="prebuilt-layout", + body=AnalyzeDocumentRequest(bytes_source=file_bytes), + features=analysis_features, + output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed + ) + result: AnalyzeResult = poller.result() + + # remove comments from 
the markdown content generated by Doc Intelligence and append to markdown string + markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL) + return DocumentConverterResult( + title=None, + text_content=markdown_text, + ) + + class FileConversionException(BaseException): pass @@ -1337,6 +1418,7 @@ class MarkItDown: llm_model: Optional[str] = None, style_map: Optional[str] = None, exiftool_path: Optional[str] = None, + docintel_endpoint: Optional[str] = None, # Deprecated mlm_client: Optional[Any] = None, mlm_model: Optional[str] = None, @@ -1406,6 +1488,12 @@ class MarkItDown: self.register_page_converter(ZipConverter()) self.register_page_converter(OutlookMsgConverter()) + # Register Document Intelligence converter at the top of the stack if endpoint is provided + if docintel_endpoint is not None: + self.register_page_converter( + DocumentIntelligenceConverter(endpoint=docintel_endpoint) + ) + def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any ) -> DocumentConverterResult: # TODO: deal with kwargs