From 97eeed5f325d15ab3f1bc65e6a4f146c4e7e0680 Mon Sep 17 00:00:00 2001 From: KennyZhang1 <90438893+KennyZhang1@users.noreply.github.com> Date: Tue, 11 Feb 2025 19:01:46 -0500 Subject: [PATCH] Doc Intelligence fixes for refactored code (#325) * added priority flag to doc intel converter constructor * fixed analysis features bug for docx --- .../src/markitdown/converters/_doc_intel_converter.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index 835345a..ed8aabf 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -1,4 +1,5 @@ from typing import Any, Union +import re # Azure imports from azure.ai.documentintelligence import DocumentIntelligenceClient @@ -36,6 +37,7 @@ class DocumentIntelligenceConverter(DocumentConverter): api_version=self.api_version, credential=DefaultAzureCredential(), ) + self._priority = priority def convert( self, local_path: str, **kwargs: Any @@ -62,8 +64,8 @@ class DocumentIntelligenceConverter(DocumentConverter): with open(local_path, "rb") as f: file_bytes = f.read() - # Certain document analysis features are not availiable for filetypes (.xlsx, .pptx, .html) - if extension.lower() in [".xlsx", ".pptx", ".html"]: + # Certain document analysis features are not available for office filetypes (.xlsx, .pptx, .html, .docx) + if extension.lower() in [".xlsx", ".pptx", ".html", ".docx"]: analysis_features = [] else: analysis_features = [