15 Commits

Author SHA1 Message Date
Adam Fourney
e58bc486ee Added missing comma. 2025-03-07 16:18:47 -08:00
afourney
81ef601c09 Removed deprecation and other warnings. (#1105) 2025-03-07 16:17:03 -08:00
afourney
518b12c1fb Addresses #1068 (#1101) 2025-03-07 15:46:30 -08:00
Adam Fourney
8eaf5a1da9 Clean up README.md 2025-03-05 21:35:08 -08:00
afourney
38c924793c Bump version (#1095) 2025-03-05 21:30:56 -08:00
afourney
b9526d5e47 Bump version. (#1075) 2025-02-28 07:30:46 -08:00
Hieu Lam
519fe172aa Unable to convert HTML to Markdown (#1072)
* feat: fix issue where functions inherited from `markdownify.MarkdownConverter` don't handle `current_tags`, leading to errors when `kwargs` are passed; also set a default value for `convert_as_inline`
2025-02-28 00:57:41 -08:00
Adam Fourney
abe9752438 Bumped version 2025-02-10 16:01:17 -08:00
wunde005
73ba69d8cd For CSV files, mimetypes.guess_type returns "application/vnd.ms-excel" on Windows, causing an invalid MIME type in PlainTextConverter. In reference to issue: https://github.com/microsoft/markitdown/issues/150 (#273) 2025-02-08 20:58:13 -08:00
Werner Robitza
2a4f7bb6a8 fix: argparse CLI option ordering, fixes #268 (#290)
* fix: argparse CLI option ordering, fixes #268
* Fixed formatting.
2025-02-08 20:50:38 -08:00
masquare
7cf5e0bb23 feat(pptx): support image description with LLM for pptx files (#306) 2025-02-08 20:37:34 -08:00
James Hickey
3090917a49 Typo fixed (#270) 2025-02-08 20:30:13 -08:00
ZeyuTeng96
7bea2672a0 remove leading and trailing \n for HtmlConverter (#262) 2025-02-08 20:28:35 -08:00
KennyZhang1
bf6a15e9b5 Kennyzhang/docintel docs (#312)
* updated docs to include doc intelligence

* include reference to doc intel setup docs
2025-01-31 22:23:26 -08:00
KennyZhang1
bfde857420 Add support for conversion via Document Intelligence (#303)
* added cli params for doc intel

* added DocumentIntelligenceConverter class implementation

* initialized doc intel client instance field

* added isolated doc_intel main conversion function

* temp fix for ContentFormat import bug

* ran tests for docintel and offline for many filetypes

* push doc intel converter to the top of the stack

* formatting changes

* modified project toml file
2025-01-24 14:09:32 -08:00
6 changed files with 249 additions and 152 deletions

View File

@@ -33,12 +33,20 @@ Or use `-o` to specify the output file:
markitdown path-to-file.pdf -o document.md
```
To use Document Intelligence conversion:
```bash
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
```
You can also pipe content:
```bash
cat path-to-file.pdf | markitdown
```
More information about how to set up an Azure Document Intelligence resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).
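Since the converter authenticates with `DefaultAzureCredential` (see the converter changes further down in this diff), one way to supply credentials locally is the Azure CLI. A minimal sketch, not part of this change:
```bash
# Sign in once; DefaultAzureCredential picks up the Azure CLI credential.
az login
```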
### Python API
Basic usage in Python:
@@ -51,6 +59,16 @@ result = md.convert("test.xlsx")
print(result.text_content)
```
Document Intelligence conversion in Python:
```python
from markitdown import MarkItDown
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
result = md.convert("test.pdf")
print(result.text_content)
```
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python
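# The remainder of this example lies outside the diff hunk. As a hedged sketch
# (following the batch-processing example later in this README, so the client
# setup and model name are assumptions, not part of the diff):
from markitdown import MarkItDown
from openai import OpenAI

client = OpenAI(api_key="your-api-key-here")
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("example.jpg")
print(result.text_content)
```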
@@ -69,42 +87,6 @@ print(result.text_content)
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```
<details>
<summary>Batch Processing Multiple Files</summary>
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
```python convert.py
from markitdown import MarkItDown
from openai import OpenAI
import os
client = OpenAI(api_key="your-api-key-here")
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
for file in files_to_convert:
print(f"\nConverting {file}...")
try:
md_file = os.path.splitext(file)[0] + '.md'
result = md.convert(file)
with open(md_file, 'w') as f:
f.write(result.text_content)
print(f"Successfully converted {file} to {md_file}")
except Exception as e:
print(f"Error converting {file}: {str(e)}")
print("\nAll conversions completed!")
```
2. Place the script in the same directory as your files
3. Install the required packages (e.g., `openai`)
4. Run the script:
```bash
python convert.py
```
Note that the original files remain unchanged; new Markdown files are created with the same base name.
</details>
## Contributing

View File

@@ -42,6 +42,8 @@ dependencies = [
"pathvalidate",
"charset-normalizer",
"openai",
"azure-ai-documentintelligence",
"azure-identity"
]
[project.urls]
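With these entries in place, the Azure SDKs install alongside markitdown itself; pulling them in by hand would be equivalent to the following (a sketch for reference, not part of the diff):
```bash
pip install azure-ai-documentintelligence azure-identity
```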

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.0.1a3"
__version__ = "0.0.2"

View File

@@ -51,22 +51,46 @@ def main():
help="show the version number and exit",
)
parser.add_argument("filename", nargs="?")
parser.add_argument(
"-o",
"--output",
help="Output file name. If not provided, output is written to stdout.",
)
parser.add_argument(
"-d",
"--use-docintel",
action="store_true",
help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
)
parser.add_argument(
"-e",
"--endpoint",
type=str,
help="Document Intelligence Endpoint. Required if using Document Intelligence.",
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
if args.filename is None:
markitdown = MarkItDown()
result = markitdown.convert_stream(sys.stdin.buffer)
_handle_output(args, result)
if args.use_docintel:
if args.endpoint is None:
raise ValueError(
"Document Intelligence Endpoint is required when using Document Intelligence."
)
elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
else:
markitdown = MarkItDown()
if args.filename is None:
result = markitdown.convert_stream(sys.stdin.buffer)
else:
result = markitdown.convert(args.filename)
_handle_output(args, result)
_handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult):
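Putting the new CLI surface together, the flags compose roughly as follows (commands mirror the README hunk above; the endpoint value is a placeholder):
```bash
# Offline conversion, reading from stdin as before
cat path-to-file.pdf | markitdown

# Document Intelligence conversion: -d requires an endpoint (-e) and a filename
markitdown path-to-file.pdf -d -e "<document_intelligence_endpoint>" -o document.md
```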

View File

@@ -17,7 +17,7 @@ from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
from warnings import warn, filterwarnings
import mammoth
import markdownify
@@ -33,23 +33,32 @@ import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_path
# Azure imports
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest,
AnalyzeResult,
DocumentAnalysisFeature,
)
from azure.identity import DefaultAzureCredential
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown"
# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
filterwarnings("ignore", message=r".*Couldn\'t find ffmpeg or avconv.*", module="pydub")
try:
# Using warnings' catch_warnings to catch
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import pydub
import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
pass
finally:
resetwarnings()
# Optional YouTube transcription support
try:
@@ -75,7 +84,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
def convert_hn(
self,
n: int,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual, but be sure to start with a new line"""
if not convert_as_inline:
if not re.search(r"^\n", text):
@@ -83,7 +99,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
def convert_a(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
):
"""Same as usual converter, but removes Javascript links and escapes URIs."""
prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text:
@@ -119,7 +137,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
else text
)
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
def convert_img(
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
) -> str:
"""Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or ""
@@ -204,7 +224,7 @@ class HtmlConverter(DocumentConverter):
return result
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
"""Helper function that converts and HTML string."""
"""Helper function that converts an HTML string."""
# Parse the string
soup = BeautifulSoup(html_content, "html.parser")
@@ -223,6 +243,9 @@ class HtmlConverter(DocumentConverter):
assert isinstance(webpage_text, str)
# remove leading and trailing \n
webpage_text = webpage_text.strip()
return DocumentConverterResult(
title=None if soup.title is None else soup.title.string,
text_content=webpage_text,
@@ -771,6 +794,35 @@ class PptxConverter(HtmlConverter):
Converts PPTX files to Markdown. Supports headings, tables, and images with alt text.
"""
def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
):
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed alt text for this image with less than 50 words."
image_base64 = base64.b64encode(image_blob).decode("utf-8")
data_uri = f"data:{content_type};base64,{image_base64}"
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": data_uri,
},
},
{"type": "text", "text": prompt},
],
}
]
response = llm_client.chat.completions.create(
model=llm_model, messages=messages
)
return response.choices[0].message.content
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PPTX
extension = kwargs.get("file_extension", "")
@@ -791,17 +843,38 @@ class PptxConverter(HtmlConverter):
# Pictures
if self._is_picture(shape):
# https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
alt_text = ""
try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
except Exception:
pass
llm_description = None
alt_text = None
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
try:
llm_description = self._get_llm_description(
llm_client,
llm_model,
shape.image.blob,
shape.image.content_type,
)
except Exception:
# Unable to describe with LLM
pass
if not llm_description:
try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
"descr", ""
)
except Exception:
# Unable to get alt text
pass
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += (
"\n!["
+ (alt_text if alt_text else shape.name)
+ (llm_description or alt_text or shape.name)
+ "]("
+ filename
+ ")\n"
@@ -894,18 +967,6 @@ class MediaConverter(DocumentConverter):
def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
return None
else:
try:
@@ -1008,6 +1069,14 @@ class Mp3Converter(WavConverter):
handle, temp_path = tempfile.mkstemp(suffix=".wav")
os.close(handle)
try:
# Check if pydub defaulted to ffmpeg
if pydub.AudioSegment.converter == "ffmpeg" and not shutil.which(
"ffmpeg"
):
warn(
"pydub: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work",
RuntimeWarning,
)
sound = pydub.AudioSegment.from_mp3(local_path)
sound.export(temp_path, format="wav")
@@ -1318,6 +1387,74 @@ class ZipConverter(DocumentConverter):
)
class DocumentIntelligenceConverter(DocumentConverter):
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
def __init__(
self,
endpoint: str,
api_version: str = "2024-07-31-preview",
):
self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(
endpoint=self.endpoint,
api_version=self.api_version,
credential=DefaultAzureCredential(),
)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if extension is not supported by Document Intelligence
extension = kwargs.get("file_extension", "")
docintel_extensions = [
".pdf",
".docx",
".xlsx",
".pptx",
".html",
".jpeg",
".jpg",
".png",
".bmp",
".tiff",
".heif",
]
if extension.lower() not in docintel_extensions:
return None
# Get the bytestring for the local path
with open(local_path, "rb") as f:
file_bytes = f.read()
# Certain document analysis features are not available for some file types (.xlsx, .pptx, .html)
if extension.lower() in [".xlsx", ".pptx", ".html"]:
analysis_features = []
else:
analysis_features = [
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
]
# Extract the text using Azure Document Intelligence
poller = self.doc_intel_client.begin_analyze_document(
model_id="prebuilt-layout",
body=AnalyzeDocumentRequest(bytes_source=file_bytes),
features=analysis_features,
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
)
result: AnalyzeResult = poller.result()
# Remove comments from the Markdown content generated by Document Intelligence before returning it
markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
return DocumentConverterResult(
title=None,
text_content=markdown_text,
)
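For reference, a hedged sketch of exercising this converter directly (in practice it is registered and reached through `MarkItDown(docintel_endpoint=...)`; the endpoint below is a placeholder):
```python
converter = DocumentIntelligenceConverter(
    endpoint="https://<resource-name>.cognitiveservices.azure.com/"
)
# file_extension is the kwarg convert() checks before sending bytes to the service
result = converter.convert("test.pdf", file_extension=".pdf")
if result is not None:
    print(result.text_content)
```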
class FileConversionException(BaseException):
pass
@@ -1337,6 +1474,7 @@ class MarkItDown:
llm_model: Optional[str] = None,
style_map: Optional[str] = None,
exiftool_path: Optional[str] = None,
docintel_endpoint: Optional[str] = None,
# Deprecated
mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None,
@@ -1349,34 +1487,26 @@ class MarkItDown:
if exiftool_path is None:
exiftool_path = os.environ.get("EXIFTOOL_PATH")
# Handle deprecation notices
#############################
if mlm_client is not None:
if llm_client is None:
warn(
"'mlm_client' is deprecated, and was renamed 'llm_client'.",
DeprecationWarning,
)
llm_client = mlm_client
mlm_client = None
else:
raise ValueError(
"'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
)
if mlm_model is not None:
if llm_model is None:
warn(
"'mlm_model' is deprecated, and was renamed 'llm_model'.",
DeprecationWarning,
)
llm_model = mlm_model
mlm_model = None
else:
raise ValueError(
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
)
#############################
# Still none? Check well-known paths
if exiftool_path is None:
candidate = shutil.which("exiftool")
if candidate:
candidate = os.path.abspath(candidate)
if any(
d == os.path.dirname(candidate)
for d in [
"/usr/bin",
"/usr/local/bin",
"/opt",
"/opt/bin",
"/opt/local/bin",
"/opt/homebrew/bin",
"C:\\Windows\\System32",
"C:\\Program Files",
"C:\\Program Files (x86)",
]
):
exiftool_path = candidate
self._llm_client = llm_client
self._llm_model = llm_model
@@ -1406,6 +1536,12 @@ class MarkItDown:
self.register_page_converter(ZipConverter())
self.register_page_converter(OutlookMsgConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
if docintel_endpoint is not None:
self.register_page_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
)
def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs
@@ -1608,6 +1744,8 @@ class MarkItDown:
ext = ext.strip()
if ext == "":
return
if ext in extensions:
return
# if ext not in extensions:
extensions.append(ext)

View File

@@ -6,8 +6,6 @@ import shutil
import pytest
import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown
skip_remote = (
@@ -277,18 +275,6 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool)
@@ -306,40 +292,6 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content
def test_markitdown_deprecation() -> None:
try:
with catch_warnings(record=True) as w:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client)
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_client == test_client
finally:
resetwarnings()
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown(mlm_model="gpt-4o")
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_model == "gpt-4o"
finally:
resetwarnings()
try:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
assert False
except ValueError:
pass
try:
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
assert False
except ValueError:
pass
@pytest.mark.skipif(
skip_llm,
reason="do not run llm tests without a key",
@@ -364,5 +316,4 @@ if __name__ == "__main__":
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm()