Bump version.

Unable to convert HTML to Markdown (#1072)
* feat: issue where inherited function from `markdownify.MarkdownConverter` doesn't have `current_tags` leading to error using `kwargs`, also set default value for `convert_as_inline`
2025-02-28 07:29:12 -08:00 · 2025-02-28 00:57:41 -08:00 · 2025-02-10 16:01:17 -08:00 · 2025-02-08 20:58:13 -08:00 · 2025-02-08 20:50:38 -08:00 · 2025-02-08 20:37:34 -08:00
28 changed files with 1395 additions and 120 deletions
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,32 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
+{
+	"name": "Existing Dockerfile",
+	"build": {
+		// Sets the run context to one level up instead of the .devcontainer folder.
+		"context": "..",
+		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
+		"dockerfile": "../Dockerfile",
+		"args": {
+			"INSTALL_GIT": "true"
+		}
+	},
+
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	// "features": {},
+	"features": {
+		"ghcr.io/devcontainers-extra/features/hatch:2": {}
+	},
+
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	// "forwardPorts": [],
+
+	// Uncomment the next line to run commands after the container is created.
+	// "postCreateCommand": "cat /etc/os-release",
+
+	// Configure tool-specific properties.
+	// "customizations": {},
+
+	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
+	"remoteUser": "root"
+}
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1 @@
+*
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+tests/test_files/** linguist-vendored
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -5,9 +5,9 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
        with:
          python-version: "3.x"

--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -5,8 +5,8 @@ jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: |
            3.10
@@ -14,7 +14,7 @@ jobs:
            3.12
      - name: Set up pip cache
        if: runner.os == 'Linux'
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+.vscode
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -160,3 +162,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+src/.DS_Store
+.DS_Store
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.13-slim-bullseye
+
+USER root
+
+ARG INSTALL_GIT=false
+RUN if [ "$INSTALL_GIT" = "true" ]; then \
+    apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Runtime dependency
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install markitdown
+
+# Default USERID and GROUPID
+ARG USERID=10000
+ARG GROUPID=10000
+
+USER $USERID:$GROUPID
+
+ENTRYPOINT [ "markitdown" ]
--- a/README.md
+++ b/README.md
@@ -1,28 +1,129 @@
 # MarkItDown

-The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)
+[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
+![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
+[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)

-It presently supports:

- PDF (.pdf)
- PowerPoint (.pptx)
- Word (.docx)
- Excel (.xlsx)
- Images (EXIF metadata, and OCR)
- Audio (EXIF metadata, and speech transcription)
- HTML (special handling of Wikipedia, etc.)
- Various other text-based formats (csv, json, xml, etc.)
+MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc.).
+It supports:
+- PDF
+- PowerPoint
+- Word
+- Excel
+- Images (EXIF metadata and OCR)
+- Audio (EXIF metadata and speech transcription)
+- HTML
+- Text-based formats (CSV, JSON, XML)
+- ZIP files (iterates over contents)

-The API is simple:
+To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
+
+## Usage
+
+### Command-Line
+
+```bash
+markitdown path-to-file.pdf > document.md
+```
+
+Or use `-o` to specify the output file:
+
+```bash
+markitdown path-to-file.pdf -o document.md
+```
+
+To use Document Intelligence conversion:
+
+```bash
+markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
+```
+
+You can also pipe content:
+
+```bash
+cat path-to-file.pdf | markitdown
+```
+
+More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).
+
+### Python API
+
+Basic usage in Python:

 ```python
 from markitdown import MarkItDown

-markitdown = MarkItDown()
-result = markitdown.convert("test.xlsx")
+md = MarkItDown()
+result = md.convert("test.xlsx")
 print(result.text_content)
 ```

+Document Intelligence conversion in Python:
+
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
+result = md.convert("test.pdf")
+print(result.text_content)
+```
+
+To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
+
+```python
+from markitdown import MarkItDown
+from openai import OpenAI
+
+client = OpenAI()
+md = MarkItDown(llm_client=client, llm_model="gpt-4o")
+result = md.convert("example.jpg")
+print(result.text_content)
+```
+
+### Docker
+
+```sh
+docker build -t markitdown:latest .
+docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
+```
+<details>
+    
+<summary>Batch Processing Multiple Files</summary>
+
+This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
+
+
+```python convert.py
+from markitdown import MarkItDown
+from openai import OpenAI
+import os
+client = OpenAI(api_key="your-api-key-here")
+md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
+supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
+files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
+for file in files_to_convert:
+    print(f"\nConverting {file}...")
+    try:
+        md_file = os.path.splitext(file)[0] + '.md'
+        result = md.convert(file)
+        with open(md_file, 'w') as f:
+            f.write(result.text_content)
+        
+        print(f"Successfully converted {file} to {md_file}")
+    except Exception as e:
+        print(f"Error converting {file}: {str(e)}")
+
+print("\nAll conversions completed!")
+```
+1. Save the script above as `convert.py` and place it in the same directory as your files.
+2. Install the required packages (e.g., `openai`).
+3. Run the script: `python convert.py`
+
+Note that original files will remain unchanged and new markdown files are created with the same base name.
+
+</details>
+   
 ## Contributing

 This project welcomes contributions and suggestions.  Most contributions require you to agree to a
@@ -37,6 +138,37 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
 For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
 contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

+### How to Contribute
+
+You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are, of course, just suggestions, and you are welcome to contribute in any way you like.
+
+
+<div align="center">
+
+|                       | All                                      | Especially Needs Help from Community                                                                 |
+|-----------------------|------------------------------------------|------------------------------------------------------------------------------------------|
+| **Issues**            | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
+| **PRs**               | [All PRs](https://github.com/microsoft/markitdown/pulls)     | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22)               |
+
+</div>
+
+### Running Tests and Checks
+
+- Install `hatch` in your environment and run tests:
+    ```sh
+    pip install hatch  # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
+    hatch shell
+    hatch test
+    ```
+
+  (Alternative) Use the Devcontainer which has all the dependencies installed:
+    ```sh
+    # Reopen the project in Devcontainer and run:
+    hatch test
+    ```
+
+- Run pre-commit checks before submitting a PR: `pre-commit run --all-files`
+
 ## Trademarks

 This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
 [project]
 name = "markitdown"
 dynamic = ["version"]
-description = ''
+description = 'Utility tool for converting various files to Markdown'
 readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"
@@ -16,11 +16,10 @@ authors = [
 classifiers = [
  "Development Status :: 4 - Beta",
  "Programming Language :: Python",
-  "Programming Language :: Python :: 3.8",
-  "Programming Language :: Python :: 3.9",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
 ]
@@ -33,12 +32,18 @@ dependencies = [
  "python-pptx",
  "pandas",
  "openpyxl",
+  "xlrd",
  "pdfminer.six",
  "puremagic",
  "pydub",
+  "olefile",
  "youtube-transcript-api",
  "SpeechRecognition",
  "pathvalidate",
+  "charset-normalizer",
+  "openai",
+  "azure-ai-documentintelligence",
+  "azure-identity"
 ]

 [project.urls]
@@ -77,3 +82,6 @@ exclude_lines = [
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
 ]
+
+[tool.hatch.build.targets.sdist]
+only-include = ["src/markitdown"]
--- a/src/markitdown/__about__.py
+++ b/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.1a1"
+__version__ = "0.0.1a5"
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -1,21 +1,20 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
+import argparse
 import sys
-from ._markitdown import MarkItDown
+import shutil
+from textwrap import dedent
+from .__about__ import __version__
+from ._markitdown import MarkItDown, DocumentConverterResult


 def main():
-    if len(sys.argv) == 1:
-        markitdown = MarkItDown()
-        result = markitdown.convert_stream(sys.stdin.buffer)
-        print(result.text_content)
-    elif len(sys.argv) == 2:
-        markitdown = MarkItDown()
-        result = markitdown.convert(sys.argv[1])
-        print(result.text_content)
-    else:
-        sys.stderr.write(
+    parser = argparse.ArgumentParser(
+        description="Convert various file formats to markdown.",
+        prog="markitdown",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        usage=dedent(
            """
            SYNTAX:

@@ -33,10 +32,80 @@ EXAMPLE:
                OR

                markitdown < example.pdf
-""".strip()
-            + "\n"
+                
+                OR to save to a file use
+    
+                markitdown example.pdf -o example.md
+                
+                OR
+                
+                markitdown example.pdf > example.md
+            """
+        ).strip(),
    )

+    parser.add_argument(
+        "-v",
+        "--version",
+        action="version",
+        version=f"%(prog)s {__version__}",
+        help="show the version number and exit",
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Output file name. If not provided, output is written to stdout.",
+    )
+
+    parser.add_argument(
+        "-d",
+        "--use-docintel",
+        action="store_true",
+        help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
+    )
+
+    parser.add_argument(
+        "-e",
+        "--endpoint",
+        type=str,
+        help="Document Intelligence Endpoint. Required if using Document Intelligence.",
+    )
+
+    parser.add_argument("filename", nargs="?")
+    args = parser.parse_args()
+
+    which_exiftool = shutil.which("exiftool")
+
+    if args.use_docintel:
+        if args.endpoint is None:
+            raise ValueError(
+                "Document Intelligence Endpoint is required when using Document Intelligence."
+            )
+        elif args.filename is None:
+            raise ValueError("Filename is required when using Document Intelligence.")
+        markitdown = MarkItDown(
+            exiftool_path=which_exiftool, docintel_endpoint=args.endpoint
+        )
+    else:
+        markitdown = MarkItDown(exiftool_path=which_exiftool)
+
+    if args.filename is None:
+        result = markitdown.convert_stream(sys.stdin.buffer)
+    else:
+        result = markitdown.convert(args.filename)
+
+    _handle_output(args, result)
+
+
+def _handle_output(args, result: DocumentConverterResult):
+    """Handle output to stdout or file"""
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(result.text_content)
+    else:
+        print(result.text_content)
+

 if __name__ == "__main__":
    main()
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
--- a/src/markitdown/py.typed
+++ b/src/markitdown/py.typed
--- a/tests/test_files/test.docx
+++ b/tests/test_files/test.docx
--- a/tests/test_files/test.jpg
+++ b/tests/test_files/test.jpg
--- a/tests/test_files/test.json
+++ b/tests/test_files/test.json
@@ -0,0 +1,10 @@
+{
+    "key1": "string_value",
+    "key2": 1234,
+    "key3": [
+        "list_value1",
+        "list_value2"
+    ],
+    "5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key",
+    "uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3"
+}
--- a/tests/test_files/test.pptx
+++ b/tests/test_files/test.pptx
--- a/tests/test_files/test.xls
+++ b/tests/test_files/test.xls
--- a/tests/test_files/test.xlsx
+++ b/tests/test_files/test.xlsx
--- a/tests/test_files/test_files.zip
+++ b/tests/test_files/test_files.zip
--- a/tests/test_files/test_llm.jpg
+++ b/tests/test_files/test_llm.jpg
--- a/tests/test_files/test_mskanji.csv
+++ b/tests/test_files/test_mskanji.csv
@@ -0,0 +1,4 @@
+<EFBFBD><EFBFBD><EFBFBD>O,<EFBFBD>N<EFBFBD><EFBFBD>,<EFBFBD>Z<EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Y,30,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+<EFBFBD>O<EFBFBD>؉p<EFBFBD>q,25,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>~,35,<EFBFBD><EFBFBD><EFBFBD>É<EFBFBD>
--- a/tests/test_files/test_notebook.ipynb
+++ b/tests/test_files/test_notebook.ipynb
@@ -0,0 +1,89 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "id": "0f61db80",
+            "metadata": {},
+            "source": [
+                "# Test Notebook"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 11,
+            "id": "3f2a5bbd",
+            "metadata": {},
+            "outputs": [
+                {
+                    "name": "stdout",
+                    "output_type": "stream",
+                    "text": [
+                        "markitdown\n"
+                    ]
+                }
+            ],
+            "source": [
+                "print('markitdown')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "id": "9b9c0468",
+            "metadata": {},
+            "source": [
+                "## Code Cell Below"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 10,
+            "id": "37d8088a",
+            "metadata": {},
+            "outputs": [
+                {
+                    "name": "stdout",
+                    "output_type": "stream",
+                    "text": [
+                        "42\n"
+                    ]
+                }
+            ],
+            "source": [
+                "# comment in code\n",
+                "print(42)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "id": "2e3177bd",
+            "metadata": {},
+            "source": [
+                "End\n",
+                "\n",
+                "---"
+            ]
+        }
+    ],
+    "metadata": {
+        "kernelspec": {
+            "display_name": "Python 3",
+            "language": "python",
+            "name": "python3"
+        },
+        "language_info": {
+            "codemirror_mode": {
+                "name": "ipython",
+                "version": 3
+            },
+            "file_extension": ".py",
+            "mimetype": "text/x-python",
+            "name": "python",
+            "nbconvert_exporter": "python",
+            "pygments_lexer": "ipython3",
+            "version": "3.12.8"
+        },
+        "title": "Test Notebook Title"
+    },
+    "nbformat": 4,
+    "nbformat_minor": 5
+}
--- a/tests/test_files/test_outlook_msg.msg
+++ b/tests/test_files/test_outlook_msg.msg
--- a/tests/test_files/test_rss.xml
+++ b/tests/test_files/test_rss.xml
--- a/tests/test_files/test_with_comment.docx
+++ b/tests/test_files/test_with_comment.docx
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -6,11 +6,23 @@ import shutil
 import pytest
 import requests

+from warnings import catch_warnings, resetwarnings
+
 from markitdown import MarkItDown

 skip_remote = (
    True if os.environ.get("GITHUB_ACTIONS") else False
 )  # Don't run these tests in CI
+
+
+# Don't run the llm tests without a key and the client library
+skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
+try:
+    import openai
+except ModuleNotFoundError:
+    skip_llm = True
+
+# Skip exiftool tests if not installed
 skip_exiftool = shutil.which("exiftool") is None

 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -42,6 +54,12 @@ XLSX_TEST_STRINGS = [
    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
 ]

+XLS_TEST_STRINGS = [
+    "## 09060124-b5e7-4717-9d07-3c046eb",
+    "6ff4173b-42a5-4784-9b19-f49caff4d93d",
+    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
+]
+
 DOCX_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
@@ -51,12 +69,34 @@ DOCX_TEST_STRINGS = [
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 ]

+MSG_TEST_STRINGS = [
+    "# Email Message",
+    "**From:** test.sender@example.com",
+    "**To:** test.recipient@example.com",
+    "**Subject:** Test Email Message",
+    "## Content",
+    "This is the body of the test email message",
+]
+
+DOCX_COMMENT_TEST_STRINGS = [
+    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+    "49e168b7-d2ae-407f-a055-2167576f39a1",
+    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+    "# Abstract",
+    "# Introduction",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "This is a test comment. 12df-321a",
+    "Yet another comment in the doc. 55yiyi-asd09",
+]
+
 PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
+    "2003",  # chart value
 ]

 BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
@@ -65,6 +105,13 @@ BLOG_TEST_STRINGS = [
    "an example where high cost can easily prevent a generic complex",
 ]

+
+RSS_TEST_STRINGS = [
+    "The Official Microsoft Blog",
+    "In the case of AI, it is absolutely true that the industry is moving incredibly fast",
+]
+
+
 WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
 WIKIPEDIA_TEST_STRINGS = [
    "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
@@ -87,6 +134,33 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
 ]

+CSV_CP932_TEST_STRINGS = [
+    "名前,年齢,住所",
+    "佐藤太郎,30,東京",
+    "三木英子,25,大阪",
+    "髙橋淳,35,名古屋",
+]
+
+LLM_TEST_STRINGS = [
+    "5bda1dd6",
+]
+
+JSON_TEST_STRINGS = [
+    "5b64c88c-b3c3-4510-bcb8-da0b200602d8",
+    "9700dc99-6685-40b4-9a3a-5e406dcb37f3",
+]
+
+
+# --- Helper Functions ---
+def validate_strings(result, expected_strings, exclude_strings=None):
+    """Validate presence or absence of specific strings."""
+    text_content = result.text_content.replace("\\", "")
+    for string in expected_strings:
+        assert string in text_content
+    if exclude_strings:
+        for string in exclude_strings:
+            assert string not in text_content
+

@pytest.mark.skipif(
    skip_remote,
@@ -120,67 +194,175 @@ def test_markitdown_local() -> None:

    # Test XLSX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
-    for test_string in XLSX_TEST_STRINGS:
+    validate_strings(result, XLSX_TEST_STRINGS)
+
+    # Test XLS processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls"))
+    for test_string in XLS_TEST_STRINGS:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

    # Test DOCX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
-    for test_string in DOCX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, DOCX_TEST_STRINGS)
+
+    # Test DOCX processing, with comments
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
+        style_map="comment-reference => ",
+    )
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
+
+    # Test DOCX processing, with comments and setting style_map on init
+    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
+    result = markitdown_with_style_map.convert(
+        os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
+    )
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test PPTX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
-    for test_string in PPTX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, PPTX_TEST_STRINGS)

    # Test HTML processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
    )
-    for test_string in BLOG_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, BLOG_TEST_STRINGS)
+
+    # Test ZIP file processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
+    validate_strings(result, XLSX_TEST_STRINGS)

    # Test Wikipedia processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
    )
    text_content = result.text_content.replace("\\", "")
-    for test_string in WIKIPEDIA_TEST_EXCLUDES:
-        assert test_string not in text_content
-    for test_string in WIKIPEDIA_TEST_STRINGS:
-        assert test_string in text_content
+    validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)

    # Test Bing processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
    )
    text_content = result.text_content.replace("\\", "")
-    for test_string in SERP_TEST_EXCLUDES:
-        assert test_string not in text_content
-    for test_string in SERP_TEST_STRINGS:
+    validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
+
+    # Test RSS processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
+    text_content = result.text_content.replace("\\", "")
+    for test_string in RSS_TEST_STRINGS:
        assert test_string in text_content

+    # Test non-UTF-8 encoding (CP932-encoded CSV)
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
+    validate_strings(result, CSV_CP932_TEST_STRINGS)
+
+    # Test MSG (Outlook email) processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
+    validate_strings(result, MSG_TEST_STRINGS)
+
+    # Test JSON processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
+    validate_strings(result, JSON_TEST_STRINGS)
+
+    # Test input with leading blank characters
+    input_data = b"   \n\n\n<html><body><h1>Test</h1></body></html>"
+    result = markitdown.convert_stream(io.BytesIO(input_data))
+    assert "# Test" in result.text_content
+

@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
 )
 def test_markitdown_exiftool() -> None:
+    # Test that automatic discovery of exiftool raises a deprecation
+    # warning and is disabled
+    try:
+        with catch_warnings(record=True) as w:
            markitdown = MarkItDown()
+            result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert result.text_content.strip() == ""
+    finally:
+        resetwarnings()

-    # Test JPG metadata processing
+    # Test explicitly setting the location of exiftool
+    which_exiftool = shutil.which("exiftool")
+    markitdown = MarkItDown(exiftool_path=which_exiftool)
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
+    for key in JPG_TEST_EXIFTOOL:
+        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
+        assert target in result.text_content
+
+    # Test setting the exiftool path through an environment variable
+    os.environ["EXIFTOOL_PATH"] = which_exiftool
+    markitdown = MarkItDown()
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
        assert target in result.text_content


+def test_markitdown_deprecation() -> None:
+    try:
+        with catch_warnings(record=True) as w:
+            test_client = object()
+            markitdown = MarkItDown(mlm_client=test_client)
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert markitdown._llm_client == test_client
+    finally:
+        resetwarnings()
+
+    try:
+        with catch_warnings(record=True) as w:
+            markitdown = MarkItDown(mlm_model="gpt-4o")
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert markitdown._llm_model == "gpt-4o"
+    finally:
+        resetwarnings()
+
+    try:
+        test_client = object()
+        markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
+        assert False
+    except ValueError:
+        pass
+
+    try:
+        markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
+        assert False
+    except ValueError:
+        pass
+
+
+@pytest.mark.skipif(
+    skip_llm,
+    reason="do not run llm tests without a key",
+)
+def test_markitdown_llm() -> None:
+    client = openai.OpenAI()
+    markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
+
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
+
+    for test_string in LLM_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # This is not super precise. It would also accept "red square", "blue circle",
+    # "the square is not blue", etc. But it's sufficient for this test.
+    for test_string in ["red", "circle", "blue", "square"]:
+        assert test_string in result.text_content.lower()
+
+
 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
-    test_markitdown_remote()
-    test_markitdown_local()
+    # test_markitdown_remote()
+    # test_markitdown_local()
    test_markitdown_exiftool()
+    # test_markitdown_deprecation()
+    # test_markitdown_llm()