small formatting change

Set exiftool path explicitly. (#267 )
Removed the holiday away message from README.md (#266 )
2025-01-14 18:04:14 -05:00 · 2025-01-06 12:43:47 -08:00 · 2025-01-06 09:06:21 -08:00 · 2025-01-03 16:40:43 -08:00 · 2025-01-03 16:03:11 -08:00 · 2025-01-03 14:34:33 -08:00
28 changed files with 1180 additions and 112 deletions
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,32 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
+{
+	"name": "Existing Dockerfile",
+	"build": {
+		// Sets the run context to one level up instead of the .devcontainer folder.
+		"context": "..",
+		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
+		"dockerfile": "../Dockerfile",
+		"args": {
+			"INSTALL_GIT": "true"
+		}
+	},
+
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	// "features": {},
+	"features": {
+		"ghcr.io/devcontainers-extra/features/hatch:2": {}
+	},
+
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	// "forwardPorts": [],
+
+	// Uncomment the next line to run commands after the container is created.
+	// "postCreateCommand": "cat /etc/os-release",
+
+	// Configure tool-specific properties.
+	// "customizations": {},
+
+	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
+	"remoteUser": "root"
+}
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1 @@
+*
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+tests/test_files/** linguist-vendored
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -5,9 +5,9 @@ jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Set up Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
        with:
          python-version: "3.x"

--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -5,8 +5,8 @@ jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: |
            3.10
@@ -14,7 +14,7 @@ jobs:
            3.12
      - name: Set up pip cache
        if: runner.os == 'Linux'
-        uses: actions/cache@v3
+        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+.vscode
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -160,3 +162,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+src/.DS_Store
+.DS_Store
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+FROM python:3.13-slim-bullseye
+
+USER root
+
+ARG INSTALL_GIT=false
+RUN if [ "$INSTALL_GIT" = "true" ]; then \
+    apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Runtime dependency
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install markitdown
+
+# Default USERID and GROUPID
+ARG USERID=10000
+ARG GROUPID=10000
+USER $USERID:$GROUPID
+
+ENTRYPOINT [ "markitdown" ]
--- a/README.md
+++ b/README.md
@@ -1,28 +1,111 @@
 # MarkItDown

-The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)
+[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
+![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
+[![Built by AutoGen Team](https://img.shields.io/badge/Built%20by-AutoGen%20Team-blue)](https://github.com/microsoft/autogen)

-It presently supports:

- PDF (.pdf)
- PowerPoint (.pptx)
- Word (.docx)
- Excel (.xlsx)
- Images (EXIF metadata, and OCR)
- Audio (EXIF metadata, and speech transcription)
- HTML (special handling of Wikipedia, etc.)
- Various other text-based formats (csv, json, xml, etc.)
+MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc.).
+It supports:
+- PDF
+- PowerPoint
+- Word
+- Excel
+- Images (EXIF metadata and OCR)
+- Audio (EXIF metadata and speech transcription)
+- HTML
+- Text-based formats (CSV, JSON, XML)
+- ZIP files (iterates over contents)

-The API is simple:
+To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
+
+## Usage
+
+### Command-Line
+
+```bash
+markitdown path-to-file.pdf > document.md
+```
+
+Or use `-o` to specify the output file:
+
+```bash
+markitdown path-to-file.pdf -o document.md
+```
+
+You can also pipe content:
+
+```bash
+cat path-to-file.pdf | markitdown
+```
+
+### Python API
+
+Basic usage in Python:

 ```python
 from markitdown import MarkItDown

-markitdown = MarkItDown()
-result = markitdown.convert("test.xlsx")
+md = MarkItDown()
+result = md.convert("test.xlsx")
 print(result.text_content)
 ```

+To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
+
+```python
+from markitdown import MarkItDown
+from openai import OpenAI
+
+client = OpenAI()
+md = MarkItDown(llm_client=client, llm_model="gpt-4o")
+result = md.convert("example.jpg")
+print(result.text_content)
+```
+
+### Docker
+
+```sh
+docker build -t markitdown:latest .
+docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
+```
+<details>
+    
+<summary>Batch Processing Multiple Files</summary>
+
+This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
+
+
+```python convert.py
+from markitdown import MarkItDown
+from openai import OpenAI
+import os
+client = OpenAI(api_key="your-api-key-here")
+md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
+supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
+files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
+for file in files_to_convert:
+    print(f"\nConverting {file}...")
+    try:
+        md_file = os.path.splitext(file)[0] + '.md'
+        result = md.convert(file)
+        with open(md_file, 'w') as f:
+            f.write(result.text_content)
+        
+        print(f"Successfully converted {file} to {md_file}")
+    except Exception as e:
+        print(f"Error converting {file}: {str(e)}")
+
+print("\nAll conversions completed!")
+```
+2. Place the script in the same directory as your files
+3. Install the required packages (e.g., `openai`)
+4. Run the script: `python convert.py`
+
+Note that original files will remain unchanged and new markdown files are created with the same base name.
+
+</details>
+   
 ## Contributing

 This project welcomes contributions and suggestions.  Most contributions require you to agree to a
@@ -37,10 +120,41 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
 For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
 contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

+### How to Contribute
+
+You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are, of course, just suggestions and you are welcome to contribute in any way you like.
+
+
+<div align="center">
+
+|                       | All                                      | Especially Needs Help from Community                                                                 |
+|-----------------------|------------------------------------------|------------------------------------------------------------------------------------------|
+| **Issues**            | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
+| **PRs**               | [All PRs](https://github.com/microsoft/markitdown/pulls)     | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22)               |
+
+</div>
+
+### Running Tests and Checks
+
+- Install `hatch` in your environment and run tests:
+    ```sh
+    pip install hatch  # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
+    hatch shell
+    hatch test
+    ```
+
+  (Alternative) Use the Devcontainer which has all the dependencies installed:
+    ```sh
+    # Reopen the project in Devcontainer and run:
+    hatch test
+    ```
+
+- Run pre-commit checks before submitting a PR: `pre-commit run --all-files`
+
 ## Trademarks

-This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 
-trademarks or logos is subject to and must follow 
+This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
+trademarks or logos is subject to and must follow
 [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
 Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
 Any use of third-party trademarks or logos are subject to those third-party's policies.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
 [project]
 name = "markitdown"
 dynamic = ["version"]
-description = ''
+description = 'Utility tool for converting various files to Markdown'
 readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"
@@ -16,11 +16,10 @@ authors = [
 classifiers = [
  "Development Status :: 4 - Beta",
  "Programming Language :: Python",
-  "Programming Language :: Python :: 3.8",
-  "Programming Language :: Python :: 3.9",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
 ]
@@ -33,12 +32,16 @@ dependencies = [
  "python-pptx",
  "pandas",
  "openpyxl",
+  "xlrd",
  "pdfminer.six",
  "puremagic",
  "pydub",
+  "olefile",
  "youtube-transcript-api",
  "SpeechRecognition",
  "pathvalidate",
+  "charset-normalizer",
+  "openai",
 ]

 [project.urls]
@@ -77,3 +80,6 @@ exclude_lines = [
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
 ]
+
+[tool.hatch.build.targets.sdist]
+only-include = ["src/markitdown"]
--- a/src/markitdown/__about__.py
+++ b/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.1a1"
+__version__ = "0.0.1a3"
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -1,41 +1,81 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
+import argparse
 import sys
-from ._markitdown import MarkItDown
+from textwrap import dedent
+from .__about__ import __version__
+from ._markitdown import MarkItDown, DocumentConverterResult


 def main():
-    if len(sys.argv) == 1:
+    parser = argparse.ArgumentParser(
+        description="Convert various file formats to markdown.",
+        prog="markitdown",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        usage=dedent(
+            """
+            SYNTAX:
+
+                markitdown <OPTIONAL: FILENAME>
+                If FILENAME is empty, markitdown reads from stdin.
+
+            EXAMPLE:
+
+                markitdown example.pdf
+
+                OR
+
+                cat example.pdf | markitdown
+
+                OR
+
+                markitdown < example.pdf
+                
+                OR to save to a file use
+    
+                markitdown example.pdf -o example.md
+                
+                OR
+                
+                markitdown example.pdf > example.md
+            """
+        ).strip(),
+    )
+
+    parser.add_argument(
+        "-v",
+        "--version",
+        action="version",
+        version=f"%(prog)s {__version__}",
+        help="show the version number and exit",
+    )
+
+    parser.add_argument("filename", nargs="?")
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Output file name. If not provided, output is written to stdout.",
+    )
+    args = parser.parse_args()
+
+    if args.filename is None:
        markitdown = MarkItDown()
        result = markitdown.convert_stream(sys.stdin.buffer)
-        print(result.text_content)
-    elif len(sys.argv) == 2:
-        markitdown = MarkItDown()
-        result = markitdown.convert(sys.argv[1])
-        print(result.text_content)
+        _handle_output(args, result)
    else:
-        sys.stderr.write(
-            """
-SYNTAX: 
-    
-    markitdown <OPTIONAL: FILENAME>
-    If FILENAME is empty, markitdown reads from stdin.
+        markitdown = MarkItDown()
+        result = markitdown.convert(args.filename)
+        _handle_output(args, result)

-EXAMPLE:
-    
-    markitdown example.pdf
-    
-    OR

-    cat example.pdf | markitdown
-
-    OR 
-
-    markitdown < example.pdf
-""".strip()
-            + "\n"
-        )
+def _handle_output(args, result: DocumentConverterResult):
+    """Handle output to stdout or file"""
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            f.write(result.text_content)
+    else:
+        print(result.text_content)


 if __name__ == "__main__":
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -12,11 +12,16 @@ import subprocess
 import sys
 import tempfile
 import traceback
+import zipfile
+from xml.dom import minidom
 from typing import Any, Dict, List, Optional, Union
+from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+from warnings import warn, resetwarnings, catch_warnings

 import mammoth
 import markdownify
+import olefile
 import pandas as pd
 import pdfminer
 import pdfminer.high_level
@@ -26,15 +31,25 @@ import pptx
 import puremagic
 import requests
 from bs4 import BeautifulSoup
+from charset_normalizer import from_path

 # Optional Transcription support
+IS_AUDIO_TRANSCRIPTION_CAPABLE = False
 try:
-    import pydub
+    # Using warnings' catch_warnings to catch
+    # pydub's warning of ffmpeg or avconv missing
+    with catch_warnings(record=True) as w:
+        import pydub
+
+        if w:
+            raise ModuleNotFoundError
    import speech_recognition as sr

    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
 except ModuleNotFoundError:
    pass
+finally:
+    resetwarnings()

 # Optional YouTube transcription support
 try:
@@ -158,12 +173,13 @@ class PlainTextConverter(DocumentConverter):
        # Only accept text files
        if content_type is None:
            return None
-        elif "text/" not in content_type.lower():
+        elif all(
+            not content_type.lower().startswith(type_prefix)
+            for type_prefix in ["text/", "application/json"]
+        ):
            return None

-        text_content = ""
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            text_content = fh.read()
+        text_content = str(from_path(local_path).best())
        return DocumentConverterResult(
            title=None,
            text_content=text_content,
@@ -213,6 +229,143 @@ class HtmlConverter(DocumentConverter):
        )


+class RSSConverter(DocumentConverter):
+    """Convert RSS / Atom type to markdown"""
+
+    def convert(
+        self, local_path: str, **kwargs
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if not RSS type
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() not in [".xml", ".rss", ".atom"]:
+            return None
+        try:
+            doc = minidom.parse(local_path)
+        except BaseException as _:
+            return None
+        result = None
+        if doc.getElementsByTagName("rss"):
+            # A RSS feed must have a root element of <rss>
+            result = self._parse_rss_type(doc)
+        elif doc.getElementsByTagName("feed"):
+            root = doc.getElementsByTagName("feed")[0]
+            if root.getElementsByTagName("entry"):
+                # An Atom feed must have a root element of <feed> and at least one <entry>
+                result = self._parse_atom_type(doc)
+            else:
+                return None
+        else:
+            # not rss or atom
+            return None
+
+        return result
+
+    def _parse_atom_type(
+        self, doc: minidom.Document
+    ) -> Union[None, DocumentConverterResult]:
+        """Parse the type of an Atom feed.
+
+        Returns None if the feed type is not recognized or something goes wrong.
+        """
+        try:
+            root = doc.getElementsByTagName("feed")[0]
+            title = self._get_data_by_tag_name(root, "title")
+            subtitle = self._get_data_by_tag_name(root, "subtitle")
+            entries = root.getElementsByTagName("entry")
+            md_text = f"# {title}\n"
+            if subtitle:
+                md_text += f"{subtitle}\n"
+            for entry in entries:
+                entry_title = self._get_data_by_tag_name(entry, "title")
+                entry_summary = self._get_data_by_tag_name(entry, "summary")
+                entry_updated = self._get_data_by_tag_name(entry, "updated")
+                entry_content = self._get_data_by_tag_name(entry, "content")
+
+                if entry_title:
+                    md_text += f"\n## {entry_title}\n"
+                if entry_updated:
+                    md_text += f"Updated on: {entry_updated}\n"
+                if entry_summary:
+                    md_text += self._parse_content(entry_summary)
+                if entry_content:
+                    md_text += self._parse_content(entry_content)
+
+            return DocumentConverterResult(
+                title=title,
+                text_content=md_text,
+            )
+        except BaseException as _:
+            return None
+
+    def _parse_rss_type(
+        self, doc: minidom.Document
+    ) -> Union[None, DocumentConverterResult]:
+        """Parse the type of an RSS feed.
+
+        Returns None if the feed type is not recognized or something goes wrong.
+        """
+        try:
+            root = doc.getElementsByTagName("rss")[0]
+            channel = root.getElementsByTagName("channel")
+            if not channel:
+                return None
+            channel = channel[0]
+            channel_title = self._get_data_by_tag_name(channel, "title")
+            channel_description = self._get_data_by_tag_name(channel, "description")
+            items = channel.getElementsByTagName("item")
+            if channel_title:
+                md_text = f"# {channel_title}\n"
+            if channel_description:
+                md_text += f"{channel_description}\n"
+            if not items:
+                items = []
+            for item in items:
+                title = self._get_data_by_tag_name(item, "title")
+                description = self._get_data_by_tag_name(item, "description")
+                pubDate = self._get_data_by_tag_name(item, "pubDate")
+                content = self._get_data_by_tag_name(item, "content:encoded")
+
+                if title:
+                    md_text += f"\n## {title}\n"
+                if pubDate:
+                    md_text += f"Published on: {pubDate}\n"
+                if description:
+                    md_text += self._parse_content(description)
+                if content:
+                    md_text += self._parse_content(content)
+
+            return DocumentConverterResult(
+                title=channel_title,
+                text_content=md_text,
+            )
+        except BaseException as _:
+            print(traceback.format_exc())
+            return None
+
+    def _parse_content(self, content: str) -> str:
+        """Parse the content of an RSS feed item"""
+        try:
+            # using bs4 because many RSS feeds have HTML-styled content
+            soup = BeautifulSoup(content, "html.parser")
+            return _CustomMarkdownify().convert_soup(soup)
+        except BaseException as _:
+            return content
+
+    def _get_data_by_tag_name(
+        self, element: minidom.Element, tag_name: str
+    ) -> Union[str, None]:
+        """Get data from first child element with the given tag name.
+        Returns None when no such element is found.
+        """
+        nodes = element.getElementsByTagName(tag_name)
+        if not nodes:
+            return None
+        fc = nodes[0].firstChild
+        if fc:
+            return fc.data
+        return None
+
+
 class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

@@ -344,8 +497,11 @@ class YouTubeConverter(DocumentConverter):
                assert isinstance(params["v"][0], str)
                video_id = str(params["v"][0])
                try:
+                    youtube_transcript_languages = kwargs.get(
+                        "youtube_transcript_languages", ("en",)
+                    )
                    # Must be a single transcript.
-                    transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
+                    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
                    transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                    # Alternative formatting:
                    # formatter = TextFormatter()
@@ -391,6 +547,67 @@ class YouTubeConverter(DocumentConverter):
        return None


+class IpynbConverter(DocumentConverter):
+    """Converts Jupyter Notebook (.ipynb) files to Markdown."""
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if not ipynb
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".ipynb":
+            return None
+
+        # Parse and convert the notebook
+        result = None
+        with open(local_path, "rt", encoding="utf-8") as fh:
+            notebook_content = json.load(fh)
+            result = self._convert(notebook_content)
+
+        return result
+
+    def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
+        """Helper function that converts notebook JSON content to Markdown."""
+        try:
+            md_output = []
+            title = None
+
+            for cell in notebook_content.get("cells", []):
+                cell_type = cell.get("cell_type", "")
+                source_lines = cell.get("source", [])
+
+                if cell_type == "markdown":
+                    md_output.append("".join(source_lines))
+
+                    # Extract the first # heading as title if not already found
+                    if title is None:
+                        for line in source_lines:
+                            if line.startswith("# "):
+                                title = line.lstrip("# ").strip()
+                                break
+
+                elif cell_type == "code":
+                    # Code cells are wrapped in Markdown code blocks
+                    md_output.append(f"```python\n{''.join(source_lines)}\n```")
+                elif cell_type == "raw":
+                    md_output.append(f"```\n{''.join(source_lines)}\n```")
+
+            md_text = "\n\n".join(md_output)
+
+            # Check for title in notebook metadata
+            title = notebook_content.get("metadata", {}).get("title", title)
+
+            return DocumentConverterResult(
+                title=title,
+                text_content=md_text,
+            )
+
+        except Exception as e:
+            raise FileConversionException(
+                f"Error converting .ipynb file: {str(e)}"
+            ) from e
+
+
 class BingSerpConverter(DocumentConverter):
    """
    Handle Bing results pages (only the organic search results).
@@ -492,7 +709,9 @@ class DocxConverter(HtmlConverter):

        result = None
        with open(local_path, "rb") as docx_file:
-            result = mammoth.convert_to_html(docx_file)
+            style_map = kwargs.get("style_map", None)
+
+            result = mammoth.convert_to_html(docx_file, style_map=style_map)
            html_content = result.value
            result = self._convert(html_content)

@@ -510,7 +729,31 @@ class XlsxConverter(HtmlConverter):
        if extension.lower() != ".xlsx":
            return None

-        sheets = pd.read_excel(local_path, sheet_name=None)
+        sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
+        md_content = ""
+        for s in sheets:
+            md_content += f"## {s}\n"
+            html_content = sheets[s].to_html(index=False)
+            md_content += self._convert(html_content).text_content.strip() + "\n\n"
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=md_content.strip(),
+        )
+
+
+class XlsConverter(HtmlConverter):
+    """
+    Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
+    """
+
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        # Bail if not a XLS
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".xls":
+            return None
+
+        sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
        md_content = ""
        for s in sheets:
            md_content += f"## {s}\n"
@@ -582,6 +825,10 @@ class PptxConverter(HtmlConverter):
                        "\n" + self._convert(html_table).text_content.strip() + "\n"
                    )

+                # Charts
+                if shape.has_chart:
+                    md_content += self._convert_chart_to_markdown(shape.chart)
+
                # Text areas
                elif shape.has_text_frame:
                    if shape == title:
@@ -616,20 +863,54 @@ class PptxConverter(HtmlConverter):
            return True
        return False

+    def _convert_chart_to_markdown(self, chart):
+        md = "\n\n### Chart"
+        if chart.has_title:
+            md += f": {chart.chart_title.text_frame.text}"
+        md += "\n\n"
+        data = []
+        category_names = [c.label for c in chart.plots[0].categories]
+        series_names = [s.name for s in chart.series]
+        data.append(["Category"] + series_names)
+
+        for idx, category in enumerate(category_names):
+            row = [category]
+            for series in chart.series:
+                row.append(series.values[idx])
+            data.append(row)
+
+        markdown_table = []
+        for row in data:
+            markdown_table.append("| " + " | ".join(map(str, row)) + " |")
+        header = markdown_table[0]
+        separator = "|" + "|".join(["---"] * len(data[0])) + "|"
+        return md + "\n".join([header, separator] + markdown_table[1:])
+

 class MediaConverter(DocumentConverter):
    """
    Abstract class for multi-modal media (e.g., images and audio)
    """

-    def _get_metadata(self, local_path):
-        exiftool = shutil.which("exiftool")
-        if not exiftool:
+    def _get_metadata(self, local_path, exiftool_path=None):
+        if not exiftool_path:
+            which_exiftool = shutil.which("exiftool")
+            if which_exiftool:
+                warn(
+                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g., 
+
+    md = MarkItDown(exiftool_path="{which_exiftool}")
+
+This warning will be removed in future releases.
+""",
+                    DeprecationWarning,
+                )
+
            return None
        else:
            try:
                result = subprocess.run(
-                    [exiftool, "-json", local_path], capture_output=True, text=True
+                    [exiftool_path, "-json", local_path], capture_output=True, text=True
                ).stdout
                return json.loads(result)[0]
            except Exception:
@@ -642,7 +923,7 @@ class WavConverter(MediaConverter):
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a XLSX
+        # Bail if not a WAV
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".wav":
            return None
@@ -650,7 +931,7 @@ class WavConverter(MediaConverter):
        md_content = ""

        # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "Title",
@@ -705,7 +986,7 @@ class Mp3Converter(WavConverter):
        md_content = ""

        # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "Title",
@@ -754,11 +1035,11 @@ class Mp3Converter(WavConverter):

 class ImageConverter(MediaConverter):
    """
-    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a XLSX
+        # Bail if not an image
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
            return None
@@ -766,7 +1047,7 @@ class ImageConverter(MediaConverter):
        md_content = ""

        # Add metadata
-        metadata = self._get_metadata(local_path)
+        metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
        if metadata:
            for f in [
                "ImageSize",
@@ -784,17 +1065,17 @@ class ImageConverter(MediaConverter):
                    md_content += f"{f}: {metadata[f]}\n"

        # Try describing the image with GPTV
-        mlm_client = kwargs.get("mlm_client")
-        mlm_model = kwargs.get("mlm_model")
-        if mlm_client is not None and mlm_model is not None:
+        llm_client = kwargs.get("llm_client")
+        llm_model = kwargs.get("llm_model")
+        if llm_client is not None and llm_model is not None:
            md_content += (
                "\n# Description:\n"
-                + self._get_mlm_description(
+                + self._get_llm_description(
                    local_path,
                    extension,
-                    mlm_client,
-                    mlm_model,
-                    prompt=kwargs.get("mlm_prompt"),
+                    llm_client,
+                    llm_model,
+                    prompt=kwargs.get("llm_prompt"),
                ).strip()
                + "\n"
            )
@@ -804,12 +1085,10 @@ class ImageConverter(MediaConverter):
            text_content=md_content,
        )

-    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
+    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

-        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
-
        data_uri = ""
        with open(local_path, "rb") as image_file:
            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
@@ -837,6 +1116,208 @@ class ImageConverter(MediaConverter):
        return response.choices[0].message.content


+class OutlookMsgConverter(DocumentConverter):
+    """Converts Outlook .msg files to markdown by extracting email metadata and content.
+
+    Uses the olefile package to parse the .msg file structure and extract:
+    - Email headers (From, To, Subject)
+    - Email body content
+    """
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Convert the .msg file at `local_path` to markdown.
+
+        Args:
+            local_path: Path of the file to convert.
+            **kwargs: Conversion options; only `file_extension` is read here.
+
+        Returns:
+            None when `file_extension` is not ".msg" (case-insensitive), otherwise a
+            DocumentConverterResult whose title is the Subject header (None if absent).
+
+        Raises:
+            FileConversionException: If opening or processing the file fails.
+        """
+        # Bail if not a MSG file
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".msg":
+            return None
+
+        try:
+            msg = olefile.OleFileIO(local_path)
+            try:
+                # Get headers. Stream names follow the `__substg1.0_<tag><type>`
+                # scheme; NOTE(review): `001F` is the Unicode string type per
+                # MS-OXMSG, which is why UTF-16 is tried first when decoding.
+                headers = {
+                    "From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
+                    "To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
+                    "Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
+                }
+
+                # Get email body
+                body = self._get_stream_data(msg, "__substg1.0_1000001F")
+            finally:
+                # Release the OLE file handle even when extraction fails
+                msg.close()
+
+            md_content = "# Email Message\n\n"
+
+            # Add headers to markdown (missing or empty headers are omitted)
+            for key, value in headers.items():
+                if value:
+                    md_content += f"**{key}:** {value}\n"
+
+            md_content += "\n## Content\n\n"
+
+            if body:
+                md_content += body
+
+            return DocumentConverterResult(
+                title=headers.get("Subject"), text_content=md_content.strip()
+            )
+
+        except Exception as e:
+            # Chain the original error so the root cause stays in the traceback
+            raise FileConversionException(
+                f"Could not convert MSG file '{local_path}': {str(e)}"
+            ) from e
+
+    def _get_stream_data(
+        self, msg: olefile.OleFileIO, stream_path: str
+    ) -> Union[str, None]:
+        """Helper to safely extract and decode stream data from the MSG file.
+
+        Returns the stripped, decoded text of `stream_path`, or None when the
+        stream does not exist or cannot be read. Never raises: extraction is
+        best-effort so one unreadable field does not fail the whole message.
+        """
+        try:
+            if msg.exists(stream_path):
+                data = msg.openstream(stream_path).read()
+                # Try UTF-16 first (common for .msg files)
+                try:
+                    return data.decode("utf-16-le").strip()
+                except UnicodeDecodeError:
+                    # Fall back to UTF-8
+                    try:
+                        return data.decode("utf-8").strip()
+                    except UnicodeDecodeError:
+                        # Last resort - ignore errors
+                        return data.decode("utf-8", errors="ignore").strip()
+        except Exception:
+            # Deliberate best-effort: treat an unreadable stream as missing
+            pass
+        return None
+
+
+class ZipConverter(DocumentConverter):
+    """Converts ZIP files to markdown by extracting and converting all contained files.
+
+    The converter extracts the ZIP contents into an `extracted_<archive name>` directory
+    created next to the archive, processes each file using the converters passed in
+    `_parent_converters`, and then combines the results into a single markdown document.
+    The extraction directory is removed afterwards unless `cleanup_extracted=False`.
+
+    Example output format:
+    ```markdown
+    Content from the zip file `example.zip`:
+
+    ## File: docs/readme.txt
+
+    This is the content of readme.txt
+    Multiple lines are preserved
+
+    ## File: images/example.jpg
+
+    ImageSize: 1920x1080
+    DateTimeOriginal: 2024-02-15 14:30:00
+    Description: A beautiful landscape photo
+
+    ## File: data/report.xlsx
+
+    ## Sheet1
+    | Column1 | Column2 | Column3 |
+    |---------|---------|---------|
+    | data1   | data2   | data3   |
+    | data4   | data5   | data6   |
+    ```
+
+    Key features:
+    - Maintains original file structure in headings
+    - Walks nested directories recursively (the ZipConverter itself is skipped for
+      extracted files, so nested archives are not expanded by it)
+    - Uses the first converter that accepts each file
+    - Preserves formatting of converted content
+    - Cleans up extracted files after processing, including on failure
+    """
+
+    @staticmethod
+    def _is_within_directory(directory: str, target: str) -> bool:
+        """Return True if `target` resolves to `directory` or a path below it."""
+        directory = os.path.abspath(directory)
+        target = os.path.abspath(target)
+        try:
+            # commonpath compares whole path components; commonprefix compares
+            # characters and would accept a sibling such as "<directory>_evil".
+            return os.path.commonpath([directory, target]) == directory
+        except ValueError:
+            # Raised e.g. for paths on different drives: not inside `directory`
+            return False
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Convert the ZIP archive at `local_path` into one markdown document.
+
+        Args:
+            local_path: Path of the archive to convert.
+            **kwargs: Conversion options. Read here: `file_extension`,
+                `_parent_converters` (converters tried on each extracted file) and
+                `cleanup_extracted` (default True). A copy with `file_extension`
+                replaced is forwarded to the converter of every extracted file.
+
+        Returns:
+            None when `file_extension` is not ".zip" (case-insensitive). Otherwise a
+            DocumentConverterResult; errors derived from `Exception` are reported
+            as an "[ERROR] ..." text instead of being raised.
+        """
+        # Bail if not a ZIP
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".zip":
+            return None
+
+        # Get parent converters list if available
+        parent_converters = kwargs.get("_parent_converters", [])
+        if not parent_converters:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
+            )
+
+        extracted_zip_folder_name = (
+            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
+        )
+        extraction_dir = os.path.normpath(
+            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
+        )
+        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
+
+        try:
+            # Extract the zip file safely
+            with zipfile.ZipFile(local_path, "r") as zipObj:
+                # Safeguard against path traversal: validate every member first
+                for member in zipObj.namelist():
+                    member_path = os.path.join(extraction_dir, member)
+                    if not self._is_within_directory(extraction_dir, member_path):
+                        raise ValueError(
+                            f"Path traversal detected in zip file: {member}"
+                        )
+
+                # Extract all files safely
+                zipObj.extractall(path=extraction_dir)
+
+            # Process each extracted file
+            for root, _dirs, files in os.walk(extraction_dir):
+                for name in files:
+                    file_path = os.path.join(root, name)
+                    relative_path = os.path.relpath(file_path, extraction_dir)
+
+                    # Get file extension
+                    _, file_extension = os.path.splitext(name)
+
+                    # Update kwargs for the file
+                    file_kwargs = kwargs.copy()
+                    file_kwargs["file_extension"] = file_extension
+                    file_kwargs["_parent_converters"] = parent_converters
+
+                    # Try converting the file using available converters
+                    for converter in parent_converters:
+                        # Skip the zip converter to avoid infinite recursion
+                        if isinstance(converter, ZipConverter):
+                            continue
+
+                        result = converter.convert(file_path, **file_kwargs)
+                        if result is not None:
+                            md_content += f"\n## File: {relative_path}\n\n"
+                            md_content += result.text_content + "\n\n"
+                            break
+
+            return DocumentConverterResult(title=None, text_content=md_content.strip())
+
+        except zipfile.BadZipFile:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
+            )
+        except ValueError as ve:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
+            )
+        except Exception as e:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
+            )
+        finally:
+            # Clean up extracted files if specified. Done in `finally` so a failed
+            # conversion does not leave extracted content behind on disk.
+            if kwargs.get("cleanup_extracted", True) and os.path.isdir(
+                extraction_dir
+            ):
+                shutil.rmtree(extraction_dir, ignore_errors=True)
+
+
 class FileConversionException(BaseException):
    pass

@@ -852,16 +1333,55 @@ class MarkItDown:
    def __init__(
        self,
        requests_session: Optional[requests.Session] = None,
+        llm_client: Optional[Any] = None,
+        llm_model: Optional[str] = None,
+        style_map: Optional[str] = None,
+        exiftool_path: Optional[str] = None,
+        # Deprecated
        mlm_client: Optional[Any] = None,
-        mlm_model: Optional[Any] = None,
+        mlm_model: Optional[str] = None,
    ):
        if requests_session is None:
            self._requests_session = requests.Session()
        else:
            self._requests_session = requests_session

-        self._mlm_client = mlm_client
-        self._mlm_model = mlm_model
+        if exiftool_path is None:
+            exiftool_path = os.environ.get("EXIFTOOL_PATH")
+
+        # Handle deprecation notices
+        #############################
+        if mlm_client is not None:
+            if llm_client is None:
+                warn(
+                    "'mlm_client' is deprecated, and was renamed 'llm_client'.",
+                    DeprecationWarning,
+                )
+                llm_client = mlm_client
+                mlm_client = None
+            else:
+                raise ValueError(
+                    "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
+                )
+
+        if mlm_model is not None:
+            if llm_model is None:
+                warn(
+                    "'mlm_model' is deprecated, and was renamed 'llm_model'.",
+                    DeprecationWarning,
+                )
+                llm_model = mlm_model
+                mlm_model = None
+            else:
+                raise ValueError(
+                    "'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
+                )
+        #############################
+
+        self._llm_client = llm_client
+        self._llm_model = llm_model
+        self._style_map = style_map
+        self._exiftool_path = exiftool_path

        self._page_converters: List[DocumentConverter] = []

@@ -870,23 +1390,28 @@ class MarkItDown:
        # To this end, the most specific converters should appear below the most generic converters
        self.register_page_converter(PlainTextConverter())
        self.register_page_converter(HtmlConverter())
+        self.register_page_converter(RSSConverter())
        self.register_page_converter(WikipediaConverter())
        self.register_page_converter(YouTubeConverter())
        self.register_page_converter(BingSerpConverter())
        self.register_page_converter(DocxConverter())
        self.register_page_converter(XlsxConverter())
+        self.register_page_converter(XlsConverter())
        self.register_page_converter(PptxConverter())
        self.register_page_converter(WavConverter())
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
+        self.register_page_converter(IpynbConverter())
        self.register_page_converter(PdfConverter())
+        self.register_page_converter(ZipConverter())
+        self.register_page_converter(OutlookMsgConverter())

    def convert(
-        self, source: Union[str, requests.Response], **kwargs: Any
+        self, source: Union[str, requests.Response, Path], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
-            - source: can be a string representing a path or url, or a requests.response object
+            - source: can be a string holding a local path or a URL, a pathlib.Path object, or a requests.Response object
            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
        """

@@ -903,10 +1428,14 @@ class MarkItDown:
        # Request response
        elif isinstance(source, requests.Response):
            return self.convert_response(source, **kwargs)
+        elif isinstance(source, Path):
+            return self.convert_local(source, **kwargs)

    def convert_local(
-        self, path: str, **kwargs: Any
+        self, path: Union[str, Path], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
+        if isinstance(path, Path):
+            path = str(path)
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []
@@ -1003,7 +1532,7 @@ class MarkItDown:
                self._append_ext(extensions, g)

            # Convert
-            result = self._convert(temp_path, extensions, url=response.url)
+            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
        # Clean up
        finally:
            try:
@@ -1030,11 +1559,20 @@ class MarkItDown:
                    _kwargs.update({"file_extension": ext})

                # Copy any additional global options
-                if "mlm_client" not in _kwargs and self._mlm_client is not None:
-                    _kwargs["mlm_client"] = self._mlm_client
+                if "llm_client" not in _kwargs and self._llm_client is not None:
+                    _kwargs["llm_client"] = self._llm_client

-                if "mlm_model" not in _kwargs and self._mlm_model is not None:
-                    _kwargs["mlm_model"] = self._mlm_model
+                if "llm_model" not in _kwargs and self._llm_model is not None:
+                    _kwargs["llm_model"] = self._llm_model
+
+                if "style_map" not in _kwargs and self._style_map is not None:
+                    _kwargs["style_map"] = self._style_map
+
+                if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
+                    _kwargs["exiftool_path"] = self._exiftool_path
+
+                # Add the list of converters for nested processing
+                _kwargs["_parent_converters"] = self._page_converters

                # If we hit an error log it and keep trying
                try:
@@ -1071,14 +1609,32 @@ class MarkItDown:
        if ext == "":
            return
        # if ext not in extensions:
-        if True:
-            extensions.append(ext)
+        extensions.append(ext)

    def _guess_ext_magic(self, path):
        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
        # Use puremagic to guess
        try:
            guesses = puremagic.magic_file(path)
+
+            # Fix for: https://github.com/microsoft/markitdown/issues/222
+            # If there are no guesses, then try again after trimming leading ASCII whitespaces.
+            # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
+            # (space, tab, newline, carriage return, vertical tab, form feed).
+            if len(guesses) == 0:
+                with open(path, "rb") as file:
+                    while True:
+                        char = file.read(1)
+                        if not char:  # End of file
+                            break
+                        if not char.isspace():
+                            file.seek(file.tell() - 1)
+                            break
+                    try:
+                        guesses = puremagic.magic_stream(file)
+                    except puremagic.main.PureError:
+                        pass
+
            extensions = list()
            for g in guesses:
                ext = g.extension.strip()
--- a/src/markitdown/py.typed
+++ b/src/markitdown/py.typed
--- a/tests/test_files/test.docx
+++ b/tests/test_files/test.docx
--- a/tests/test_files/test.jpg
+++ b/tests/test_files/test.jpg
--- a/tests/test_files/test.json
+++ b/tests/test_files/test.json
@@ -0,0 +1,10 @@
+{
+    "key1": "string_value",
+    "key2": 1234,
+    "key3": [
+        "list_value1",
+        "list_value2"
+    ],
+    "5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key",
+    "uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3"
+}
--- a/tests/test_files/test.pptx
+++ b/tests/test_files/test.pptx
--- a/tests/test_files/test.xls
+++ b/tests/test_files/test.xls
--- a/tests/test_files/test.xlsx
+++ b/tests/test_files/test.xlsx
--- a/tests/test_files/test_files.zip
+++ b/tests/test_files/test_files.zip
--- a/tests/test_files/test_llm.jpg
+++ b/tests/test_files/test_llm.jpg
--- a/tests/test_files/test_mskanji.csv
+++ b/tests/test_files/test_mskanji.csv
@@ -0,0 +1,4 @@
+<EFBFBD><EFBFBD><EFBFBD>O,<EFBFBD>N<EFBFBD><EFBFBD>,<EFBFBD>Z<EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Y,30,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+<EFBFBD>O<EFBFBD>؉p<EFBFBD>q,25,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>~,35,<EFBFBD><EFBFBD><EFBFBD>É<EFBFBD>
--- a/tests/test_files/test_notebook.ipynb
+++ b/tests/test_files/test_notebook.ipynb
@@ -0,0 +1,89 @@
+{
+    "cells": [
+        {
+            "cell_type": "markdown",
+            "id": "0f61db80",
+            "metadata": {},
+            "source": [
+                "# Test Notebook"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 11,
+            "id": "3f2a5bbd",
+            "metadata": {},
+            "outputs": [
+                {
+                    "name": "stdout",
+                    "output_type": "stream",
+                    "text": [
+                        "markitdown\n"
+                    ]
+                }
+            ],
+            "source": [
+                "print('markitdown')"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "id": "9b9c0468",
+            "metadata": {},
+            "source": [
+                "## Code Cell Below"
+            ]
+        },
+        {
+            "cell_type": "code",
+            "execution_count": 10,
+            "id": "37d8088a",
+            "metadata": {},
+            "outputs": [
+                {
+                    "name": "stdout",
+                    "output_type": "stream",
+                    "text": [
+                        "42\n"
+                    ]
+                }
+            ],
+            "source": [
+                "# comment in code\n",
+                "print(42)"
+            ]
+        },
+        {
+            "cell_type": "markdown",
+            "id": "2e3177bd",
+            "metadata": {},
+            "source": [
+                "End\n",
+                "\n",
+                "---"
+            ]
+        }
+    ],
+    "metadata": {
+        "kernelspec": {
+            "display_name": "Python 3",
+            "language": "python",
+            "name": "python3"
+        },
+        "language_info": {
+            "codemirror_mode": {
+                "name": "ipython",
+                "version": 3
+            },
+            "file_extension": ".py",
+            "mimetype": "text/x-python",
+            "name": "python",
+            "nbconvert_exporter": "python",
+            "pygments_lexer": "ipython3",
+            "version": "3.12.8"
+        },
+        "title": "Test Notebook Title"
+    },
+    "nbformat": 4,
+    "nbformat_minor": 5
+}
--- a/tests/test_files/test_outlook_msg.msg
+++ b/tests/test_files/test_outlook_msg.msg
--- a/tests/test_files/test_rss.xml
+++ b/tests/test_files/test_rss.xml
--- a/tests/test_files/test_with_comment.docx
+++ b/tests/test_files/test_with_comment.docx
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -6,11 +6,23 @@ import shutil
 import pytest
 import requests

+from warnings import catch_warnings, resetwarnings
+
 from markitdown import MarkItDown

 skip_remote = (
    True if os.environ.get("GITHUB_ACTIONS") else False
 )  # Don't run these tests in CI
+
+
+# Don't run the llm tests without a key and the client library
+skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
+try:
+    import openai
+except ModuleNotFoundError:
+    skip_llm = True
+
+# Skip exiftool tests if not installed
 skip_exiftool = shutil.which("exiftool") is None

 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -42,6 +54,12 @@ XLSX_TEST_STRINGS = [
    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
 ]

+XLS_TEST_STRINGS = [
+    "## 09060124-b5e7-4717-9d07-3c046eb",
+    "6ff4173b-42a5-4784-9b19-f49caff4d93d",
+    "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
+]
+
 DOCX_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
@@ -51,12 +69,34 @@ DOCX_TEST_STRINGS = [
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 ]

+MSG_TEST_STRINGS = [
+    "# Email Message",
+    "**From:** test.sender@example.com",
+    "**To:** test.recipient@example.com",
+    "**Subject:** Test Email Message",
+    "## Content",
+    "This is the body of the test email message",
+]
+
+DOCX_COMMENT_TEST_STRINGS = [
+    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+    "49e168b7-d2ae-407f-a055-2167576f39a1",
+    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+    "# Abstract",
+    "# Introduction",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "This is a test comment. 12df-321a",
+    "Yet another comment in the doc. 55yiyi-asd09",
+]
+
 PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
+    "2003",  # chart value
 ]

 BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
@@ -65,6 +105,13 @@ BLOG_TEST_STRINGS = [
    "an example where high cost can easily prevent a generic complex",
 ]

+
+RSS_TEST_STRINGS = [
+    "The Official Microsoft Blog",
+    "In the case of AI, it is absolutely true that the industry is moving incredibly fast",
+]
+
+
 WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
 WIKIPEDIA_TEST_STRINGS = [
    "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
@@ -87,6 +134,33 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
 ]

+CSV_CP932_TEST_STRINGS = [
+    "名前,年齢,住所",
+    "佐藤太郎,30,東京",
+    "三木英子,25,大阪",
+    "髙橋淳,35,名古屋",
+]
+
+LLM_TEST_STRINGS = [
+    "5bda1dd6",
+]
+
+JSON_TEST_STRINGS = [
+    "5b64c88c-b3c3-4510-bcb8-da0b200602d8",
+    "9700dc99-6685-40b4-9a3a-5e406dcb37f3",
+]
+
+
+# --- Helper Functions ---
+def validate_strings(result, expected_strings, exclude_strings=None):
+    """Assert that `result.text_content` contains every expected string and none of the excluded ones.
+
+    Backslashes are stripped from the text before comparing, so escaped
+    markdown (e.g. `\\_`) matches the plain expected strings.
+    """
+    haystack = result.text_content.replace("\\", "")
+    for needle in expected_strings:
+        assert needle in haystack
+    # A missing or empty exclusion list means there is nothing to rule out
+    for needle in exclude_strings or ():
+        assert needle not in haystack
+

@pytest.mark.skipif(
    skip_remote,
@@ -120,67 +194,175 @@ def test_markitdown_local() -> None:

    # Test XLSX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
-    for test_string in XLSX_TEST_STRINGS:
+    validate_strings(result, XLSX_TEST_STRINGS)
+
+    # Test XLS processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls"))
+    for test_string in XLS_TEST_STRINGS:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

    # Test DOCX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
-    for test_string in DOCX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, DOCX_TEST_STRINGS)
+
+    # Test DOCX processing, with comments
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
+        style_map="comment-reference => ",
+    )
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
+
+    # Test DOCX processing, with comments and setting style_map on init
+    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
+    result = markitdown_with_style_map.convert(
+        os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
+    )
+    validate_strings(result, DOCX_COMMENT_TEST_STRINGS)

    # Test PPTX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
-    for test_string in PPTX_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, PPTX_TEST_STRINGS)

    # Test HTML processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
    )
-    for test_string in BLOG_TEST_STRINGS:
-        text_content = result.text_content.replace("\\", "")
-        assert test_string in text_content
+    validate_strings(result, BLOG_TEST_STRINGS)
+
+    # Test ZIP file processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
+    validate_strings(result, XLSX_TEST_STRINGS)

    # Test Wikipedia processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
    )
    text_content = result.text_content.replace("\\", "")
-    for test_string in WIKIPEDIA_TEST_EXCLUDES:
-        assert test_string not in text_content
-    for test_string in WIKIPEDIA_TEST_STRINGS:
-        assert test_string in text_content
+    validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)

    # Test Bing processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
    )
    text_content = result.text_content.replace("\\", "")
-    for test_string in SERP_TEST_EXCLUDES:
-        assert test_string not in text_content
-    for test_string in SERP_TEST_STRINGS:
+    validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
+
+    # Test RSS processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
+    text_content = result.text_content.replace("\\", "")
+    for test_string in RSS_TEST_STRINGS:
        assert test_string in text_content

+    ## Test non-UTF-8 encoding
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
+    validate_strings(result, CSV_CP932_TEST_STRINGS)
+
+    # Test MSG (Outlook email) processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
+    validate_strings(result, MSG_TEST_STRINGS)
+
+    # Test JSON processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
+    validate_strings(result, JSON_TEST_STRINGS)
+
+    # Test input with leading blank characters
+    input_data = b"   \n\n\n<html><body><h1>Test</h1></body></html>"
+    result = markitdown.convert_stream(io.BytesIO(input_data))
+    assert "# Test" in result.text_content
+

@pytest.mark.skipif(
    skip_exiftool,
    reason="do not run if exiftool is not installed",
 )
 def test_markitdown_exiftool() -> None:
-    markitdown = MarkItDown()
+    # Test that automatic discovery of exiftool emits a DeprecationWarning
+    # and that exiftool stays disabled (the image converts to empty text)
+    try:
+        with catch_warnings(record=True) as w:
+            markitdown = MarkItDown()
+            result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert result.text_content.strip() == ""
+    finally:
+        resetwarnings()

-    # Test JPG metadata processing
+    # Test explicitly setting the location of exiftool
+    which_exiftool = shutil.which("exiftool")
+    markitdown = MarkItDown(exiftool_path=which_exiftool)
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
+    for key in JPG_TEST_EXIFTOOL:
+        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
+        assert target in result.text_content
+
+    # Test setting the exiftool path through an environment variable
+    os.environ["EXIFTOOL_PATH"] = which_exiftool
+    markitdown = MarkItDown()
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
    for key in JPG_TEST_EXIFTOOL:
        target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
        assert target in result.text_content


+def test_markitdown_deprecation() -> None:
+    try:
+        with catch_warnings(record=True) as w:
+            test_client = object()
+            markitdown = MarkItDown(mlm_client=test_client)
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert markitdown._llm_client == test_client
+    finally:
+        resetwarnings()
+
+    try:
+        with catch_warnings(record=True) as w:
+            markitdown = MarkItDown(mlm_model="gpt-4o")
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert markitdown._llm_model == "gpt-4o"
+    finally:
+        resetwarnings()
+
+    try:
+        test_client = object()
+        markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
+        assert False
+    except ValueError:
+        pass
+
+    try:
+        markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
+        assert False
+    except ValueError:
+        pass
+
+
+@pytest.mark.skipif(
+    skip_llm,
+    reason="do not run llm tests without a key",
+)
+def test_markitdown_llm() -> None:
+    client = openai.OpenAI()
+    markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
+
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
+
+    for test_string in LLM_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # This is not super precise. It would also accept "red square", "blue circle",
+    # "the square is not blue", etc. But it's sufficient for this test.
+    for test_string in ["red", "circle", "blue", "square"]:
+        assert test_string in result.text_content.lower()
+
+
 if __name__ == "__main__":
    """Runs this file's tests from the command line."""
-    test_markitdown_remote()
-    test_markitdown_local()
+    # test_markitdown_remote()
+    # test_markitdown_local()
    test_markitdown_exiftool()
+    # test_markitdown_deprecation()
+    # test_markitdown_llm()