Merge pull request #102 from microsoft/bump_version

Bump version.
2024-12-17 13:55:12 -08:00 · 2024-12-17 13:51:13 -08:00 · 2024-12-17 13:49:44 -08:00 · 2024-12-17 13:46:26 -08:00 · 2024-12-17 13:36:36 -08:00 · 2024-12-17 13:22:48 -08:00
17 changed files with 457 additions and 207 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1 @@
+*
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+tests/test_files/** linguist-vendored
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.13-alpine
+
+# Package installation below requires root; privileges are dropped further down.
+USER root
+
+# Runtime dependency
+RUN apk add --no-cache ffmpeg
+# --no-cache-dir keeps pip's download cache out of the image layer
+RUN pip install --no-cache-dir markitdown
+
+# Default USERID and GROUPID
+ARG USERID=10000
+ARG GROUPID=10000
+
+USER $USERID:$GROUPID
+ENTRYPOINT [ "markitdown" ]
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # MarkItDown

+[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
+
 The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)

 It presently supports:
@@ -12,7 +14,23 @@ It presently supports:
 - Audio (EXIF metadata, and speech transcription)
 - HTML (special handling of Wikipedia, etc.)
 - Various other text-based formats (csv, json, xml, etc.)
+- ZIP (Iterates over contents and converts each file)

+# Installation
+
+You can install `markitdown` using pip:
+
+```sh
+pip install markitdown
+```
+
+or install it from source:
+
+```sh
+pip install -e .
+```
+
+# Usage
 The API is simple:

 ```python
@@ -23,6 +41,44 @@ result = markitdown.convert("test.xlsx")
 print(result.text_content)
 ```

+To use this as a command-line utility, install it and then run it like this:
+
+```bash
+markitdown path-to-file.pdf
+```
+
+This will output Markdown to standard output. You can save it like this:
+
+```bash
+markitdown path-to-file.pdf > document.md
+```
+
+You can pipe content to standard input by omitting the argument:
+
+```bash
+cat path-to-file.pdf | markitdown
+```
+
+You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the `llm_client` and `llm_model` parameters to the `MarkItDown` object, according to your specific client.
+
+
+```python
+from markitdown import MarkItDown
+from openai import OpenAI
+
+client = OpenAI()
+md = MarkItDown(llm_client=client, llm_model="gpt-4o")
+result = md.convert("example.jpg")
+print(result.text_content)
+```
+
+You can also use the project as a Docker image:
+
+```sh
+docker build -t markitdown:latest .
+docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
+```
+
 ## Contributing

 This project welcomes contributions and suggestions.  Most contributions require you to agree to a
@@ -37,6 +93,24 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
 For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
 contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

+### Running Tests
+
+To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install).
+
+```sh
+pip install hatch
+hatch shell
+hatch test
+```
+
+### Running Pre-commit Checks
+
+Please run the pre-commit checks before submitting a PR.
+
+```sh
+pre-commit run --all-files
+```
+
 ## Trademarks

 This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,8 @@ dependencies = [
  "youtube-transcript-api",
  "SpeechRecognition",
  "pathvalidate",
-  "pygithub"
+  "charset-normalizer",
+  "openai",
 ]

 [project.urls]
@@ -77,3 +78,6 @@ exclude_lines = [
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
 ]
+
+[tool.hatch.build.targets.sdist]
+only-include = ["src/markitdown"]
--- a/src/markitdown/about.py
+++ b/src/markitdown/about.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.1a1"
+__version__ = "0.0.1a3"
--- a/src/markitdown/main.py
+++ b/src/markitdown/main.py
@@ -2,21 +2,15 @@
 #
 # SPDX-License-Identifier: MIT
 import sys
+import argparse
 from ._markitdown import MarkItDown


 def main():
-    if len(sys.argv) == 1:
-        markitdown = MarkItDown()
-        result = markitdown.convert_stream(sys.stdin.buffer)
-        print(result.text_content)
-    elif len(sys.argv) == 2:
-        markitdown = MarkItDown()
-        result = markitdown.convert(sys.argv[1])
-        print(result.text_content)
-    else:
-        sys.stderr.write(
-            """
+    parser = argparse.ArgumentParser(
+        description="Convert various file formats to markdown.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        usage="""
 SYNTAX: 
    
    markitdown <OPTIONAL: FILENAME>
@@ -33,9 +27,20 @@ EXAMPLE:
    OR 

    markitdown < example.pdf
-""".strip()
-            + "\n"
-        )
+""".strip(),
+    )
+
+    parser.add_argument("filename", nargs="?")
+    args = parser.parse_args()
+
+    if args.filename is None:
+        markitdown = MarkItDown()
+        result = markitdown.convert_stream(sys.stdin.buffer)
+        print(result.text_content)
+    else:
+        markitdown = MarkItDown()
+        result = markitdown.convert(args.filename)
+        print(result.text_content)


 if __name__ == "__main__":
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -12,8 +12,10 @@ import subprocess
 import sys
 import tempfile
 import traceback
+import zipfile
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+from warnings import warn, resetwarnings, catch_warnings

 import mammoth
 import markdownify
@@ -26,15 +28,24 @@ import pptx
 import puremagic
 import requests
 from bs4 import BeautifulSoup
+from charset_normalizer import from_path

 # Optional Transcription support
 try:
-    import pydub
+    # Using warnings' catch_warnings to catch
+    # pydub's warning of ffmpeg or avconv missing
+    with catch_warnings(record=True) as w:
+        import pydub
+
+        if w:
+            raise ModuleNotFoundError
    import speech_recognition as sr

    IS_AUDIO_TRANSCRIPTION_CAPABLE = True
 except ModuleNotFoundError:
    pass
+finally:
+    resetwarnings()

 # Optional YouTube transcription support
 try:
@@ -44,14 +55,6 @@ try:
 except ModuleNotFoundError:
    pass

-# Optional GitHub issue support
-try:
-    from github import Github
-
-    IS_GITHUB_ISSUE_CAPABLE = True
-except ModuleNotFoundError:
-    IS_GITHUB_ISSUE_CAPABLE = False
-

 class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
@@ -169,9 +172,7 @@ class PlainTextConverter(DocumentConverter):
        elif "text/" not in content_type.lower():
            return None

-        text_content = ""
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            text_content = fh.read()
+        text_content = str(from_path(local_path).best())
        return DocumentConverterResult(
            title=None,
            text_content=text_content,
@@ -352,8 +353,11 @@ class YouTubeConverter(DocumentConverter):
                assert isinstance(params["v"][0], str)
                video_id = str(params["v"][0])
                try:
+                    youtube_transcript_languages = kwargs.get(
+                        "youtube_transcript_languages", ("en",)
+                    )
                    # Must be a single transcript.
-                    transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
+                    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
                    transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                    # Alternative formatting:
                    # formatter = TextFormatter()
@@ -500,7 +504,9 @@ class DocxConverter(HtmlConverter):

        result = None
        with open(local_path, "rb") as docx_file:
-            result = mammoth.convert_to_html(docx_file)
+            style_map = kwargs.get("style_map", None)
+
+            result = mammoth.convert_to_html(docx_file, style_map=style_map)
            html_content = result.value
            result = self._convert(html_content)

@@ -590,6 +596,10 @@ class PptxConverter(HtmlConverter):
                        "\n" + self._convert(html_table).text_content.strip() + "\n"
                    )

+                # Charts
+                if shape.has_chart:
+                    md_content += self._convert_chart_to_markdown(shape.chart)
+
                # Text areas
                elif shape.has_text_frame:
                    if shape == title:
@@ -624,6 +634,29 @@ class PptxConverter(HtmlConverter):
            return True
        return False

+    def _convert_chart_to_markdown(self, chart):
+        """Render a chart as a "### Chart" heading followed by a markdown table.
+
+        The table has one row per category of the chart's first plot and one
+        column per series; the chart title, when present, is appended to the heading.
+        """
+        heading = "\n\n### Chart"
+        if chart.has_title:
+            heading += f": {chart.chart_title.text_frame.text}"
+        heading += "\n\n"
+
+        categories = [c.label for c in chart.plots[0].categories]
+        # Header row first, then one row per category holding each series' value.
+        table = [["Category"] + [s.name for s in chart.series]]
+        table.extend(
+            [label] + [s.values[idx] for s in chart.series]
+            for idx, label in enumerate(categories)
+        )
+
+        rendered = ["| " + " | ".join(map(str, row)) + " |" for row in table]
+        divider = "|" + "|".join(["---"] * len(table[0])) + "|"
+        return heading + "\n".join([rendered[0], divider] + rendered[1:])
+

 class MediaConverter(DocumentConverter):
    """
@@ -762,7 +795,7 @@ class Mp3Converter(WavConverter):

 class ImageConverter(MediaConverter):
    """
-    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
@@ -792,17 +825,17 @@ class ImageConverter(MediaConverter):
                    md_content += f"{f}: {metadata[f]}\n"

        # Try describing the image with GPTV
-        mlm_client = kwargs.get("mlm_client")
-        mlm_model = kwargs.get("mlm_model")
-        if mlm_client is not None and mlm_model is not None:
+        llm_client = kwargs.get("llm_client")
+        llm_model = kwargs.get("llm_model")
+        if llm_client is not None and llm_model is not None:
            md_content += (
                "\n# Description:\n"
-                + self._get_mlm_description(
+                + self._get_llm_description(
                    local_path,
                    extension,
-                    mlm_client,
-                    mlm_model,
-                    prompt=kwargs.get("mlm_prompt"),
+                    llm_client,
+                    llm_model,
+                    prompt=kwargs.get("llm_prompt"),
                ).strip()
                + "\n"
            )
@@ -812,12 +845,10 @@ class ImageConverter(MediaConverter):
            text_content=md_content,
        )

-    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
+    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

-        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
-
        data_uri = ""
        with open(local_path, "rb") as image_file:
            content_type, encoding = mimetypes.guess_type("_dummy" + extension)
@@ -845,126 +876,122 @@ class ImageConverter(MediaConverter):
        return response.choices[0].message.content


-class GitHubIssueConverter(DocumentConverter):
-    """Converts GitHub issues and pull requests to Markdown."""
+class ZipConverter(DocumentConverter):
+    """Converts ZIP files to markdown by extracting and converting all contained files.

-    def convert(self, github_url, github_token) -> Union[None, DocumentConverterResult]:
-        # Bail if not a valid GitHub issue or pull request URL
-        if github_url:
-            parsed_url = urlparse(github_url)
-            path_parts = parsed_url.path.strip("/").split("/")
-            if len(path_parts) < 4 or path_parts[2] not in ["issues", "pull"]:
-                return None
+    The converter extracts the ZIP contents to an `extracted_<name>_zip` folder next to the archive,
+    processes each file using appropriate converters based on file extensions, and then combines the
+    results into a single markdown document. The folder is removed after successful processing unless `cleanup_extracted` is falsy.

-            if not github_token:
-                raise ValueError(
-                    "GitHub token is not set. Cannot convert GitHub issue or pull request."
-                )
+    Example output format:
+    ```markdown
+    Content from the zip file `example.zip`:

-            if path_parts[2] == "issues":
-                return self._convert_github_issue(github_url, github_token)
-            elif path_parts[2] == "pull":
-                return self._convert_github_pr(github_url, github_token)
+    ## File: docs/readme.txt

-        return None
+    This is the content of readme.txt
+    Multiple lines are preserved

-    def _convert_github_issue(
-        self, issue_url: str, github_token: str
-    ) -> DocumentConverterResult:
-        """
-        Convert a GitHub issue to a markdown document.
-        Args:
-            issue_url (str): The URL of the GitHub issue to convert.
-            github_token (str): A GitHub token with access to the repository.
-        Returns:
-            DocumentConverterResult: The result containing the issue title and markdown content.
-        Raises:
-            ImportError: If the PyGithub library is not installed.
-            ValueError: If the provided URL is not a valid GitHub issue URL.
-        """
-        if not IS_GITHUB_ISSUE_CAPABLE:
-            raise ImportError(
-                "PyGithub is not installed. Please install it to use this feature."
+    ## File: images/example.jpg
+
+    ImageSize: 1920x1080
+    DateTimeOriginal: 2024-02-15 14:30:00
+    Description: A beautiful landscape photo
+
+    ## File: data/report.xlsx
+
+    ## Sheet1
+    | Column1 | Column2 | Column3 |
+    |---------|---------|---------|
+    | data1   | data2   | data3   |
+    | data4   | data5   | data6   |
+    ```
+
+    Key features:
+    - Maintains original file structure in headings
+    - Processes files in nested folders recursively (nested ZIP archives are not expanded)
+    - Uses appropriate converters for each file type
+    - Preserves formatting of converted content
+    - Cleans up temporary files after processing
+    """
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if not a ZIP
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".zip":
+            return None
+
+        # Get parent converters list if available
+        parent_converters = kwargs.get("_parent_converters", [])
+        if not parent_converters:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
            )

-        # Parse the issue URL
-        parsed_url = urlparse(issue_url)
-        path_parts = parsed_url.path.strip("/").split("/")
-        if len(path_parts) < 4 or path_parts[2] != "issues":
-            raise ValueError("Invalid GitHub issue URL")
-
-        owner, repo, _, issue_number = path_parts[:4]
-
-        # Authenticate with GitHub
-        g = Github(github_token)
-        repo = g.get_repo(f"{owner}/{repo}")
-        issue = repo.get_issue(int(issue_number))
-
-        # Convert issue details to markdown
-        markdown_content = f"# {issue.title}\n\n{issue.body}\n\n"
-        markdown_content += f"**State:** {issue.state}\n"
-        markdown_content += f"**Created at:** {issue.created_at}\n"
-        markdown_content += f"**Updated at:** {issue.updated_at}\n"
-        markdown_content += f"**Comments:**\n"
-
-        for comment in issue.get_comments():
-            markdown_content += (
-                f"- {comment.user.login} ({comment.created_at}): {comment.body}\n"
-            )
-
-        return DocumentConverterResult(
-            title=issue.title,
-            text_content=markdown_content,
+        extracted_zip_folder_name = (
+            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
        )
-
-    def _convert_github_pr(
-        self, pr_url: str, github_token: str
-    ) -> DocumentConverterResult:
-        """
-        Convert a GitHub pull request to a markdown document.
-        Args:
-            pr_url (str): The URL of the GitHub pull request to convert.
-            github_token (str): A GitHub token with access to the repository.
-        Returns:
-            DocumentConverterResult: The result containing the pull request title and markdown content.
-        Raises:
-            ImportError: If the PyGithub library is not installed.
-            ValueError: If the provided URL is not a valid GitHub pull request URL.
-        """
-        if not IS_GITHUB_ISSUE_CAPABLE:
-            raise ImportError(
-                "PyGithub is not installed. Please install it to use this feature."
-            )
-
-        # Parse the pull request URL
-        parsed_url = urlparse(pr_url)
-        path_parts = parsed_url.path.strip("/").split("/")
-        if len(path_parts) < 4 or path_parts[2] != "pull":
-            raise ValueError("Invalid GitHub pull request URL")
-
-        owner, repo, _, pr_number = path_parts[:4]
-
-        # Authenticate with GitHub
-        g = Github(github_token)
-        repo = g.get_repo(f"{owner}/{repo}")
-        pr = repo.get_pull(int(pr_number))
-
-        # Convert pull request details to markdown
-        markdown_content = f"# {pr.title}\n\n{pr.body}\n\n"
-        markdown_content += f"**State:** {pr.state}\n"
-        markdown_content += f"**Created at:** {pr.created_at}\n"
-        markdown_content += f"**Updated at:** {pr.updated_at}\n"
-        markdown_content += f"**Comments:**\n"
-
-        for comment in pr.get_issue_comments():
-            markdown_content += (
-                f"- {comment.user.login} ({comment.created_at}): {comment.body}\n"
-            )
-
-        return DocumentConverterResult(
-            title=pr.title,
-            text_content=markdown_content,
+        new_folder = os.path.normpath(
+            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
        )
+        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
+
+        # Safety check for path traversal
+        if not new_folder.startswith(os.path.dirname(local_path)):
+            return DocumentConverterResult(
+                title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
+            )
+
+        try:
+            # Extract the zip file
+            with zipfile.ZipFile(local_path, "r") as zipObj:
+                zipObj.extractall(path=new_folder)
+
+            # Process each extracted file
+            for root, dirs, files in os.walk(new_folder):
+                for name in files:
+                    file_path = os.path.join(root, name)
+                    relative_path = os.path.relpath(file_path, new_folder)
+
+                    # Get file extension
+                    _, file_extension = os.path.splitext(name)
+
+                    # Update kwargs for the file
+                    file_kwargs = kwargs.copy()
+                    file_kwargs["file_extension"] = file_extension
+                    file_kwargs["_parent_converters"] = parent_converters
+
+                    # Try converting the file using available converters
+                    for converter in parent_converters:
+                        # Skip the zip converter to avoid infinite recursion
+                        if isinstance(converter, ZipConverter):
+                            continue
+
+                        result = converter.convert(file_path, **file_kwargs)
+                        if result is not None:
+                            md_content += f"\n## File: {relative_path}\n\n"
+                            md_content += result.text_content + "\n\n"
+                            break
+
+            # Clean up extracted files if specified
+            if kwargs.get("cleanup_extracted", True):
+                shutil.rmtree(new_folder)
+
+            return DocumentConverterResult(title=None, text_content=md_content.strip())
+
+        except zipfile.BadZipFile:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
+            )
+        except Exception as e:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
+            )


 class FileConversionException(BaseException):
@@ -982,16 +1009,50 @@ class MarkItDown:
    def __init__(
        self,
        requests_session: Optional[requests.Session] = None,
+        llm_client: Optional[Any] = None,
+        llm_model: Optional[str] = None,
+        style_map: Optional[str] = None,
+        # Deprecated
        mlm_client: Optional[Any] = None,
-        mlm_model: Optional[Any] = None,
+        mlm_model: Optional[str] = None,
    ):
        if requests_session is None:
            self._requests_session = requests.Session()
        else:
            self._requests_session = requests_session

-        self._mlm_client = mlm_client
-        self._mlm_model = mlm_model
+        # Handle deprecation notices
+        #############################
+        if mlm_client is not None:
+            if llm_client is None:
+                warn(
+                    "'mlm_client' is deprecated, and was renamed 'llm_client'.",
+                    DeprecationWarning,
+                )
+                llm_client = mlm_client
+                mlm_client = None
+            else:
+                raise ValueError(
+                    "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
+                )
+
+        if mlm_model is not None:
+            if llm_model is None:
+                warn(
+                    "'mlm_model' is deprecated, and was renamed 'llm_model'.",
+                    DeprecationWarning,
+                )
+                llm_model = mlm_model
+                mlm_model = None
+            else:
+                raise ValueError(
+                    "'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
+                )
+        #############################
+
+        self._llm_client = llm_client
+        self._llm_model = llm_model
+        self._style_map = style_map

        self._page_converters: List[DocumentConverter] = []

@@ -1010,6 +1071,7 @@ class MarkItDown:
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
        self.register_page_converter(PdfConverter())
+        self.register_page_converter(ZipConverter())

    def convert(
        self, source: Union[str, requests.Response], **kwargs: Any
@@ -1019,6 +1081,7 @@ class MarkItDown:
            - source: can be a string representing a path or url, or a requests.response object
            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
        """
+
        # Local path or url
        if isinstance(source, str):
            if (
@@ -1033,28 +1096,6 @@ class MarkItDown:
        elif isinstance(source, requests.Response):
            return self.convert_response(source, **kwargs)

-    def convert_url(
-        self, url: str, **kwargs: Any
-    ) -> DocumentConverterResult:  # TODO: fix kwargs type
-        # Handle GitHub issue and pull request URLs directly
-        parsed_url = urlparse(url)
-        if parsed_url.hostname == "github.com" and any(
-            x in parsed_url.path for x in ["/issues/", "/pull/"]
-        ):
-            github_token = kwargs.get("github_token", os.getenv("GITHUB_TOKEN"))
-            if not github_token:
-                raise ValueError(
-                    "GitHub token is required for GitHub issue or pull request conversion."
-                )
-            return GitHubIssueConverter().convert(
-                github_url=url, github_token=github_token
-            )
-
-        # Send a HTTP request to the URL
-        response = self._requests_session.get(url, stream=True)
-        response.raise_for_status()
-        return self.convert_response(response, **kwargs)
-
    def convert_local(
        self, path: str, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
@@ -1109,6 +1150,14 @@ class MarkItDown:

        return result

+    def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult:
+        """Fetch `url` and hand the HTTP response to `convert_response`."""
+        # TODO: fix kwargs type
+        # Stream the body; an HTTP error status raises before any conversion.
+        reply = self._requests_session.get(url, stream=True)
+        reply.raise_for_status()
+        return self.convert_response(reply, **kwargs)
+
    def convert_response(
        self, response: requests.Response, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO fix kwargs type
@@ -1146,7 +1195,7 @@ class MarkItDown:
                self._append_ext(extensions, g)

            # Convert
-            result = self._convert(temp_path, extensions, url=response.url)
+            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
        # Clean up
        finally:
            try:
@@ -1173,11 +1222,17 @@ class MarkItDown:
                    _kwargs.update({"file_extension": ext})

                # Copy any additional global options
-                if "mlm_client" not in _kwargs and self._mlm_client is not None:
-                    _kwargs["mlm_client"] = self._mlm_client
+                if "llm_client" not in _kwargs and self._llm_client is not None:
+                    _kwargs["llm_client"] = self._llm_client

-                if "mlm_model" not in _kwargs and self._mlm_model is not None:
-                    _kwargs["mlm_model"] = self._mlm_model
+                if "llm_model" not in _kwargs and self._llm_model is not None:
+                    _kwargs["llm_model"] = self._llm_model
+
+                # Add the list of converters for nested processing
+                _kwargs["_parent_converters"] = self._page_converters
+
+                if "style_map" not in _kwargs and self._style_map is not None:
+                    _kwargs["style_map"] = self._style_map

                # If we hit an error log it and keep trying
                try:
@@ -1214,8 +1269,7 @@ class MarkItDown:
        if ext == "":
            return
        # if ext not in extensions:
-        if True:
-            extensions.append(ext)
+        extensions.append(ext)

    def _guess_ext_magic(self, path):
        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
--- a/tests/test_files/test.docx
+++ b/tests/test_files/test.docx
--- a/tests/test_files/test.jpg
+++ b/tests/test_files/test.jpg
--- a/tests/test_files/test.pptx
+++ b/tests/test_files/test.pptx
--- a/tests/test_files/test.xlsx
+++ b/tests/test_files/test.xlsx
--- a/tests/test_files/test_files.zip
+++ b/tests/test_files/test_files.zip
--- a/tests/test_files/test_llm.jpg
+++ b/tests/test_files/test_llm.jpg
--- a/tests/test_files/test_mskanji.csv
+++ b/tests/test_files/test_mskanji.csv
@@ -0,0 +1,4 @@
+<EFBFBD><EFBFBD><EFBFBD>O,<EFBFBD>N<EFBFBD><EFBFBD>,<EFBFBD>Z<EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Y,30,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+<EFBFBD>O<EFBFBD>؉p<EFBFBD>q,25,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
+<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>~,35,<EFBFBD><EFBFBD><EFBFBD>É<EFBFBD>
--- a/tests/test_files/test_with_comment.docx
+++ b/tests/test_files/test_with_comment.docx
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -6,11 +6,23 @@ import shutil
 import pytest
 import requests

+from warnings import catch_warnings, resetwarnings
+
 from markitdown import MarkItDown

 skip_remote = (
    True if os.environ.get("GITHUB_ACTIONS") else False
 )  # Don't run these tests in CI
+
+
+# Don't run the llm tests without a key and the client library
+skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
+try:
+    import openai
+except ModuleNotFoundError:
+    skip_llm = True
+
+# Skip exiftool tests if not installed
 skip_exiftool = shutil.which("exiftool") is None

 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -51,12 +63,25 @@ DOCX_TEST_STRINGS = [
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
 ]

+DOCX_COMMENT_TEST_STRINGS = [
+    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+    "49e168b7-d2ae-407f-a055-2167576f39a1",
+    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+    "# Abstract",
+    "# Introduction",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "This is a test comment. 12df-321a",
+    "Yet another comment in the doc. 55yiyi-asd09",
+]
+
 PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
+    "2003",  # chart value
 ]

 BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
@@ -87,9 +112,16 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
 ]

-GITHUB_ISSUE_URL = "https://github.com/microsoft/autogen/issues/1421"
-GITHUB_PR_URL = "https://github.com/microsoft/autogen/pull/194"
-GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
+CSV_CP932_TEST_STRINGS = [
+    "名前,年齢,住所",
+    "佐藤太郎,30,東京",
+    "三木英子,25,大阪",
+    "髙橋淳,35,名古屋",
+]
+
+LLM_TEST_STRINGS = [
+    "5bda1dd6",
+]


@pytest.mark.skipif(
@@ -134,6 +166,24 @@ def test_markitdown_local() -> None:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

+    # Test DOCX processing, with comments
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
+        style_map="comment-reference => ",
+    )
+    for test_string in DOCX_COMMENT_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
+    # Test DOCX processing, with comments and setting style_map on init
+    markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
+    result = markitdown_with_style_map.convert(
+        os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
+    )
+    for test_string in DOCX_COMMENT_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
    # Test PPTX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
    for test_string in PPTX_TEST_STRINGS:
@@ -148,6 +198,12 @@ def test_markitdown_local() -> None:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content

+    # Test ZIP file processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
+    for test_string in DOCX_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
    # Test Wikipedia processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
@@ -168,6 +224,12 @@ def test_markitdown_local() -> None:
    for test_string in SERP_TEST_STRINGS:
        assert test_string in text_content

+    ## Test non-UTF-8 encoding
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
+    text_content = result.text_content.replace("\\", "")
+    for test_string in CSV_CP932_TEST_STRINGS:
+        assert test_string in text_content
+

@pytest.mark.skipif(
    skip_exiftool,
@@ -183,28 +245,57 @@ def test_markitdown_exiftool() -> None:
        assert target in result.text_content


-@pytest.mark.skipif(
-    not GITHUB_TOKEN,
-    reason="GitHub token not provided",
-)
-def test_markitdown_github_issue() -> None:
-    markitdown = MarkItDown()
-    result = markitdown.convert(GITHUB_ISSUE_URL, github_token=GITHUB_TOKEN)
-    print(result.text_content)
-    assert "User-Defined Functions" in result.text_content
-    assert "closed" in result.text_content
-    assert "Comments:" in result.text_content
+def test_markitdown_deprecation() -> None:
+    try:
+        with catch_warnings(record=True) as w:
+            test_client = object()
+            markitdown = MarkItDown(mlm_client=test_client)
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert markitdown._llm_client == test_client
+    finally:
+        resetwarnings()
+
+    try:
+        with catch_warnings(record=True) as w:
+            markitdown = MarkItDown(mlm_model="gpt-4o")
+            assert len(w) == 1
+            assert w[0].category is DeprecationWarning
+            assert markitdown._llm_model == "gpt-4o"
+    finally:
+        resetwarnings()
+
+    try:
+        test_client = object()
+        markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
+        assert False
+    except ValueError:
+        pass
+
+    try:
+        markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
+        assert False
+    except ValueError:
+        pass


@pytest.mark.skipif(
-    not GITHUB_TOKEN,
-    reason="GitHub token not provided",
+    skip_llm,
+    reason="do not run llm tests without a key",
 )
-def test_markitdown_github_pr() -> None:
-    markitdown = MarkItDown()
-    result = markitdown.convert(GITHUB_PR_URL, github_token=GITHUB_TOKEN)
-    print(result.text_content)
-    assert "faq" in result.text_content
+def test_markitdown_llm() -> None:
+    client = openai.OpenAI()
+    markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
+
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
+
+    for test_string in LLM_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # This is not super precise. It would also accept "red square", "blue circle",
+    # "the square is not blue", etc. But it's sufficient for this test.
+    for test_string in ["red", "circle", "blue", "square"]:
+        assert test_string in result.text_content.lower()


 if __name__ == "__main__":
@@ -212,5 +303,5 @@ if __name__ == "__main__":
    test_markitdown_remote()
    test_markitdown_local()
    test_markitdown_exiftool()
-    test_markitdown_github_issue()
-    test_markitdown_github_pr()
+    test_markitdown_deprecation()
+    test_markitdown_llm()
Author	SHA1	Message	Date
afourney	3ce21a47ab	Merge pull request #102 from microsoft/bump_version Bump version.	2024-12-17 13:55:12 -08:00
Adam Fourney	9518c01d4e	Bump version.	2024-12-17 13:51:13 -08:00
afourney	22504551ef	Merge pull request #101 from microsoft/add_deprecation_warnings Added deprecation warnings for mlm_* arguments.	2024-12-17 13:49:44 -08:00
Adam Fourney	95188a4a27	Merge main.	2024-12-17 13:46:26 -08:00
afourney	e69d012b86	Merge pull request #100 from microsoft/add_llm_tests	2024-12-17 13:36:36 -08:00
Adam Fourney	03a7843a0a	Added deprecation warnings for mlm_* arguments.	2024-12-17 13:22:48 -08:00
Adam Fourney	248d64edd0	Added llm tests to the local test set.	2024-12-17 12:13:19 -08:00
gagb	ad5d4fb139	Merge pull request #77 from microsoft/kevinclb/main Kevinclb/main	2024-12-16 18:14:09 -08:00
gagb	ad29122592	run precommit	2024-12-16 18:09:48 -08:00
gagb	898bfd4774	Merge branch 'main' into main	2024-12-16 18:00:26 -08:00
gagb	c8980d9f41	Merge pull request #75 from microsoft/cybernobie/main Cybernobie/main	2024-12-16 17:40:13 -08:00
gagb	24b52b2b8f	Improve readme	2024-12-16 17:35:47 -08:00
gagb	09159aa04e	Merge branch 'main' into main	2024-12-16 17:24:47 -08:00
gagb	77f620b568	Merge pull request #67 from DIMAX99/issue#65 fix issue #65	2024-12-16 17:18:53 -08:00
gagb	825d3bbb77	Merge branch 'main' into issue#65	2024-12-16 17:09:53 -08:00
gagb	c0127af120	Merge pull request #72 from CharlesCNorton/patch-1 Fix LLM terms	2024-12-16 17:06:24 -08:00
gagb	33cb5015eb	Merge branch 'main' into patch-1	2024-12-16 17:04:44 -08:00
gagb	cf13b7e657	Merge pull request #73 from CharlesCNorton/patch-2 Fix LLM terminology in code	2024-12-16 17:04:33 -08:00
gagb	874eba6265	Merge branch 'main' into patch-2	2024-12-16 16:59:22 -08:00
gagb	c3fa2934b9	Run pre-commit	2024-12-16 16:56:52 -08:00
gagb	736e7d9a7e	Merge branch 'main' into patch-1	2024-12-16 16:53:58 -08:00
gagb	19c111251b	Merge pull request #60 from madduci/main Added Dockerfile	2024-12-16 16:42:26 -08:00
gagb	360c2dd95f	Merge branch 'main' into main	2024-12-16 16:35:50 -08:00
kevinbabou	87846cf5f8	rm setup.py	2024-12-16 16:28:44 -08:00
kevinbabou	33638f1fe6	feature: add argument parsing and setup.py file for cli tool capability	2024-12-16 16:28:44 -08:00
gagb	73776b2c0f	Merge pull request #50 from narumiruna/youtube-transcript-languages Support specifying YouTube transcript language	2024-12-16 16:23:20 -08:00
gagb	2d3ffeade1	Merge branch 'main' into youtube-transcript-languages	2024-12-16 16:20:35 -08:00
gagb	51c1453699	Merge pull request #48 from Soulter/main Fix: pass the kwargs to _convert method when converting an url file	2024-12-16 16:19:09 -08:00
gagb	ae4669107c	Merge branch 'main' into main	2024-12-16 16:01:59 -08:00
gagb	b0115cf971	Merge branch 'main' into youtube-transcript-languages	2024-12-16 15:47:38 -08:00
gagb	5cf8474f37	Merge pull request #44 from Y-Kim-64/main Exclude test files from language statistics using linguist-vendored	2024-12-16 15:35:19 -08:00
gagb	83dc81170b	Merge branch 'main' into main	2024-12-16 15:29:33 -08:00
gagb	e7a2e20d93	Merge pull request #39 from SH4DOW4RE/main Catching pydub's warning of ffmpeg or avconv missing	2024-12-16 15:28:53 -08:00
gagb	980abd3a60	Merge branch 'main' into main	2024-12-16 15:24:58 -08:00
afourney	6587e0f097	Merge branch 'main' into patch-1	2024-12-16 14:27:26 -08:00
afourney	978c8763aa	Merge pull request #38 from VillePuuska/support-comments-in-docx Add passing style_map kwarg to Mammoth when converting docx to allow keeping comments	2024-12-16 14:26:55 -08:00
afourney	e7636656d8	Merge branch 'main' into support-comments-in-docx	2024-12-16 14:23:14 -08:00
afourney	ddc1bebea4	Merge branch 'main' into patch-2	2024-12-16 14:20:16 -08:00
afourney	fa1f496d51	Merge branch 'main' into patch-1	2024-12-16 14:18:20 -08:00
afourney	da779dd125	Merge pull request #33 from nyosegawa/feature/add-pptx-chart-support Add PPTX chart support	2024-12-16 14:11:49 -08:00
afourney	12ce5e95b2	Merge branch 'main' into feature/add-pptx-chart-support	2024-12-16 14:06:14 -08:00
gagb	6dad1cca96	Merge pull request #22 from Josh-XT/main Add zip handling	2024-12-16 13:56:25 -08:00
gagb	9e6a19987b	Merge branch 'main' into main	2024-12-16 13:51:39 -08:00
gagb	ed91e8b534	Merge pull request #19 from brc-dd/fix/18 Fix character decoding issues with text-like files	2024-12-16 13:49:48 -08:00
gagb	aeff2cb5ae	Merge branch 'main' into fix/18	2024-12-16 13:46:17 -08:00
gagb	c9c7d98d30	Merge pull request #11 from simonw/patch-2 CLI usage instructions	2024-12-16 13:45:05 -08:00
gagb	e7d9b5546a	Merge branch 'main' into patch-2	2024-12-16 13:42:28 -08:00
CharlesCNorton	ed651aeb16	Fix LLM terminology in code Replaced all occurrences of mlm_client and mlm_model with llm_client and llm_model for consistent terminology when referencing Large Language Models (LLMs).	2024-12-16 16:23:52 -05:00
CharlesCNorton	3d9f3f3e5b	Fix LLM terms Updated all instances of mlm_client and mlm_model to llm_client and llm_model in the readme. The previous terms (mlm_client and mlm_model) are incorrect in the context of configuring Large Language Models (LLMs), as "MLM" typically refers to Masked Language Models, which is unrelated to the intended functionality. This change aligns the documentation with standard naming conventions for LLM configuration parameters and improves clarity for users integrating with LLMs like OpenAI's GPT models.	2024-12-16 16:23:03 -05:00
Divit	ad01da308d	fix issue #65	2024-12-16 21:48:33 +05:30
CyberNobie	010f841008	Ensure hatch is installed before running tests	2024-12-16 18:47:24 +05:30
Michele Adduci	5fc03b6415	Added UID as argument	2024-12-16 13:11:13 +01:00
Michele Adduci	013b022427	Added Docker Image for using markitdown in a sandboxed environment	2024-12-16 13:08:15 +01:00
narumi	695100d5d8	Support specifying YouTube transcript language	2024-12-16 13:16:00 +08:00
Soulter	d66ef5fcca	Update README to introduce the customized mlm_prompt	2024-12-16 12:08:51 +08:00
Soulter	c168703d5e	Pass the kwargs to _convert method when converting an url file	2024-12-16 11:41:39 +08:00
Yeonjun	3548c96dd3	Create .gitattributes Mark test files as linguist-vendored	2024-12-16 09:21:07 +09:00
SH4DOW4RE	1559d9d163	pre-commit ran	2024-12-15 22:15:20 +01:00
SH4DOW4RE	b7f5662ffd	PR: Catching pydub's warning of ffmpeg or avconv missing	2024-12-15 17:29:14 +01:00
Ville Puuska	0a7203b876	add style_map prop to MarkItDown class	2024-12-15 17:23:57 +02:00
Ville Puuska	0704b0b6ff	pass 'style_map' kwarg to mammoth when converting docx	2024-12-15 16:59:21 +02:00
sakasegawa	0dd4e95584	Remove _is_chart	2024-12-15 21:14:58 +09:00
sakasegawa	93130b5ba5	Add PPTX chart support	2024-12-15 20:42:55 +09:00
Divyansh Singh	52b723724c	Fix character decoding issues with text-like files	2024-12-15 10:37:59 +05:30
Josh XT	a55c3d525c	Merge branch 'main' into main	2024-12-14 23:09:30 -05:00
gagb	81e3f24acd	Merge pull request #29 from microsoft/gagb-patch-1 Update README.md	2024-12-14 19:17:54 -08:00
gagb	b84294620a	Update README.md	2024-12-14 19:05:51 -08:00
gagb	60c495d609	Merge branch 'main' into patch-2	2024-12-14 18:57:11 -08:00
gagb	71123a4df3	Merge pull request #7 from microsoft/gagb/improve-readme Improve the readme with contributing guidelines	2024-12-14 18:54:28 -08:00
gagb	5753e553fe	Fix conflicts	2024-12-14 18:47:34 -08:00
gagb	752dd897b9	Merge pull request #28 from pawarbi/main Update README.md	2024-12-14 18:44:52 -08:00
gagb	1aa4abe90f	Merge branch 'gagb/improve-readme' into main	2024-12-14 18:44:33 -08:00
gagb	ea7c6dcc40	Merge pull request #27 from haesleinhuepf/patch-1 Add installation instructions from haesleinhuepf:patch-1	2024-12-14 18:39:51 -08:00
gagb	a31c0a13e7	Merge branch 'main' into gagb/improve-readme	2024-12-14 18:34:27 -08:00
Sandeep Pawar	30ab78fe9e	Update README.md I have updated the readme with three changes: - Created sections for Installation and Usage to help users - Added installation instruction - Added additional example of using LLM. This will be the primary use case and will help users.	2024-12-14 19:15:10 -06:00
gagb	559b1fc62a	Merge branch 'main' into patch-2	2024-12-14 15:02:42 -08:00
Josh XT	df03382218	Improve docustring	2024-12-14 17:55:22 -05:00
Robert Haase	18301edcd0	Add installation instructions	2024-12-14 23:22:54 +01:00
Josh XT	4987201ef6	test	2024-12-14 08:49:03 -05:00
Josh XT	571c5bbc0e	add test	2024-12-14 08:45:51 -05:00
Josh XT	e8ea8b6f3d	Update readme	2024-12-14 08:41:07 -05:00
Josh XT	7e634acf5f	import zipfile	2024-12-14 08:24:44 -05:00
Josh XT	862c39029e	add zip handling	2024-12-14 06:34:47 -05:00
Simon Willison	33ce17954d	Note about piping	2024-12-13 11:09:03 -08:00
Simon Willison	6ebef5af0c	CLI usage instructions Plus added a PyPI badge	2024-12-13 11:06:11 -08:00
gagb	3f9ba06418	Improve the readme with contributing guidelines Addresses issue https://github.com/microsoft/markitdown/issues/6 --- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown?shareId=XXXX-XXXX-XXXX-XXXX).	2024-12-12 15:17:18 -08:00