86 Commits

Author SHA1 Message Date
afourney
3ce21a47ab Merge pull request #102 from microsoft/bump_version
Bump version.
2024-12-17 13:55:12 -08:00
Adam Fourney
9518c01d4e Bump version. 2024-12-17 13:51:13 -08:00
afourney
22504551ef Merge pull request #101 from microsoft/add_deprecation_warnings
Added deprecation warnings for mlm_* arguments.
2024-12-17 13:49:44 -08:00
Adam Fourney
95188a4a27 Merge main. 2024-12-17 13:46:26 -08:00
afourney
e69d012b86 Merge pull request #100 from microsoft/add_llm_tests 2024-12-17 13:36:36 -08:00
Adam Fourney
03a7843a0a Added deprecation warnings for mlm_* arguments. 2024-12-17 13:22:48 -08:00
Adam Fourney
248d64edd0 Added llm tests to the local test set. 2024-12-17 12:13:19 -08:00
gagb
ad5d4fb139 Merge pull request #77 from microsoft/kevinclb/main
Kevinclb/main
2024-12-16 18:14:09 -08:00
gagb
ad29122592 run precommit 2024-12-16 18:09:48 -08:00
gagb
898bfd4774 Merge branch 'main' into main 2024-12-16 18:00:26 -08:00
gagb
c8980d9f41 Merge pull request #75 from microsoft/cybernobie/main
Cybernobie/main
2024-12-16 17:40:13 -08:00
gagb
24b52b2b8f Improve readme 2024-12-16 17:35:47 -08:00
gagb
09159aa04e Merge branch 'main' into main 2024-12-16 17:24:47 -08:00
gagb
77f620b568 Merge pull request #67 from DIMAX99/issue#65
fix issue #65
2024-12-16 17:18:53 -08:00
gagb
825d3bbb77 Merge branch 'main' into issue#65 2024-12-16 17:09:53 -08:00
gagb
c0127af120 Merge pull request #72 from CharlesCNorton/patch-1
Fix LLM terms
2024-12-16 17:06:24 -08:00
gagb
33cb5015eb Merge branch 'main' into patch-1 2024-12-16 17:04:44 -08:00
gagb
cf13b7e657 Merge pull request #73 from CharlesCNorton/patch-2
Fix LLM terminology in code
2024-12-16 17:04:33 -08:00
gagb
874eba6265 Merge branch 'main' into patch-2 2024-12-16 16:59:22 -08:00
gagb
c3fa2934b9 Run pre-commit 2024-12-16 16:56:52 -08:00
gagb
736e7d9a7e Merge branch 'main' into patch-1 2024-12-16 16:53:58 -08:00
gagb
19c111251b Merge pull request #60 from madduci/main
Added Dockerfile
2024-12-16 16:42:26 -08:00
gagb
360c2dd95f Merge branch 'main' into main 2024-12-16 16:35:50 -08:00
kevinbabou
87846cf5f8 rm setup.py 2024-12-16 16:28:44 -08:00
kevinbabou
33638f1fe6 feature: add argument parsing and setup.py file for cli tool capability 2024-12-16 16:28:44 -08:00
gagb
73776b2c0f Merge pull request #50 from narumiruna/youtube-transcript-languages
Support specifying YouTube transcript language
2024-12-16 16:23:20 -08:00
gagb
2d3ffeade1 Merge branch 'main' into youtube-transcript-languages 2024-12-16 16:20:35 -08:00
gagb
51c1453699 Merge pull request #48 from Soulter/main
Fix: pass the kwargs to _convert method when converting an url file
2024-12-16 16:19:09 -08:00
gagb
ae4669107c Merge branch 'main' into main 2024-12-16 16:01:59 -08:00
gagb
b0115cf971 Merge branch 'main' into youtube-transcript-languages 2024-12-16 15:47:38 -08:00
gagb
5cf8474f37 Merge pull request #44 from Y-Kim-64/main
Exclude test files from language statistics using linguist-vendored
2024-12-16 15:35:19 -08:00
gagb
83dc81170b Merge branch 'main' into main 2024-12-16 15:29:33 -08:00
gagb
e7a2e20d93 Merge pull request #39 from SH4DOW4RE/main
Catching pydub's warning of ffmpeg or avconv missing
2024-12-16 15:28:53 -08:00
gagb
980abd3a60 Merge branch 'main' into main 2024-12-16 15:24:58 -08:00
afourney
6587e0f097 Merge branch 'main' into patch-1 2024-12-16 14:27:26 -08:00
afourney
978c8763aa Merge pull request #38 from VillePuuska/support-comments-in-docx
Add passing style_map kwarg to Mammoth when converting docx to allow keeping comments
2024-12-16 14:26:55 -08:00
afourney
e7636656d8 Merge branch 'main' into support-comments-in-docx 2024-12-16 14:23:14 -08:00
afourney
ddc1bebea4 Merge branch 'main' into patch-2 2024-12-16 14:20:16 -08:00
afourney
fa1f496d51 Merge branch 'main' into patch-1 2024-12-16 14:18:20 -08:00
afourney
da779dd125 Merge pull request #33 from nyosegawa/feature/add-pptx-chart-support
Add PPTX chart support
2024-12-16 14:11:49 -08:00
afourney
12ce5e95b2 Merge branch 'main' into feature/add-pptx-chart-support 2024-12-16 14:06:14 -08:00
gagb
6dad1cca96 Merge pull request #22 from Josh-XT/main
Add zip handling
2024-12-16 13:56:25 -08:00
gagb
9e6a19987b Merge branch 'main' into main 2024-12-16 13:51:39 -08:00
gagb
ed91e8b534 Merge pull request #19 from brc-dd/fix/18
Fix character decoding issues with text-like files
2024-12-16 13:49:48 -08:00
gagb
aeff2cb5ae Merge branch 'main' into fix/18 2024-12-16 13:46:17 -08:00
gagb
c9c7d98d30 Merge pull request #11 from simonw/patch-2
CLI usage instructions
2024-12-16 13:45:05 -08:00
gagb
e7d9b5546a Merge branch 'main' into patch-2 2024-12-16 13:42:28 -08:00
CharlesCNorton
ed651aeb16 Fix LLM terminology in code
Replaced all occurrences of mlm_client and mlm_model with llm_client and llm_model for consistent terminology when referencing Large Language Models (LLMs).
2024-12-16 16:23:52 -05:00
CharlesCNorton
3d9f3f3e5b Fix LLM terms
Updated all instances of mlm_client and mlm_model to llm_client and llm_model in the readme. The previous terms (mlm_client and mlm_model) are incorrect in the context of configuring Large Language Models (LLMs), as "MLM" typically refers to Masked Language Models, which is unrelated to the intended functionality. This change aligns the documentation with standard naming conventions for LLM configuration parameters and improves clarity for users integrating with LLMs like OpenAI's GPT models.
2024-12-16 16:23:03 -05:00
Divit
ad01da308d fix issue #65 2024-12-16 21:48:33 +05:30
CyberNobie
010f841008 Ensure hatch is installed before running tests 2024-12-16 18:47:24 +05:30
Michele Adduci
5fc03b6415 Added UID as argument 2024-12-16 13:11:13 +01:00
Michele Adduci
013b022427 Added Docker Image for using markitdown in a sandboxed environment 2024-12-16 13:08:15 +01:00
narumi
695100d5d8 Support specifying YouTube transcript language 2024-12-16 13:16:00 +08:00
Soulter
d66ef5fcca Update README to introduce the customized mlm_prompt 2024-12-16 12:08:51 +08:00
Soulter
c168703d5e Pass the kwargs to _convert method when converting an url file 2024-12-16 11:41:39 +08:00
Yeonjun
3548c96dd3 Create .gitattributes
Mark test files as linguist-vendored
2024-12-16 09:21:07 +09:00
SH4DOW4RE
1559d9d163 pre-commit ran 2024-12-15 22:15:20 +01:00
SH4DOW4RE
b7f5662ffd PR: Catching pydub's warning of ffmpeg or avconv missing 2024-12-15 17:29:14 +01:00
Ville Puuska
0a7203b876 add style_map prop to MarkItDown class 2024-12-15 17:23:57 +02:00
Ville Puuska
0704b0b6ff pass 'style_map' kwarg to mammoth when converting docx 2024-12-15 16:59:21 +02:00
sakasegawa
0dd4e95584 Remove _is_chart 2024-12-15 21:14:58 +09:00
sakasegawa
93130b5ba5 Add PPTX chart support 2024-12-15 20:42:55 +09:00
Divyansh Singh
52b723724c Fix character decoding issues with text-like files 2024-12-15 10:37:59 +05:30
Josh XT
a55c3d525c Merge branch 'main' into main 2024-12-14 23:09:30 -05:00
gagb
81e3f24acd Merge pull request #29 from microsoft/gagb-patch-1
Update README.md
2024-12-14 19:17:54 -08:00
gagb
b84294620a Update README.md 2024-12-14 19:05:51 -08:00
gagb
60c495d609 Merge branch 'main' into patch-2 2024-12-14 18:57:11 -08:00
gagb
71123a4df3 Merge pull request #7 from microsoft/gagb/improve-readme
Improve the readme with contributing guidelines
2024-12-14 18:54:28 -08:00
gagb
5753e553fe Fix conflicts 2024-12-14 18:47:34 -08:00
gagb
752dd897b9 Merge pull request #28 from pawarbi/main
Update README.md
2024-12-14 18:44:52 -08:00
gagb
1aa4abe90f Merge branch 'gagb/improve-readme' into main 2024-12-14 18:44:33 -08:00
gagb
ea7c6dcc40 Merge pull request #27 from haesleinhuepf/patch-1
Add installation instructions from haesleinhuepf:patch-1
2024-12-14 18:39:51 -08:00
gagb
a31c0a13e7 Merge branch 'main' into gagb/improve-readme 2024-12-14 18:34:27 -08:00
Sandeep Pawar
30ab78fe9e Update README.md
I have updated the readme with three changes:
- Created sections for Installation and Usage to help users
- Added installation instruction
- Added additional example of using LLM. This will be the primary use case and will help users.
2024-12-14 19:15:10 -06:00
gagb
559b1fc62a Merge branch 'main' into patch-2 2024-12-14 15:02:42 -08:00
Josh XT
df03382218 Improve docustring 2024-12-14 17:55:22 -05:00
Robert Haase
18301edcd0 Add installation instructions 2024-12-14 23:22:54 +01:00
Josh XT
4987201ef6 test 2024-12-14 08:49:03 -05:00
Josh XT
571c5bbc0e add test 2024-12-14 08:45:51 -05:00
Josh XT
e8ea8b6f3d Update readme 2024-12-14 08:41:07 -05:00
Josh XT
7e634acf5f import zipfile 2024-12-14 08:24:44 -05:00
Josh XT
862c39029e add zip handling 2024-12-14 06:34:47 -05:00
Simon Willison
33ce17954d Note about piping 2024-12-13 11:09:03 -08:00
Simon Willison
6ebef5af0c CLI usage instructions
Plus added  a PyPI badge
2024-12-13 11:06:11 -08:00
gagb
3f9ba06418 Improve the readme with contributing guidelines
Addresses issue https://github.com/microsoft/markitdown/issues/6

---

For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/microsoft/markitdown?shareId=XXXX-XXXX-XXXX-XXXX).
2024-12-12 15:17:18 -08:00
17 changed files with 457 additions and 207 deletions

.dockerignore (new file, 1 line)

@@ -0,0 +1 @@
+*

.gitattributes (vendored, new file, 1 line)

@@ -0,0 +1 @@
+tests/test_files/** linguist-vendored

Dockerfile (new file, 16 lines)

@@ -0,0 +1,16 @@
+FROM python:3.13-alpine
+
+USER root
+
+# Runtime dependency
+RUN apk add --no-cache ffmpeg
+
+RUN pip install markitdown
+
+# Default USERID and GROUPID
+ARG USERID=10000
+ARG GROUPID=10000
+
+USER $USERID:$GROUPID
+
+ENTRYPOINT [ "markitdown" ]


@@ -1,5 +1,7 @@
 # MarkItDown
+[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
+
 The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)
 
 It presently supports:

@@ -12,7 +14,23 @@ It presently supports:
 - Audio (EXIF metadata, and speech transcription)
 - HTML (special handling of Wikipedia, etc.)
 - Various other text-based formats (csv, json, xml, etc.)
+- ZIP (Iterates over contents and converts each file)
+
+# Installation
+
+You can install `markitdown` using pip:
+
+```python
+pip install markitdown
+```
+
+or from the source
+
+```sh
+pip install -e .
+```
+
+# Usage
 
 The API is simple:

@@ -23,6 +41,44 @@ result = markitdown.convert("test.xlsx")
 print(result.text_content)
 ```
 
+To use this as a command-line utility, install it and then run it like this:
+
+```bash
+markitdown path-to-file.pdf
+```
+
+This will output Markdown to standard output. You can save it like this:
+
+```bash
+markitdown path-to-file.pdf > document.md
+```
+
+You can pipe content to standard input by omitting the argument:
+
+```bash
+cat path-to-file.pdf | markitdown
+```
+
+You can also configure markitdown to use Large Language Models to describe images. To do so you must provide `llm_client` and `llm_model` parameters to MarkItDown object, according to your specific client.
+
+```python
+from markitdown import MarkItDown
+from openai import OpenAI
+
+client = OpenAI()
+md = MarkItDown(llm_client=client, llm_model="gpt-4o")
+result = md.convert("example.jpg")
+print(result.text_content)
+```
+
+You can also use the project as Docker Image:
+
+```sh
+docker build -t markitdown:latest .
+docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
+```
+
 ## Contributing
 
 This project welcomes contributions and suggestions. Most contributions require you to agree to a

@@ -37,6 +93,24 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope
 For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
 contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
 
+### Running Tests
+
+To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install).
+
+```sh
+pip install hatch
+hatch shell
+hatch test
+```
+
+### Running Pre-commit Checks
+
+Please run the pre-commit checks before submitting a PR.
+
+```sh
+pre-commit run --all-files
+```
+
 ## Trademarks
 
 This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft


@@ -38,7 +38,8 @@ dependencies = [
   "youtube-transcript-api",
   "SpeechRecognition",
   "pathvalidate",
-  "pygithub"
+  "charset-normalizer",
+  "openai",
 ]
 
 [project.urls]

@@ -77,3 +78,6 @@ exclude_lines = [
   "if __name__ == .__main__.:",
   "if TYPE_CHECKING:",
 ]
+
+[tool.hatch.build.targets.sdist]
+only-include = ["src/markitdown"]


@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.1a1"
+__version__ = "0.0.1a3"


@@ -2,21 +2,15 @@
 #
 # SPDX-License-Identifier: MIT
 import sys
+import argparse
 from ._markitdown import MarkItDown
 
 
 def main():
-    if len(sys.argv) == 1:
-        markitdown = MarkItDown()
-        result = markitdown.convert_stream(sys.stdin.buffer)
-        print(result.text_content)
-    elif len(sys.argv) == 2:
-        markitdown = MarkItDown()
-        result = markitdown.convert(sys.argv[1])
-        print(result.text_content)
-    else:
-        sys.stderr.write(
-            """
+    parser = argparse.ArgumentParser(
+        description="Convert various file formats to markdown.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        usage="""
 SYNTAX:
 
     markitdown <OPTIONAL: FILENAME>

@@ -33,9 +27,20 @@ EXAMPLE:
     OR
 
     markitdown < example.pdf
-""".strip()
-            + "\n"
-        )
+""".strip(),
+    )
+
+    parser.add_argument("filename", nargs="?")
+    args = parser.parse_args()
+
+    if args.filename is None:
+        markitdown = MarkItDown()
+        result = markitdown.convert_stream(sys.stdin.buffer)
+        print(result.text_content)
+    else:
+        markitdown = MarkItDown()
+        result = markitdown.convert(args.filename)
+        print(result.text_content)
 
 
 if __name__ == "__main__":
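The rewritten entry point keeps stdin piping while adding an optional positional argument. A minimal sketch of the same `argparse` pattern (a hypothetical standalone parser, not the markitdown source itself):

```python
import argparse


def parse(argv):
    # nargs="?" makes the positional argument optional;
    # filename is None when nothing is passed, signalling stdin mode.
    parser = argparse.ArgumentParser(description="Convert a file to markdown.")
    parser.add_argument("filename", nargs="?")
    return parser.parse_args(argv)


args = parse([])                 # no argument: filename is None
args = parse(["example.pdf"])    # with argument: filename is "example.pdf"
```

Compared with the old manual `len(sys.argv)` checks, `argparse` also provides `--help` output and error handling for free.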


@@ -12,8 +12,10 @@ import subprocess
 import sys
 import tempfile
 import traceback
+import zipfile
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
+from warnings import warn, resetwarnings, catch_warnings
 
 import mammoth
 import markdownify
@@ -26,15 +28,24 @@
 import pptx
 import puremagic
 import requests
 from bs4 import BeautifulSoup
+from charset_normalizer import from_path
 
 # Optional Transcription support
 try:
-    import pydub
+    # Using warnings' catch_warnings to catch
+    # pydub's warning of ffmpeg or avconv missing
+    with catch_warnings(record=True) as w:
+        import pydub
+
+        if w:
+            raise ModuleNotFoundError
+
     import speech_recognition as sr
 
     IS_AUDIO_TRANSCRIPTION_CAPABLE = True
 except ModuleNotFoundError:
     pass
+finally:
+    resetwarnings()
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
@@ -44,14 +55,6 @@ try:
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
# Optional GitHub issue support
try:
from github import Github
IS_GITHUB_ISSUE_CAPABLE = True
except ModuleNotFoundError:
IS_GITHUB_ISSUE_CAPABLE = False
class _CustomMarkdownify(markdownify.MarkdownConverter): class _CustomMarkdownify(markdownify.MarkdownConverter):
""" """
@@ -169,9 +172,7 @@ class PlainTextConverter(DocumentConverter):
         elif "text/" not in content_type.lower():
             return None
 
-        text_content = ""
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            text_content = fh.read()
+        text_content = str(from_path(local_path).best())
 
         return DocumentConverterResult(
             title=None,
             text_content=text_content,
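The change delegates encoding detection to `charset-normalizer`'s `from_path(...).best()` instead of assuming UTF-8. A rough stdlib-only illustration of why the strict `utf-8` read failed (the bug behind issue #18) is a fallback loop over candidate encodings; this is a simplified stand-in, not what the library actually does internally:

```python
def read_text_lenient(data: bytes, candidates=("utf-8", "latin-1")) -> str:
    # Simplified stand-in for charset_normalizer.from_path(...).best():
    # try candidate encodings until one decodes without error.
    for enc in candidates:
        try:
            return data.decode(enc)
        except UnicodeDecodeError:
            continue
    # Safety net: never raise, just replace undecodable bytes
    return data.decode("utf-8", errors="replace")
```

A Latin-1 file containing `café` (byte `0xE9`) raises `UnicodeDecodeError` under a strict UTF-8 read, which is exactly what the old `open(..., encoding="utf-8")` code did.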
@@ -352,8 +353,11 @@ class YouTubeConverter(DocumentConverter):
             assert isinstance(params["v"][0], str)
             video_id = str(params["v"][0])
             try:
+                youtube_transcript_languages = kwargs.get(
+                    "youtube_transcript_languages", ("en",)
+                )
                 # Must be a single transcript.
-                transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
+                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
                 transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                 # Alternative formatting:
                 # formatter = TextFormatter()
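The language preference is plumbed through `**kwargs` with an English-only default. Since the converter itself needs `youtube-transcript-api`, this sketch only models the kwarg defaulting, not the API call:

```python
def pick_transcript_languages(**kwargs):
    # Mirrors the diff: callers may pass an ordered language preference,
    # otherwise the converter falls back to English.
    return kwargs.get("youtube_transcript_languages", ("en",))
```

In markitdown terms, a caller would pass something like `md.convert(url, youtube_transcript_languages=("de", "en"))` to prefer a German transcript and fall back to English.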
@@ -500,7 +504,9 @@ class DocxConverter(HtmlConverter):
             result = None
             with open(local_path, "rb") as docx_file:
-                result = mammoth.convert_to_html(docx_file)
+                style_map = kwargs.get("style_map", None)
+
+                result = mammoth.convert_to_html(docx_file, style_map=style_map)
                 html_content = result.value
                 result = self._convert(html_content)
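Mammoth's `style_map` parameter controls the HTML emitted for docx styles, and the PR's motivation was keeping comments visible in the output. A hypothetical style-map string in mammoth's documented syntax (illustrative only; consult mammoth's docs for the mappings your document needs):

```
comment-reference => sup
p[style-name='Section Title'] => h1:fresh
```

With this change, such a string can be passed through `convert(..., style_map=...)` and reaches `mammoth.convert_to_html` unchanged.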
@@ -590,6 +596,10 @@ class PptxConverter(HtmlConverter):
                         "\n" + self._convert(html_table).text_content.strip() + "\n"
                     )
 
+                # Charts
+                if shape.has_chart:
+                    md_content += self._convert_chart_to_markdown(shape.chart)
+
                 # Text areas
                 elif shape.has_text_frame:
                     if shape == title:
@@ -624,6 +634,29 @@
                 return True
         return False
 
+    def _convert_chart_to_markdown(self, chart):
+        md = "\n\n### Chart"
+        if chart.has_title:
+            md += f": {chart.chart_title.text_frame.text}"
+        md += "\n\n"
+        data = []
+        category_names = [c.label for c in chart.plots[0].categories]
+        series_names = [s.name for s in chart.series]
+        data.append(["Category"] + series_names)
+
+        for idx, category in enumerate(category_names):
+            row = [category]
+            for series in chart.series:
+                row.append(series.values[idx])
+            data.append(row)
+
+        markdown_table = []
+        for row in data:
+            markdown_table.append("| " + " | ".join(map(str, row)) + " |")
+        header = markdown_table[0]
+        separator = "|" + "|".join(["---"] * len(data[0])) + "|"
+        return md + "\n".join([header, separator] + markdown_table[1:])
+
 
 class MediaConverter(DocumentConverter):
     """
@@ -762,7 +795,7 @@ class Mp3Converter(WavConverter):
 
 class ImageConverter(MediaConverter):
     """
-    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
     """
 
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
@@ -792,17 +825,17 @@ class ImageConverter(MediaConverter):
                     md_content += f"{f}: {metadata[f]}\n"
 
         # Try describing the image with GPTV
-        mlm_client = kwargs.get("mlm_client")
-        mlm_model = kwargs.get("mlm_model")
-        if mlm_client is not None and mlm_model is not None:
+        llm_client = kwargs.get("llm_client")
+        llm_model = kwargs.get("llm_model")
+        if llm_client is not None and llm_model is not None:
             md_content += (
                 "\n# Description:\n"
-                + self._get_mlm_description(
+                + self._get_llm_description(
                     local_path,
                     extension,
-                    mlm_client,
-                    mlm_model,
-                    prompt=kwargs.get("mlm_prompt"),
+                    llm_client,
+                    llm_model,
+                    prompt=kwargs.get("llm_prompt"),
                 ).strip()
                 + "\n"
             )
@@ -812,12 +845,10 @@
             text_content=md_content,
         )
 
-    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
+    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
         if prompt is None or prompt.strip() == "":
             prompt = "Write a detailed caption for this image."
 
-        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
-
         data_uri = ""
         with open(local_path, "rb") as image_file:
             content_type, encoding = mimetypes.guess_type("_dummy" + extension)
@@ -845,126 +876,122 @@
         return response.choices[0].message.content
 
 
-class GitHubIssueConverter(DocumentConverter):
-    """Converts GitHub issues and pull requests to Markdown."""
-
-    def convert(self, github_url, github_token) -> Union[None, DocumentConverterResult]:
-        # Bail if not a valid GitHub issue or pull request URL
-        if github_url:
-            parsed_url = urlparse(github_url)
-            path_parts = parsed_url.path.strip("/").split("/")
-            if len(path_parts) < 4 or path_parts[2] not in ["issues", "pull"]:
-                return None
-
-            if not github_token:
-                raise ValueError(
-                    "GitHub token is not set. Cannot convert GitHub issue or pull request."
-                )
-
-            if path_parts[2] == "issues":
-                return self._convert_github_issue(github_url, github_token)
-            elif path_parts[2] == "pull":
-                return self._convert_github_pr(github_url, github_token)
-
-        return None
-
-    def _convert_github_issue(
-        self, issue_url: str, github_token: str
-    ) -> DocumentConverterResult:
-        """
-        Convert a GitHub issue to a markdown document.
-
-        Args:
-            issue_url (str): The URL of the GitHub issue to convert.
-            github_token (str): A GitHub token with access to the repository.
-
-        Returns:
-            DocumentConverterResult: The result containing the issue title and markdown content.
-
-        Raises:
-            ImportError: If the PyGithub library is not installed.
-            ValueError: If the provided URL is not a valid GitHub issue URL.
-        """
-        if not IS_GITHUB_ISSUE_CAPABLE:
-            raise ImportError(
-                "PyGithub is not installed. Please install it to use this feature."
-            )
-
-        # Parse the issue URL
-        parsed_url = urlparse(issue_url)
-        path_parts = parsed_url.path.strip("/").split("/")
-        if len(path_parts) < 4 or path_parts[2] != "issues":
-            raise ValueError("Invalid GitHub issue URL")
-        owner, repo, _, issue_number = path_parts[:4]
-
-        # Authenticate with GitHub
-        g = Github(github_token)
-        repo = g.get_repo(f"{owner}/{repo}")
-        issue = repo.get_issue(int(issue_number))
-
-        # Convert issue details to markdown
-        markdown_content = f"# {issue.title}\n\n{issue.body}\n\n"
-        markdown_content += f"**State:** {issue.state}\n"
-        markdown_content += f"**Created at:** {issue.created_at}\n"
-        markdown_content += f"**Updated at:** {issue.updated_at}\n"
-        markdown_content += f"**Comments:**\n"
-        for comment in issue.get_comments():
-            markdown_content += (
-                f"- {comment.user.login} ({comment.created_at}): {comment.body}\n"
-            )
-
-        return DocumentConverterResult(
-            title=issue.title,
-            text_content=markdown_content,
-        )
-
-    def _convert_github_pr(
-        self, pr_url: str, github_token: str
-    ) -> DocumentConverterResult:
-        """
-        Convert a GitHub pull request to a markdown document.
-
-        Args:
-            pr_url (str): The URL of the GitHub pull request to convert.
-            github_token (str): A GitHub token with access to the repository.
-
-        Returns:
-            DocumentConverterResult: The result containing the pull request title and markdown content.
-
-        Raises:
-            ImportError: If the PyGithub library is not installed.
-            ValueError: If the provided URL is not a valid GitHub pull request URL.
-        """
-        if not IS_GITHUB_ISSUE_CAPABLE:
-            raise ImportError(
-                "PyGithub is not installed. Please install it to use this feature."
-            )
-
-        # Parse the pull request URL
-        parsed_url = urlparse(pr_url)
-        path_parts = parsed_url.path.strip("/").split("/")
-        if len(path_parts) < 4 or path_parts[2] != "pull":
-            raise ValueError("Invalid GitHub pull request URL")
-        owner, repo, _, pr_number = path_parts[:4]
-
-        # Authenticate with GitHub
-        g = Github(github_token)
-        repo = g.get_repo(f"{owner}/{repo}")
-        pr = repo.get_pull(int(pr_number))
-
-        # Convert pull request details to markdown
-        markdown_content = f"# {pr.title}\n\n{pr.body}\n\n"
-        markdown_content += f"**State:** {pr.state}\n"
-        markdown_content += f"**Created at:** {pr.created_at}\n"
-        markdown_content += f"**Updated at:** {pr.updated_at}\n"
-        markdown_content += f"**Comments:**\n"
-        for comment in pr.get_issue_comments():
-            markdown_content += (
-                f"- {comment.user.login} ({comment.created_at}): {comment.body}\n"
-            )
-
-        return DocumentConverterResult(
-            title=pr.title,
-            text_content=markdown_content,
-        )
+class ZipConverter(DocumentConverter):
+    """Converts ZIP files to markdown by extracting and converting all contained files.
+
+    The converter extracts the ZIP contents to a temporary directory, processes each file
+    using appropriate converters based on file extensions, and then combines the results
+    into a single markdown document. The temporary directory is cleaned up after processing.
+
+    Example output format:
+    ```markdown
+    Content from the zip file `example.zip`:
+
+    ## File: docs/readme.txt
+
+    This is the content of readme.txt
+    Multiple lines are preserved
+
+    ## File: images/example.jpg
+
+    ImageSize: 1920x1080
+    DateTimeOriginal: 2024-02-15 14:30:00
+    Description: A beautiful landscape photo
+
+    ## File: data/report.xlsx
+
+    ## Sheet1
+    | Column1 | Column2 | Column3 |
+    |---------|---------|---------|
+    | data1   | data2   | data3   |
+    | data4   | data5   | data6   |
+    ```
+
+    Key features:
+    - Maintains original file structure in headings
+    - Processes nested files recursively
+    - Uses appropriate converters for each file type
+    - Preserves formatting of converted content
+    - Cleans up temporary files after processing
+    """
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if not a ZIP
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".zip":
+            return None
+
+        # Get parent converters list if available
+        parent_converters = kwargs.get("_parent_converters", [])
+        if not parent_converters:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
+            )
+
+        extracted_zip_folder_name = (
+            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
+        )
+        new_folder = os.path.normpath(
+            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
+        )
+        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
+
+        # Safety check for path traversal
+        if not new_folder.startswith(os.path.dirname(local_path)):
+            return DocumentConverterResult(
+                title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
+            )
+
+        try:
+            # Extract the zip file
+            with zipfile.ZipFile(local_path, "r") as zipObj:
+                zipObj.extractall(path=new_folder)
+
+            # Process each extracted file
+            for root, dirs, files in os.walk(new_folder):
+                for name in files:
+                    file_path = os.path.join(root, name)
+                    relative_path = os.path.relpath(file_path, new_folder)
+
+                    # Get file extension
+                    _, file_extension = os.path.splitext(name)
+
+                    # Update kwargs for the file
+                    file_kwargs = kwargs.copy()
+                    file_kwargs["file_extension"] = file_extension
+                    file_kwargs["_parent_converters"] = parent_converters
+
+                    # Try converting the file using available converters
+                    for converter in parent_converters:
+                        # Skip the zip converter to avoid infinite recursion
+                        if isinstance(converter, ZipConverter):
+                            continue
+
+                        result = converter.convert(file_path, **file_kwargs)
+                        if result is not None:
+                            md_content += f"\n## File: {relative_path}\n\n"
+                            md_content += result.text_content + "\n\n"
+                            break
+
+            # Clean up extracted files if specified
+            if kwargs.get("cleanup_extracted", True):
+                shutil.rmtree(new_folder)
+
+            return DocumentConverterResult(title=None, text_content=md_content.strip())
+
+        except zipfile.BadZipFile:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
+            )
+        except Exception as e:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
+            )
 
 
 class FileConversionException(BaseException):
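The extract-then-walk loop is the heart of the new ZipConverter. A self-contained sketch of that pattern with `zipfile` and `os.walk`, run against a toy archive (no markitdown imports; names here are illustrative):

```python
import os
import shutil
import tempfile
import zipfile


def list_zip_contents(zip_path: str):
    """Extract a zip next to itself and return relative file paths,
    as the converter does before dispatching each file to a sub-converter."""
    out_dir = os.path.join(os.path.dirname(zip_path), "extracted")
    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(path=out_dir)
    found = []
    for root, _dirs, files in os.walk(out_dir):
        for name in files:
            found.append(os.path.relpath(os.path.join(root, name), out_dir))
    shutil.rmtree(out_dir)  # mirrors cleanup_extracted=True
    return sorted(found)


# Build a toy archive to run against
tmp = tempfile.mkdtemp()
zip_path = os.path.join(tmp, "example.zip")
with zipfile.ZipFile(zip_path, "w") as zf:
    zf.writestr("docs/readme.txt", "hello")
    zf.writestr("data.csv", "a,b\n1,2\n")
```

The real converter additionally routes each walked file through the registered converters and appends a `## File: <relative path>` heading per result.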
@@ -982,16 +1009,50 @@ class MarkItDown:
     def __init__(
         self,
         requests_session: Optional[requests.Session] = None,
+        llm_client: Optional[Any] = None,
+        llm_model: Optional[str] = None,
+        style_map: Optional[str] = None,
+        # Deprecated
         mlm_client: Optional[Any] = None,
-        mlm_model: Optional[Any] = None,
+        mlm_model: Optional[str] = None,
     ):
         if requests_session is None:
             self._requests_session = requests.Session()
         else:
             self._requests_session = requests_session
 
-        self._mlm_client = mlm_client
-        self._mlm_model = mlm_model
+        # Handle deprecation notices
+        #############################
+        if mlm_client is not None:
+            if llm_client is None:
+                warn(
+                    "'mlm_client' is deprecated, and was renamed 'llm_client'.",
+                    DeprecationWarning,
+                )
+                llm_client = mlm_client
+                mlm_client = None
+            else:
+                raise ValueError(
+                    "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
+                )
+
+        if mlm_model is not None:
+            if llm_model is None:
+                warn(
+                    "'mlm_model' is deprecated, and was renamed 'llm_model'.",
+                    DeprecationWarning,
+                )
+                llm_model = mlm_model
+                mlm_model = None
+            else:
+                raise ValueError(
+                    "'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
+                )
+        #############################
+
+        self._llm_client = llm_client
+        self._llm_model = llm_model
+        self._style_map = style_map
+
         self._page_converters: List[DocumentConverter] = []
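The shim above is a standard rename-with-deprecation pattern: the old kwarg alone warns and forwards to the new name; passing both is an error. A standalone sketch of the same logic (hypothetical helper, not part of markitdown):

```python
import warnings


def resolve_renamed_kwarg(old_value, new_value, old_name: str, new_name: str):
    """Forward a deprecated kwarg to its new name, mirroring the diff's
    handling of mlm_client/llm_client and mlm_model/llm_model."""
    if old_value is not None:
        if new_value is None:
            warnings.warn(
                f"'{old_name}' is deprecated, and was renamed '{new_name}'.",
                DeprecationWarning,
            )
            return old_value
        raise ValueError(
            f"'{old_name}' is deprecated, and was renamed '{new_name}'. "
            f"Do not use both at the same time."
        )
    return new_value
```

Warning on the old name keeps `MarkItDown(mlm_client=...)` working for existing callers while steering them to `llm_client`, and raising on the mixed case avoids silently ignoring one of two conflicting values.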
@@ -1010,6 +1071,7 @@ class MarkItDown:
         self.register_page_converter(Mp3Converter())
         self.register_page_converter(ImageConverter())
         self.register_page_converter(PdfConverter())
+        self.register_page_converter(ZipConverter())
def convert( def convert(
self, source: Union[str, requests.Response], **kwargs: Any self, source: Union[str, requests.Response], **kwargs: Any
@@ -1019,6 +1081,7 @@ class MarkItDown:
- source: can be a string representing a path or url, or a requests.response object - source: can be a string representing a path or url, or a requests.response object
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
""" """
# Local path or url # Local path or url
if isinstance(source, str): if isinstance(source, str):
if ( if (
@@ -1033,28 +1096,6 @@ class MarkItDown:
elif isinstance(source, requests.Response): elif isinstance(source, requests.Response):
return self.convert_response(source, **kwargs) return self.convert_response(source, **kwargs)
def convert_url(
self, url: str, **kwargs: Any
) -> DocumentConverterResult: # TODO: fix kwargs type
# Handle GitHub issue and pull request URLs directly
parsed_url = urlparse(url)
if parsed_url.hostname == "github.com" and any(
x in parsed_url.path for x in ["/issues/", "/pull/"]
):
github_token = kwargs.get("github_token", os.getenv("GITHUB_TOKEN"))
if not github_token:
raise ValueError(
"GitHub token is required for GitHub issue or pull request conversion."
)
return GitHubIssueConverter().convert(
github_url=url, github_token=github_token
)
# Send a HTTP request to the URL
response = self._requests_session.get(url, stream=True)
response.raise_for_status()
return self.convert_response(response, **kwargs)
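The routing removed above hinges on a hostname/path test that is easy to isolate and unit-test on its own. A minimal sketch of that check (hypothetical helper name):

```python
from urllib.parse import urlparse


def is_github_issue_or_pr(url: str) -> bool:
    # Mirror the check above: exact hostname match, plus a path that
    # contains an issue or pull-request segment.
    parsed = urlparse(url)
    return parsed.hostname == "github.com" and any(
        part in parsed.path for part in ("/issues/", "/pull/")
    )
```

Note that `urlparse(...).hostname` is lowercased and excludes the port, which makes it safer for this comparison than matching on `netloc`.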
    def convert_local(
        self, path: str, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
@@ -1109,6 +1150,14 @@ class MarkItDown:
        return result
def convert_url(
self, url: str, **kwargs: Any
) -> DocumentConverterResult: # TODO: fix kwargs type
# Send a HTTP request to the URL
response = self._requests_session.get(url, stream=True)
response.raise_for_status()
return self.convert_response(response, **kwargs)
    def convert_response(
        self, response: requests.Response, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO fix kwargs type
@@ -1146,7 +1195,7 @@ class MarkItDown:
                self._append_ext(extensions, g)

            # Convert
            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
        # Clean up
        finally:
            try:
@@ -1173,11 +1222,17 @@ class MarkItDown:
        _kwargs.update({"file_extension": ext})

        # Copy any additional global options
        if "llm_client" not in _kwargs and self._llm_client is not None:
            _kwargs["llm_client"] = self._llm_client
        if "llm_model" not in _kwargs and self._llm_model is not None:
            _kwargs["llm_model"] = self._llm_model
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
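The forwarding logic above follows one rule: per-call kwargs win, and instance-level defaults only fill options the caller left unset. As a standalone sketch (hypothetical helper, not part of markitdown):

```python
def merge_global_options(call_kwargs: dict, global_options: dict) -> dict:
    # Caller-supplied kwargs take precedence; globals fill in only the
    # options that are unset, and None-valued globals are skipped.
    merged = dict(call_kwargs)
    for key, value in global_options.items():
        if key not in merged and value is not None:
            merged[key] = value
    return merged
```

Keeping the precedence rule in one function avoids the risk of the per-option `if` blocks drifting apart as more globals (like `style_map` above) are added.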
# If we hit an error log it and keep trying # If we hit an error log it and keep trying
try: try:
@@ -1214,8 +1269,7 @@ class MarkItDown:
        if ext == "":
            return

        # if ext not in extensions:
        extensions.append(ext)
    def _guess_ext_magic(self, path):
        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
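The magic-number idea that `_guess_ext_magic` delegates to puremagic can be sketched by hand for a few common signatures (an illustrative subset only; puremagic recognizes far more types):

```python
from typing import Optional

# A few well-known magic numbers (illustrative subset only).
MAGIC_SIGNATURES = {
    b"%PDF": ".pdf",
    b"PK\x03\x04": ".zip",  # also the container format for docx/xlsx/pptx
    b"\xff\xd8\xff": ".jpg",
}


def guess_ext_from_magic(data: bytes) -> Optional[str]:
    """Return a file extension guessed from the leading bytes, or None."""
    for signature, ext in MAGIC_SIGNATURES.items():
        if data.startswith(signature):
            return ext
    return None
```

The Office formats sharing the ZIP signature is exactly why content-based guessing alone is ambiguous, and why the converter also considers the path, URL, and content-type when picking an extension.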

0	tests/test_files/test.docx (vendored; Executable file → Normal file)
0	tests/test_files/test.jpg (vendored; Executable file → Normal file; 463 KiB, unchanged)
BIN	tests/test_files/test.pptx (vendored; Executable file → Normal file; binary not shown)
0	tests/test_files/test.xlsx (vendored; Executable file → Normal file)
BIN	tests/test_files/test_files.zip (vendored; new file; binary not shown)
BIN	tests/test_files/test_llm.jpg (vendored; new file; 145 KiB; binary not shown)

4	tests/test_files/test_mskanji.csv (vendored; new file; CP932-encoded, shown here decoded)
@@ -0,0 +1,4 @@
名前,年齢,住所
佐藤太郎,30,東京
三木英子,25,大阪
髙橋淳,35,名古屋

BIN	tests/test_files/test_with_comment.docx (vendored; new file; binary not shown)

tests/test_markitdown.py

@@ -6,11 +6,23 @@ import shutil
import pytest
import requests

from warnings import catch_warnings, resetwarnings

from markitdown import MarkItDown

skip_remote = (
    True if os.environ.get("GITHUB_ACTIONS") else False
)  # Don't run these tests in CI

# Don't run the llm tests without a key and the client library
skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
try:
    import openai
except ModuleNotFoundError:
    skip_llm = True
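The two module-level conditions above (API key present, `openai` importable) combine into a single skip decision; written as a pure function it becomes trivially testable without touching the real environment (hypothetical helper name):

```python
def should_skip_llm(env: dict, openai_importable: bool) -> bool:
    # Skip LLM tests unless both an API key and the client library
    # are available, mirroring the module-level flags above.
    return not env.get("OPENAI_API_KEY") or not openai_importable
```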
# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -51,12 +63,25 @@ DOCX_TEST_STRINGS = [
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
]
DOCX_COMMENT_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"This is a test comment. 12df-321a",
"Yet another comment in the doc. 55yiyi-asd09",
]
PPTX_TEST_STRINGS = [
    "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
    "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
    "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
    "1b92870d-e3b5-4e65-8153-919f4ff45592",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
    "2003",  # chart value
]
BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
@@ -87,9 +112,16 @@ SERP_TEST_EXCLUDES = [
    "data:image/svg+xml,%3Csvg%20width%3D",
]

CSV_CP932_TEST_STRINGS = [
    "名前,年齢,住所",
    "佐藤太郎,30,東京",
    "三木英子,25,大阪",
    "髙橋淳,35,名古屋",
]

LLM_TEST_STRINGS = [
    "5bda1dd6",
]
@pytest.mark.skipif(
@@ -134,6 +166,24 @@ def test_markitdown_local() -> None:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content
# Test DOCX processing, with comments
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
style_map="comment-reference => ",
)
for test_string in DOCX_COMMENT_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
result = markitdown_with_style_map.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
)
for test_string in DOCX_COMMENT_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
    # Test PPTX processing
    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
    for test_string in PPTX_TEST_STRINGS:
@@ -148,6 +198,12 @@ def test_markitdown_local() -> None:
        text_content = result.text_content.replace("\\", "")
        assert test_string in text_content
# Test ZIP file processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
for test_string in DOCX_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
    # Test Wikipedia processing
    result = markitdown.convert(
        os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
@@ -168,6 +224,12 @@ def test_markitdown_local() -> None:
    for test_string in SERP_TEST_STRINGS:
        assert test_string in text_content
## Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
text_content = result.text_content.replace("\\", "")
for test_string in CSV_CP932_TEST_STRINGS:
assert test_string in text_content
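The mojibake shown in the test_mskanji.csv diff is what CP932 (Microsoft's Shift_JIS variant) bytes look like when rendered with the wrong codec; decoding with the right one recovers exactly the rows the test expects. A minimal sketch (hypothetical helper, splitting on commas rather than using the `csv` module):

```python
def decode_cp932_csv(raw: bytes):
    # Decode CP932 (Shift_JIS) bytes and split into rows of fields.
    text = raw.decode("cp932")
    return [line.split(",") for line in text.splitlines() if line]
```

This is also why the converter cannot assume UTF-8: the same bytes decode "successfully" under Latin-1 into garbage, so encoding detection (or an explicit codec) is needed before parsing.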
@pytest.mark.skipif(
    skip_exiftool,
@@ -183,28 +245,57 @@ def test_markitdown_exiftool() -> None:
    assert target in result.text_content
def test_markitdown_deprecation() -> None:
    try:
        with catch_warnings(record=True) as w:
            test_client = object()
            markitdown = MarkItDown(mlm_client=test_client)
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert markitdown._llm_client == test_client
    finally:
        resetwarnings()

    try:
        with catch_warnings(record=True) as w:
            markitdown = MarkItDown(mlm_model="gpt-4o")
            assert len(w) == 1
            assert w[0].category is DeprecationWarning
            assert markitdown._llm_model == "gpt-4o"
    finally:
        resetwarnings()

    try:
        test_client = object()
        markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
        assert False
    except ValueError:
        pass

    try:
        markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
        assert False
    except ValueError:
        pass


@pytest.mark.skipif(
    skip_llm,
    reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
    client = openai.OpenAI()
    markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")

    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))

    for test_string in LLM_TEST_STRINGS:
        assert test_string in result.text_content

    # This is not super precise. It would also accept "red square", "blue circle",
    # "the square is not blue", etc. But it's sufficient for this test.
    for test_string in ["red", "circle", "blue", "square"]:
        assert test_string in result.text_content.lower()
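Vision-capable chat clients like the one used in `test_markitdown_llm` typically accept inline images as base64 data URIs (in an OpenAI-style `image_url` content part; the exact message shape is an assumption here, not shown in the diff). The encoding step itself is simple:

```python
import base64


def image_to_data_uri(raw: bytes, mime: str = "image/jpeg") -> str:
    # Encode raw image bytes as a data URI suitable for an inline
    # image content part (assumed OpenAI-style "image_url" field).
    return f"data:{mime};base64," + base64.b64encode(raw).decode("ascii")
```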
if __name__ == "__main__":
@@ -212,5 +303,5 @@ if __name__ == "__main__":
    test_markitdown_remote()
    test_markitdown_local()
    test_markitdown_exiftool()
    test_markitdown_deprecation()
    test_markitdown_llm()