Merge branch 'microsoft:main' into main

Om Gupta
2024-12-17 10:33:40 +05:30
committed by GitHub
5 changed files with 74 additions and 39 deletions

1
.dockerignore Normal file

@@ -0,0 +1 @@
*

16
Dockerfile Normal file

@@ -0,0 +1,16 @@
FROM python:3.13-alpine
USER root
# Runtime dependency
RUN apk add --no-cache ffmpeg
RUN pip install markitdown
# Default USERID and GROUPID
ARG USERID=10000
ARG GROUPID=10000
USER $USERID:$GROUPID
ENTRYPOINT [ "markitdown" ]

README.md

@@ -59,19 +59,26 @@ You can pipe content to standard input by omitting the argument:
cat path-to-file.pdf | markitdown
```
You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the `llm_client` and `llm_model` parameters to the `MarkItDown` object, according to your specific client.
You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client.
```python
from markitdown import MarkItDown
from openai import OpenAI
client = OpenAI()
md = MarkItDown(mlm_client=client, mlm_model="gpt-4o")
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("example.jpg")
print(result.text_content)
```
You can also use the project as a Docker image:
```sh
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```
## Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
@@ -88,15 +95,18 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
### Running Tests
To run the tests for this project, use the following command:
To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install).
```sh
pip install hatch
hatch shell
hatch test
```
### Running Pre-commit Checks
Please run the pre-commit checks before submitting a PR.
```sh
pre-commit run --all-files
```

src/markitdown/__main__.py

@@ -2,21 +2,15 @@
#
# SPDX-License-Identifier: MIT
import sys
import argparse
from ._markitdown import MarkItDown
def main():
if len(sys.argv) == 1:
markitdown = MarkItDown()
result = markitdown.convert_stream(sys.stdin.buffer)
print(result.text_content)
elif len(sys.argv) == 2:
markitdown = MarkItDown()
result = markitdown.convert(sys.argv[1])
print(result.text_content)
else:
sys.stderr.write(
"""
parser = argparse.ArgumentParser(
description="Convert various file formats to markdown.",
formatter_class=argparse.RawDescriptionHelpFormatter,
usage="""
SYNTAX:
markitdown <OPTIONAL: FILENAME>
@@ -33,10 +27,21 @@ EXAMPLE:
OR
markitdown < example.pdf
""".strip()
+ "\n"
""".strip(),
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
if args.filename is None:
markitdown = MarkItDown()
result = markitdown.convert_stream(sys.stdin.buffer)
print(result.text_content)
else:
markitdown = MarkItDown()
result = markitdown.convert(args.filename)
print(result.text_content)
if __name__ == "__main__":
main()

src/markitdown/_markitdown.py

@@ -351,8 +351,11 @@ class YouTubeConverter(DocumentConverter):
assert isinstance(params["v"][0], str)
video_id = str(params["v"][0])
try:
youtube_transcript_languages = kwargs.get(
"youtube_transcript_languages", ("en",)
)
# Must be a single transcript.
transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
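
The new `youtube_transcript_languages` option above defaults to `("en",)`. A minimal sketch of passing it through `convert()`, assuming keyword arguments are forwarded to the converter as elsewhere in this commit; the URL is a placeholder:

```python
from markitdown import MarkItDown

md = MarkItDown()
# Prefer a German transcript, fall back to English; the tuple reaches the
# YouTube converter via kwargs under the key "youtube_transcript_languages".
result = md.convert(
    "https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder URL
    youtube_transcript_languages=("de", "en"),
)
print(result.text_content)
```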
@@ -851,7 +854,7 @@ class Mp3Converter(WavConverter):
class ImageConverter(MediaConverter):
"""
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
@@ -881,17 +884,17 @@ class ImageConverter(MediaConverter):
md_content += f"{f}: {metadata[f]}\n"
# Try describing the image with GPTV
mlm_client = kwargs.get("mlm_client")
mlm_model = kwargs.get("mlm_model")
if mlm_client is not None and mlm_model is not None:
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
md_content += (
"\n# Description:\n"
+ self._get_mlm_description(
+ self._get_llm_description(
local_path,
extension,
mlm_client,
mlm_model,
prompt=kwargs.get("mlm_prompt"),
llm_client,
llm_model,
prompt=kwargs.get("llm_prompt"),
).strip()
+ "\n"
)
@@ -901,11 +904,11 @@ class ImageConverter(MediaConverter):
text_content=md_content,
)
def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."
sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
sys.stderr.write(f"llm Prompt:\n{prompt}\n")
data_uri = ""
with open(local_path, "rb") as image_file:
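
The renamed image path also reads an `llm_prompt` keyword per call (`prompt=kwargs.get("llm_prompt")`), falling back to "Write a detailed caption for this image." when it is missing or blank. A sketch of supplying a custom prompt, following the README example above; the prompt text is illustrative only:

```python
from markitdown import MarkItDown
from openai import OpenAI

client = OpenAI()
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
# The per-call prompt reaches _get_llm_description() via kwargs["llm_prompt"].
result = md.convert("example.jpg", llm_prompt="Describe the chart axes and units.")
print(result.text_content)
```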
@@ -1067,8 +1070,8 @@ class MarkItDown:
def __init__(
self,
requests_session: Optional[requests.Session] = None,
mlm_client: Optional[Any] = None,
mlm_model: Optional[Any] = None,
llm_client: Optional[Any] = None,
llm_model: Optional[Any] = None,
style_map: Optional[str] = None,
):
if requests_session is None:
@@ -1076,8 +1079,8 @@ class MarkItDown:
else:
self._requests_session = requests_session
self._mlm_client = mlm_client
self._mlm_model = mlm_model
self._llm_client = llm_client
self._llm_model = llm_model
self._style_map = style_map
self._page_converters: List[DocumentConverter] = []
@@ -1222,7 +1225,7 @@ class MarkItDown:
self._append_ext(extensions, g)
# Convert
result = self._convert(temp_path, extensions, url=response.url)
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up
finally:
try:
@@ -1249,11 +1252,12 @@ class MarkItDown:
_kwargs.update({"file_extension": ext})
# Copy any additional global options
if "mlm_client" not in _kwargs and self._mlm_client is not None:
_kwargs["mlm_client"] = self._mlm_client
if "llm_client" not in _kwargs and self._llm_client is not None:
_kwargs["llm_client"] = self._llm_client
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model
if "mlm_model" not in _kwargs and self._mlm_model is not None:
_kwargs["mlm_model"] = self._mlm_model
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
@@ -1295,7 +1299,6 @@ class MarkItDown:
if ext == "":
return
# if ext not in extensions:
if True:
extensions.append(ext)
def _guess_ext_magic(self, path):