From c168703d5e32d1e920945c1e3dda3a0c0f76eb27 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Mon, 16 Dec 2024 11:41:39 +0800 Subject: [PATCH 01/14] Pass the kwargs to _convert method when converting an url file --- src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..0866dac 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1003,7 +1003,7 @@ class MarkItDown: self._append_ext(extensions, g) # Convert - result = self._convert(temp_path, extensions, url=response.url) + result = self._convert(temp_path, extensions, url=response.url, **kwargs) # Clean up finally: try: From d66ef5fccaa49b9a1b95e5e043ebc44f8a88f5a4 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Mon, 16 Dec 2024 12:08:51 +0800 Subject: [PATCH 02/14] Update README to introduce the customized mlm_prompt --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index df7189d..3eac77b 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ result = markitdown.convert("test.xlsx") print(result.text_content) ``` -You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client. +You can also configure markitdown to use Large Language Models to describe images. To do so you must provide `mlm_client` and `mlm_model` parameters to the MarkItDown object, according to your specific client. ```python from markitdown import MarkItDown @@ -51,6 +51,13 @@ result = md.convert("example.jpg") print(result.text_content) ``` +The prompt used to describe images can be customized by providing the `mlm_prompt` parameter. + +```python +# ... 
+result = md.convert("example.jpg", mlm_prompt="Customized prompt") +``` + ## Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a From 695100d5d83e62be9810216756d9e7012b22a12e Mon Sep 17 00:00:00 2001 From: narumi Date: Mon, 16 Dec 2024 13:16:00 +0800 Subject: [PATCH 03/14] Support specifying YouTube transcript language --- src/markitdown/_markitdown.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..645d230 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -344,8 +344,11 @@ class YouTubeConverter(DocumentConverter): assert isinstance(params["v"][0], str) video_id = str(params["v"][0]) try: + youtube_transcript_languages = kwargs.get( + "youtube_transcript_languages", ("en",) + ) # Must be a single transcript. - transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore + transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore # Alternative formatting: # formatter = TextFormatter() @@ -1003,7 +1006,7 @@ class MarkItDown: self._append_ext(extensions, g) # Convert - result = self._convert(temp_path, extensions, url=response.url) + result = self._convert(temp_path, extensions, url=response.url, **kwargs) # Clean up finally: try: From 013b022427d274d4b61ed0d6bef9cefe59ccb375 Mon Sep 17 00:00:00 2001 From: Michele Adduci Date: Mon, 16 Dec 2024 13:08:15 +0100 Subject: [PATCH 04/14] Added Docker Image for using markitdown in a sandboxed environment --- .dockerignore | 1 + Dockerfile | 12 ++++++++++++ README.md | 7 +++++++ 3 files changed, 20 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ 
b/.dockerignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3dd6100 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.13-alpine + +USER root + +# Runtime dependency +RUN apk add --no-cache ffmpeg + +RUN pip install markitdown + +USER 10000:10000 + +ENTRYPOINT [ "markitdown" ] diff --git a/README.md b/README.md index df7189d..50b5a80 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,13 @@ result = md.convert("example.jpg") print(result.text_content) ``` +You can also use the project as a Docker image: + +```sh +docker build -t markitdown:latest . +docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md +``` + ## Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a From 5fc03b64152ca9021b537e57a845701d5506a10e Mon Sep 17 00:00:00 2001 From: Michele Adduci Date: Mon, 16 Dec 2024 13:11:13 +0100 Subject: [PATCH 05/14] Added UID as argument --- Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3dd6100..492ad8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,10 @@ RUN apk add --no-cache ffmpeg RUN pip install markitdown -USER 10000:10000 +# Default USERID and GROUPID +ARG USERID=10000 +ARG GROUPID=10000 + +USER $USERID:$GROUPID ENTRYPOINT [ "markitdown" ] From 010f841008135f8b89e4bc6cceee793fceaa87fe Mon Sep 17 00:00:00 2001 From: CyberNobie Date: Mon, 16 Dec 2024 18:47:24 +0530 Subject: [PATCH 06/14] Ensure hatch is installed before running tests --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index df7189d..ca8bd03 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio To run the tests for this project, use the following command: ```sh +pip install hatch hatch shell hatch test ``` From ad01da308dfe3b4a0e7d0f7c23224d16c9c8cf95 Mon Sep 
17 00:00:00 2001 From: Divit <56664482+DIMAX99@users.noreply.github.com> Date: Mon, 16 Dec 2024 21:48:33 +0530 Subject: [PATCH 07/14] fix issue #65 --- src/markitdown/_markitdown.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..80d9c30 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1071,8 +1071,7 @@ class MarkItDown: if ext == "": return # if ext not in extensions: - if True: - extensions.append(ext) + extensions.append(ext) def _guess_ext_magic(self, path): """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" From 3d9f3f3e5bd1d519f806de4e84dd305a1cf581d0 Mon Sep 17 00:00:00 2001 From: CharlesCNorton <135471798+CharlesCNorton@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:23:03 -0500 Subject: [PATCH 08/14] Fix LLM terms Updated all instances of mlm_client and mlm_model to llm_client and llm_model in the readme. The previous terms (mlm_client and mlm_model) are incorrect in the context of configuring Large Language Models (LLMs), as "MLM" typically refers to Masked Language Models, which is unrelated to the intended functionality. This change aligns the documentation with standard naming conventions for LLM configuration parameters and improves clarity for users integrating with LLMs like OpenAI's GPT models. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index df7189d..c5767ba 100644 --- a/README.md +++ b/README.md @@ -39,14 +39,14 @@ result = markitdown.convert("test.xlsx") print(result.text_content) ``` -You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client. +You can also configure markitdown to use Large Language Models to describe images. 
To do so you must provide llm_client and llm_model parameters to MarkItDown object, according to your specific client. ```python from markitdown import MarkItDown from openai import OpenAI client = OpenAI() -md = MarkItDown(mlm_client=client, mlm_model="gpt-4o") +md = MarkItDown(llm_client=client, llm_model="gpt-4o") result = md.convert("example.jpg") print(result.text_content) ``` From ed651aeb16bb858c6aead0a4b091a548c67d9e17 Mon Sep 17 00:00:00 2001 From: CharlesCNorton <135471798+CharlesCNorton@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:23:52 -0500 Subject: [PATCH 09/14] Fix LLM terminology in code Replaced all occurrences of mlm_client and mlm_model with llm_client and llm_model for consistent terminology when referencing Large Language Models (LLMs). --- src/markitdown/_markitdown.py | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..f46dd3f 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -754,7 +754,7 @@ class Mp3Converter(WavConverter): class ImageConverter(MediaConverter): """ - Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured). + Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). 
""" def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: @@ -784,17 +784,17 @@ class ImageConverter(MediaConverter): md_content += f"{f}: {metadata[f]}\n" # Try describing the image with GPTV - mlm_client = kwargs.get("mlm_client") - mlm_model = kwargs.get("mlm_model") - if mlm_client is not None and mlm_model is not None: + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + if llm_client is not None and llm_model is not None: md_content += ( "\n# Description:\n" - + self._get_mlm_description( + + self._get_llm_description( local_path, extension, - mlm_client, - mlm_model, - prompt=kwargs.get("mlm_prompt"), + llm_client, + llm_model, + prompt=kwargs.get("llm_prompt"), ).strip() + "\n" ) @@ -804,11 +804,11 @@ class ImageConverter(MediaConverter): text_content=md_content, ) - def _get_mlm_description(self, local_path, extension, client, model, prompt=None): + def _get_llm_description(self, local_path, extension, client, model, prompt=None): if prompt is None or prompt.strip() == "": prompt = "Write a detailed caption for this image." 
- sys.stderr.write(f"MLM Prompt:\n{prompt}\n") + sys.stderr.write(f"llm Prompt:\n{prompt}\n") data_uri = "" with open(local_path, "rb") as image_file: @@ -852,16 +852,16 @@ class MarkItDown: def __init__( self, requests_session: Optional[requests.Session] = None, - mlm_client: Optional[Any] = None, - mlm_model: Optional[Any] = None, + llm_client: Optional[Any] = None, + llm_model: Optional[Any] = None, ): if requests_session is None: self._requests_session = requests.Session() else: self._requests_session = requests_session - self._mlm_client = mlm_client - self._mlm_model = mlm_model + self._llm_client = llm_client + self._llm_model = llm_model self._page_converters: List[DocumentConverter] = [] @@ -1030,11 +1030,11 @@ class MarkItDown: _kwargs.update({"file_extension": ext}) # Copy any additional global options - if "mlm_client" not in _kwargs and self._mlm_client is not None: - _kwargs["mlm_client"] = self._mlm_client + if "llm_client" not in _kwargs and self._llm_client is not None: + _kwargs["llm_client"] = self._llm_client - if "mlm_model" not in _kwargs and self._mlm_model is not None: - _kwargs["mlm_model"] = self._mlm_model + if "llm_model" not in _kwargs and self._llm_model is not None: + _kwargs["llm_model"] = self._llm_model # If we hit an error log it and keep trying try: From 33638f1fe6d380a888f7d9bf5119dba965fba3c3 Mon Sep 17 00:00:00 2001 From: kevinbabou Date: Sun, 15 Dec 2024 17:38:28 -0800 Subject: [PATCH 10/14] feature: add argument parsing and setup.py file for cli tool capability --- setup.py | 31 +++++++++++++++++++++++++++++++ src/markitdown/__main__.py | 33 +++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 14 deletions(-) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..7d3e311 --- /dev/null +++ b/setup.py @@ -0,0 +1,31 @@ +from setuptools import setup, find_packages + +setup( + name='markitdown', + version='0.1.0', + package_dir={'': 'src'}, + 
packages=find_packages(where='src'), + install_requires=[ + 'mammoth', + 'markdownify', + 'pandas', + 'pdfminer.six', + 'python-pptx', + 'puremagic', + 'requests', + 'beautifulsoup4', + 'pydub', + 'SpeechRecognition', + 'youtube_transcript_api', + ], + entry_points={ + 'console_scripts': [ + 'markitdown=markitdown.__main__:main', + ], + }, + author='Adam Fourney', + author_email='adamfo@microsoft.com', + description='Convert various file formats to markdown', + license='MIT', + python_requires='>=3.6', +) \ No newline at end of file diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 6c8a672..a2fafb2 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -2,21 +2,15 @@ # # SPDX-License-Identifier: MIT import sys +import argparse from ._markitdown import MarkItDown def main(): - if len(sys.argv) == 1: - markitdown = MarkItDown() - result = markitdown.convert_stream(sys.stdin.buffer) - print(result.text_content) - elif len(sys.argv) == 2: - markitdown = MarkItDown() - result = markitdown.convert(sys.argv[1]) - print(result.text_content) - else: - sys.stderr.write( - """ + parser = argparse.ArgumentParser( + description='Convert various file formats to markdown.', + formatter_class=argparse.RawDescriptionHelpFormatter, + usage=""" SYNTAX: markitdown @@ -34,9 +28,20 @@ EXAMPLE: markitdown < example.pdf """.strip() - + "\n" - ) + ) + + parser.add_argument('filename', nargs='?') + args = parser.parse_args() + + if args.filename is None: + markitdown = MarkItDown() + result = markitdown.convert_stream(sys.stdin.buffer) + print(result.text_content) + else: + markitdown = MarkItDown() + result = markitdown.convert(args.filename) + print(result.text_content) if __name__ == "__main__": - main() + main() \ No newline at end of file From 87846cf5f8797dcc54d2c34d1ac56bb666d99d24 Mon Sep 17 00:00:00 2001 From: kevinbabou Date: Mon, 16 Dec 2024 16:27:59 -0800 Subject: [PATCH 11/14] rm setup.py --- setup.py | 31 
------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 setup.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 7d3e311..0000000 --- a/setup.py +++ /dev/null @@ -1,31 +0,0 @@ -from setuptools import setup, find_packages - -setup( - name='markitdown', - version='0.1.0', - package_dir={'': 'src'}, - packages=find_packages(where='src'), - install_requires=[ - 'mammoth', - 'markdownify', - 'pandas', - 'pdfminer.six', - 'python-pptx', - 'puremagic', - 'requests', - 'beautifulsoup4', - 'pydub', - 'SpeechRecognition', - 'youtube_transcript_api', - ], - entry_points={ - 'console_scripts': [ - 'markitdown=markitdown.__main__:main', - ], - }, - author='Adam Fourney', - author_email='adamfo@microsoft.com', - description='Convert various file formats to markdown', - license='MIT', - python_requires='>=3.6', -) \ No newline at end of file From c3fa2934b99f80a95e0a712a7d431863bd7cb04e Mon Sep 17 00:00:00 2001 From: gagb Date: Mon, 16 Dec 2024 16:56:52 -0800 Subject: [PATCH 12/14] Run pre-commit --- src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 9cbd1db..f6eb6b2 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1181,7 +1181,7 @@ class MarkItDown: if "llm_model" not in _kwargs and self._llm_model is not None: _kwargs["llm_model"] = self._llm_model - + # Add the list of converters for nested processing _kwargs["_parent_converters"] = self._page_converters From 24b52b2b8f824e0721c548892beedf5d0a285518 Mon Sep 17 00:00:00 2001 From: gagb Date: Mon, 16 Dec 2024 17:35:47 -0800 Subject: [PATCH 13/14] Improve readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0232935..7079dbf 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio ### Running Tests 
-To run the tests for this project, use the following command: +To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install). ```sh pip install hatch @@ -105,6 +105,8 @@ hatch test ### Running Pre-commit Checks +Please run the pre-commit checks before submitting a PR. + ```sh pre-commit run --all-files ``` From ad29122592ec44c68c571ef991a9a6082e9fef36 Mon Sep 17 00:00:00 2001 From: gagb Date: Mon, 16 Dec 2024 18:09:48 -0800 Subject: [PATCH 14/14] run precommit --- src/markitdown/__main__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index a2fafb2..2d53173 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -8,7 +8,7 @@ from ._markitdown import MarkItDown def main(): parser = argparse.ArgumentParser( - description='Convert various file formats to markdown.', + description="Convert various file formats to markdown.", formatter_class=argparse.RawDescriptionHelpFormatter, usage=""" SYNTAX: @@ -27,10 +27,10 @@ EXAMPLE: OR markitdown < example.pdf -""".strip() +""".strip(), ) - parser.add_argument('filename', nargs='?') + parser.add_argument("filename", nargs="?") args = parser.parse_args() if args.filename is None: @@ -44,4 +44,4 @@ EXAMPLE: if __name__ == "__main__": - main() \ No newline at end of file + main()