From 013b022427d274d4b61ed0d6bef9cefe59ccb375 Mon Sep 17 00:00:00 2001 From: Michele Adduci Date: Mon, 16 Dec 2024 13:08:15 +0100 Subject: [PATCH 1/8] Added Docker Image for using markitdown in a sandboxed environment --- .dockerignore | 1 + Dockerfile | 12 ++++++++++++ README.md | 7 +++++++ 3 files changed, 20 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3dd6100 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.13-alpine + +USER root + +# Runtime dependency +RUN apk add --no-cache ffmpeg + +RUN pip install markitdown + +USER 10000:10000 + +ENTRYPOINT [ "markitdown" ] diff --git a/README.md b/README.md index df7189d..50b5a80 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,13 @@ result = md.convert("example.jpg") print(result.text_content) ``` +You can also use the project as a Docker image: + +```sh +docker build -t markitdown:latest . +docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md +``` + ## Contributing This project welcomes contributions and suggestions. 
Most contributions require you to agree to a From 5fc03b64152ca9021b537e57a845701d5506a10e Mon Sep 17 00:00:00 2001 From: Michele Adduci Date: Mon, 16 Dec 2024 13:11:13 +0100 Subject: [PATCH 2/8] Added UID as argument --- Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3dd6100..492ad8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,10 @@ RUN apk add --no-cache ffmpeg RUN pip install markitdown -USER 10000:10000 +# Default USERID and GROUPID +ARG USERID=10000 +ARG GROUPID=10000 + +USER $USERID:$GROUPID ENTRYPOINT [ "markitdown" ] From 010f841008135f8b89e4bc6cceee793fceaa87fe Mon Sep 17 00:00:00 2001 From: CyberNobie Date: Mon, 16 Dec 2024 18:47:24 +0530 Subject: [PATCH 3/8] Ensure hatch is installed before running tests --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index df7189d..ca8bd03 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio To run the tests for this project, use the following command: ```sh +pip install hatch hatch shell hatch test ``` From ad01da308dfe3b4a0e7d0f7c23224d16c9c8cf95 Mon Sep 17 00:00:00 2001 From: Divit <56664482+DIMAX99@users.noreply.github.com> Date: Mon, 16 Dec 2024 21:48:33 +0530 Subject: [PATCH 4/8] fix issue #65 --- src/markitdown/_markitdown.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..80d9c30 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1071,8 +1071,7 @@ class MarkItDown: if ext == "": return # if ext not in extensions: - if True: - extensions.append(ext) + extensions.append(ext) def _guess_ext_magic(self, path): """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" From 3d9f3f3e5bd1d519f806de4e84dd305a1cf581d0 Mon Sep 17 00:00:00 2001 
From: CharlesCNorton <135471798+CharlesCNorton@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:23:03 -0500 Subject: [PATCH 5/8] Fix LLM terms Updated all instances of mlm_client and mlm_model to llm_client and llm_model in the readme. The previous terms (mlm_client and mlm_model) are incorrect in the context of configuring Large Language Models (LLMs), as "MLM" typically refers to Masked Language Models, which is unrelated to the intended functionality. This change aligns the documentation with standard naming conventions for LLM configuration parameters and improves clarity for users integrating with LLMs like OpenAI's GPT models. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index df7189d..c5767ba 100644 --- a/README.md +++ b/README.md @@ -39,14 +39,14 @@ result = markitdown.convert("test.xlsx") print(result.text_content) ``` -You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client. +You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the `llm_client` and `llm_model` parameters to the MarkItDown object, according to your specific client. ```python from markitdown import MarkItDown from openai import OpenAI client = OpenAI() -md = MarkItDown(mlm_client=client, mlm_model="gpt-4o") +md = MarkItDown(llm_client=client, llm_model="gpt-4o") result = md.convert("example.jpg") print(result.text_content) ``` From ed651aeb16bb858c6aead0a4b091a548c67d9e17 Mon Sep 17 00:00:00 2001 From: CharlesCNorton <135471798+CharlesCNorton@users.noreply.github.com> Date: Mon, 16 Dec 2024 16:23:52 -0500 Subject: [PATCH 6/8] Fix LLM terminology in code Replaced all occurrences of mlm_client and mlm_model with llm_client and llm_model for consistent terminology when referencing Large Language Models (LLMs). 
--- src/markitdown/_markitdown.py | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..f46dd3f 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -754,7 +754,7 @@ class Mp3Converter(WavConverter): class ImageConverter(MediaConverter): """ - Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured). + Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured). """ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: @@ -784,17 +784,17 @@ class ImageConverter(MediaConverter): md_content += f"{f}: {metadata[f]}\n" # Try describing the image with GPTV - mlm_client = kwargs.get("mlm_client") - mlm_model = kwargs.get("mlm_model") - if mlm_client is not None and mlm_model is not None: + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + if llm_client is not None and llm_model is not None: md_content += ( "\n# Description:\n" - + self._get_mlm_description( + + self._get_llm_description( local_path, extension, - mlm_client, - mlm_model, - prompt=kwargs.get("mlm_prompt"), + llm_client, + llm_model, + prompt=kwargs.get("llm_prompt"), ).strip() + "\n" ) @@ -804,11 +804,11 @@ class ImageConverter(MediaConverter): text_content=md_content, ) - def _get_mlm_description(self, local_path, extension, client, model, prompt=None): + def _get_llm_description(self, local_path, extension, client, model, prompt=None): if prompt is None or prompt.strip() == "": prompt = "Write a detailed caption for this image." 
- sys.stderr.write(f"MLM Prompt:\n{prompt}\n") + sys.stderr.write(f"llm Prompt:\n{prompt}\n") data_uri = "" with open(local_path, "rb") as image_file: @@ -852,16 +852,16 @@ class MarkItDown: def __init__( self, requests_session: Optional[requests.Session] = None, - mlm_client: Optional[Any] = None, - mlm_model: Optional[Any] = None, + llm_client: Optional[Any] = None, + llm_model: Optional[Any] = None, ): if requests_session is None: self._requests_session = requests.Session() else: self._requests_session = requests_session - self._mlm_client = mlm_client - self._mlm_model = mlm_model + self._llm_client = llm_client + self._llm_model = llm_model self._page_converters: List[DocumentConverter] = [] @@ -1030,11 +1030,11 @@ class MarkItDown: _kwargs.update({"file_extension": ext}) # Copy any additional global options - if "mlm_client" not in _kwargs and self._mlm_client is not None: - _kwargs["mlm_client"] = self._mlm_client + if "llm_client" not in _kwargs and self._llm_client is not None: + _kwargs["llm_client"] = self._llm_client - if "mlm_model" not in _kwargs and self._mlm_model is not None: - _kwargs["mlm_model"] = self._mlm_model + if "llm_model" not in _kwargs and self._llm_model is not None: + _kwargs["llm_model"] = self._llm_model # If we hit an error log it and keep trying try: From c3fa2934b99f80a95e0a712a7d431863bd7cb04e Mon Sep 17 00:00:00 2001 From: gagb Date: Mon, 16 Dec 2024 16:56:52 -0800 Subject: [PATCH 7/8] Run pre-commit --- src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 9cbd1db..f6eb6b2 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1181,7 +1181,7 @@ class MarkItDown: if "llm_model" not in _kwargs and self._llm_model is not None: _kwargs["llm_model"] = self._llm_model - + # Add the list of converters for nested processing _kwargs["_parent_converters"] = self._page_converters From 
24b52b2b8f824e0721c548892beedf5d0a285518 Mon Sep 17 00:00:00 2001 From: gagb Date: Mon, 16 Dec 2024 17:35:47 -0800 Subject: [PATCH 8/8] Improve readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0232935..7079dbf 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio ### Running Tests -To run the tests for this project, use the following command: +To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install). ```sh pip install hatch @@ -105,6 +105,8 @@ hatch test ### Running Pre-commit Checks +Please run the pre-commit checks before submitting a PR. + ```sh pre-commit run --all-files ```