From b7f5662ffdebdbcf0d7ccacffc9c1bdadc9f58d0 Mon Sep 17 00:00:00 2001 From: SH4DOW4RE Date: Sun, 15 Dec 2024 17:29:14 +0100 Subject: [PATCH 1/8] PR: Catching pydub's warning of ffmpeg or avconv missing --- src/markitdown/_markitdown.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..0d77091 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -14,6 +14,7 @@ import tempfile import traceback from typing import Any, Dict, List, Optional, Union from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse +from warnings import catch_warnings import mammoth import markdownify @@ -29,7 +30,12 @@ from bs4 import BeautifulSoup # Optional Transcription support try: - import pydub + # Using warnings' catch_warnings to catch + # pydub's warning of ffmpeg or avconv missing + with catch_warnings(record=True) as w: + import pydub + if w: + raise ModuleNotFoundError import speech_recognition as sr IS_AUDIO_TRANSCRIPTION_CAPABLE = True From 1559d9d163de44ff4a41f261a7303124057a908e Mon Sep 17 00:00:00 2001 From: SH4DOW4RE Date: Sun, 15 Dec 2024 22:15:20 +0100 Subject: [PATCH 2/8] pre-commit ran --- src/markitdown/_markitdown.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 0d77091..5789679 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -34,6 +34,7 @@ try: # pydub's warning of ffmpeg or avconv missing with catch_warnings(record=True) as w: import pydub + if w: raise ModuleNotFoundError import speech_recognition as sr From 3548c96dd36836b148d4d376e4c55fdeba5a6343 Mon Sep 17 00:00:00 2001 From: Yeonjun Date: Mon, 16 Dec 2024 09:21:07 +0900 Subject: [PATCH 3/8] Create .gitattributes Mark test files as linguist-vendored --- .gitattributes | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitattributes diff --git a/.gitattributes 
b/.gitattributes new file mode 100644 index 0000000..d2f31ef --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +tests/test_files/** linguist-vendored From c168703d5e32d1e920945c1e3dda3a0c0f76eb27 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Mon, 16 Dec 2024 11:41:39 +0800 Subject: [PATCH 4/8] Pass the kwargs to _convert method when converting an url file --- src/markitdown/_markitdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..0866dac 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1003,7 +1003,7 @@ class MarkItDown: self._append_ext(extensions, g) # Convert - result = self._convert(temp_path, extensions, url=response.url) + result = self._convert(temp_path, extensions, url=response.url, **kwargs) # Clean up finally: try: From d66ef5fccaa49b9a1b95e5e043ebc44f8a88f5a4 Mon Sep 17 00:00:00 2001 From: Soulter <905617992@qq.com> Date: Mon, 16 Dec 2024 12:08:51 +0800 Subject: [PATCH 5/8] Update README to introduce the customized mlm_prompt --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index df7189d..3eac77b 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ result = markitdown.convert("test.xlsx") print(result.text_content) ``` -You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client. +You can also configure markitdown to use Large Language Models to describe images. To do so you must provide `mlm_client` and `mlm_model` parameters to the MarkItDown object, according to your specific client. ```python from markitdown import MarkItDown @@ -51,6 +51,13 @@ result = md.convert("example.jpg") print(result.text_content) ``` +The prompt used to describe images can be customized by providing the `mlm_prompt` parameter.
+ +```python +# ... +result = md.convert("example.jpg", mlm_prompt="Customized prompt") +``` + ## Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a From 695100d5d83e62be9810216756d9e7012b22a12e Mon Sep 17 00:00:00 2001 From: narumi Date: Mon, 16 Dec 2024 13:16:00 +0800 Subject: [PATCH 6/8] Support specifying YouTube transcript language --- src/markitdown/_markitdown.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..645d230 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -344,8 +344,11 @@ class YouTubeConverter(DocumentConverter): assert isinstance(params["v"][0], str) video_id = str(params["v"][0]) try: + youtube_transcript_languages = kwargs.get( + "youtube_transcript_languages", ("en",) + ) # Must be a single transcript. - transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore + transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore # Alternative formatting: # formatter = TextFormatter() @@ -1003,7 +1006,7 @@ class MarkItDown: self._append_ext(extensions, g) # Convert - result = self._convert(temp_path, extensions, url=response.url) + result = self._convert(temp_path, extensions, url=response.url, **kwargs) # Clean up finally: try: From 013b022427d274d4b61ed0d6bef9cefe59ccb375 Mon Sep 17 00:00:00 2001 From: Michele Adduci Date: Mon, 16 Dec 2024 13:08:15 +0100 Subject: [PATCH 7/8] Added Docker Image for using markitdown in a sandboxed environment --- .dockerignore | 1 + Dockerfile | 12 ++++++++++++ README.md | 7 +++++++ 3 files changed, 20 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..f59ec20 --- /dev/null 
+++ b/.dockerignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3dd6100 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.13-alpine + +USER root + +# Runtime dependency +RUN apk add --no-cache ffmpeg + +RUN pip install markitdown + +USER 10000:10000 + +ENTRYPOINT [ "markitdown" ] diff --git a/README.md b/README.md index df7189d..50b5a80 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,13 @@ result = md.convert("example.jpg") print(result.text_content) ``` +You can also use the project as a Docker image: + +```sh +docker build -t markitdown:latest . +docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md +``` + ## Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a From 5fc03b64152ca9021b537e57a845701d5506a10e Mon Sep 17 00:00:00 2001 From: Michele Adduci Date: Mon, 16 Dec 2024 13:11:13 +0100 Subject: [PATCH 8/8] Added UID as argument --- Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3dd6100..492ad8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,10 @@ RUN apk add --no-cache ffmpeg RUN pip install markitdown -USER 10000:10000 +# Default USERID and GROUPID +ARG USERID=10000 +ARG GROUPID=10000 + +USER $USERID:$GROUPID ENTRYPOINT [ "markitdown" ]