diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d2f31ef --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +tests/test_files/** linguist-vendored diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..492ad8a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.13-alpine + +USER root + +# Runtime dependency +RUN apk add --no-cache ffmpeg + +RUN pip install markitdown + +# Default USERID and GROUPID +ARG USERID=10000 +ARG GROUPID=10000 + +USER $USERID:$GROUPID + +ENTRYPOINT [ "markitdown" ] diff --git a/README.md b/README.md index dba35b6..8aa5e2a 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,8 @@ You can pipe content to standard input by omitting the argument: cat path-to-file.pdf | markitdown ``` -You can also configure markitdown to use Large Language Models to describe images. To do so you must provide llm_client and llm_model parameters to MarkItDown object, according to your specific client. +You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the `llm_client` and `llm_model` parameters to the `MarkItDown` object, according to your specific client. + ```python from markitdown import MarkItDown @@ -71,6 +72,13 @@ result = md.convert("example.jpg") print(result.text_content) ``` +You can also use the project as a Docker image: + +```sh +docker build -t markitdown:latest . +docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md +``` + ## Contributing This project welcomes contributions and suggestions. 
Most contributions require you to agree to a diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 4ee1990..cfbb243 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -15,6 +15,7 @@ import traceback import zipfile from typing import Any, Dict, List, Optional, Union from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse +from warnings import catch_warnings import mammoth import markdownify @@ -31,7 +32,13 @@ from charset_normalizer import from_path # Optional Transcription support try: - import pydub + # Using warnings' catch_warnings to catch + # pydub's warning of ffmpeg or avconv missing + with catch_warnings(record=True) as w: + import pydub + + if w: + raise ModuleNotFoundError import speech_recognition as sr IS_AUDIO_TRANSCRIPTION_CAPABLE = True @@ -344,8 +351,11 @@ class YouTubeConverter(DocumentConverter): assert isinstance(params["v"][0], str) video_id = str(params["v"][0]) try: + youtube_transcript_languages = kwargs.get( + "youtube_transcript_languages", ("en",) + ) # Must be a single transcript. - transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore + transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore # Alternative formatting: # formatter = TextFormatter() @@ -1153,7 +1163,7 @@ class MarkItDown: self._append_ext(extensions, g) # Convert - result = self._convert(temp_path, extensions, url=response.url) + result = self._convert(temp_path, extensions, url=response.url, **kwargs) # Clean up finally: try: