diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..f59ec20
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+*
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..492ad8a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.13-alpine
+
+USER root
+
+# Runtime dependency
+RUN apk add --no-cache ffmpeg
+
+RUN pip install markitdown
+
+# Default USERID and GROUPID
+ARG USERID=10000
+ARG GROUPID=10000
+
+USER $USERID:$GROUPID
+
+ENTRYPOINT [ "markitdown" ]
diff --git a/README.md b/README.md
index f0f2e83..7079dbf 100644
--- a/README.md
+++ b/README.md
@@ -59,19 +59,26 @@ You can pipe content to standard input by omitting the argument:
 cat path-to-file.pdf | markitdown
 ```
 
+You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the `llm_client` and `llm_model` parameters to the MarkItDown object, according to your specific client.
 
-You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client.
 ```python
 from markitdown import MarkItDown
 from openai import OpenAI
 
 client = OpenAI()
-md = MarkItDown(mlm_client=client, mlm_model="gpt-4o")
+md = MarkItDown(llm_client=client, llm_model="gpt-4o")
 result = md.convert("example.jpg")
 print(result.text_content)
 ```
 
+You can also use the project as a Docker image:
+
+```sh
+docker build -t markitdown:latest .
+docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
+```
+
 ## Contributing
 
 This project welcomes contributions and suggestions. Most contributions require you to agree to a
@@ -88,15 +95,18 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
 
 ### Running Tests
 
-To run the tests for this project, use the following command:
+To run the tests, first install `hatch` using `pip` or one of the other methods described [here](https://hatch.pypa.io/dev/install).
 
 ```sh
+pip install hatch
 hatch shell
 hatch test
 ```
 
 ### Running Pre-commit Checks
 
+Please run the pre-commit checks before submitting a PR.
+
 ```sh
 pre-commit run --all-files
 ```
diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py
index 6c8a672..2d53173 100644
--- a/src/markitdown/__main__.py
+++ b/src/markitdown/__main__.py
@@ -2,21 +2,15 @@
 #
 # SPDX-License-Identifier: MIT
 import sys
+import argparse
 
 from ._markitdown import MarkItDown
 
 
 def main():
-    if len(sys.argv) == 1:
-        markitdown = MarkItDown()
-        result = markitdown.convert_stream(sys.stdin.buffer)
-        print(result.text_content)
-    elif len(sys.argv) == 2:
-        markitdown = MarkItDown()
-        result = markitdown.convert(sys.argv[1])
-        print(result.text_content)
-    else:
-        sys.stderr.write(
-            """
+    parser = argparse.ArgumentParser(
+        description="Convert various file formats to markdown.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        usage="""
 SYNTAX:
 
     markitdown <OPTIONAL: FILENAME>
@@ -33,9 +27,20 @@ EXAMPLE:
 
     markitdown example.pdf
 
     OR
 
     cat example.pdf | markitdown
 
     OR
 
     markitdown < example.pdf
 
-""".strip()
-            + "\n"
-        )
+""".strip(),
+    )
+
+    parser.add_argument("filename", nargs="?")
+    args = parser.parse_args()
+
+    if args.filename is None:
+        markitdown = MarkItDown()
+        result = markitdown.convert_stream(sys.stdin.buffer)
+        print(result.text_content)
+    else:
+        markitdown = MarkItDown()
+        result = markitdown.convert(args.filename)
+        print(result.text_content)
 
 
 if __name__ == "__main__":
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 693e1cd..3937cbc 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -351,8 +351,11 @@ class YouTubeConverter(DocumentConverter):
         assert isinstance(params["v"][0], str)
         video_id = str(params["v"][0])
         try:
+            youtube_transcript_languages = kwargs.get(
+                "youtube_transcript_languages", ("en",)
+            )
             # Must be a single transcript.
-            transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
+            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
             transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
             # Alternative formatting:
             # formatter = TextFormatter()
@@ -851,7 +854,7 @@ class Mp3Converter(WavConverter):
 
 class ImageConverter(MediaConverter):
     """
-    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
""" def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: @@ -881,17 +884,17 @@ class ImageConverter(MediaConverter): md_content += f"{f}: {metadata[f]}\n" # Try describing the image with GPTV - mlm_client = kwargs.get("mlm_client") - mlm_model = kwargs.get("mlm_model") - if mlm_client is not None and mlm_model is not None: + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + if llm_client is not None and llm_model is not None: md_content += ( "\n# Description:\n" - + self._get_mlm_description( + + self._get_llm_description( local_path, extension, - mlm_client, - mlm_model, - prompt=kwargs.get("mlm_prompt"), + llm_client, + llm_model, + prompt=kwargs.get("llm_prompt"), ).strip() + "\n" ) @@ -901,11 +904,11 @@ class ImageConverter(MediaConverter): text_content=md_content, ) - def _get_mlm_description(self, local_path, extension, client, model, prompt=None): + def _get_llm_description(self, local_path, extension, client, model, prompt=None): if prompt is None or prompt.strip() == "": prompt = "Write a detailed caption for this image." - sys.stderr.write(f"MLM Prompt:\n{prompt}\n") + sys.stderr.write(f"llm Prompt:\n{prompt}\n") data_uri = "" with open(local_path, "rb") as image_file: @@ -1067,8 +1070,8 @@ class MarkItDown: def __init__( self, requests_session: Optional[requests.Session] = None, - mlm_client: Optional[Any] = None, - mlm_model: Optional[Any] = None, + llm_client: Optional[Any] = None, + llm_model: Optional[Any] = None, style_map: Optional[str] = None, ): if requests_session is None: @@ -1076,8 +1079,8 @@ class MarkItDown: else: self._requests_session = requests_session - self._mlm_client = mlm_client - self._mlm_model = mlm_model + self._llm_client = llm_client + self._llm_model = llm_model self._style_map = style_map self._page_converters: List[DocumentConverter] = [] @@ -1222,7 +1225,7 @@ class MarkItDown: self._append_ext(extensions, g) # Convert - result = self._convert(temp_path, extensions, url=response.url) + result = self._convert(temp_path, extensions, url=response.url, **kwargs) # Clean up finally: try: @@ -1249,11 +1252,12 @@ class MarkItDown: _kwargs.update({"file_extension": ext}) # Copy any additional global options - if "mlm_client" not in _kwargs and self._mlm_client is not None: - _kwargs["mlm_client"] = self._mlm_client + if "llm_client" not in _kwargs and self._llm_client is not None: + _kwargs["llm_client"] = self._llm_client + + if "llm_model" not in _kwargs and self._llm_model is not None: + _kwargs["llm_model"] = self._llm_model - if "mlm_model" not in _kwargs and self._mlm_model is not None: - _kwargs["mlm_model"] = self._mlm_model # Add the list of converters for nested processing _kwargs["_parent_converters"] = self._page_converters @@ -1295,8 +1299,7 @@ class MarkItDown: if ext == "": return # if ext not in extensions: - if True: - extensions.append(ext) + extensions.append(ext) def _guess_ext_magic(self, path): """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""