Merge branch 'microsoft:main' into main

This commit is contained in:
Om Gupta
2024-12-17 10:33:40 +05:30
committed by GitHub
5 changed files with 74 additions and 39 deletions

1
.dockerignore Normal file
View File

@@ -0,0 +1 @@
*

16
Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
FROM python:3.13-alpine
# Package installation below requires root privileges.
USER root
# Runtime dependency
RUN apk add --no-cache ffmpeg
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir markitdown
# Default USERID and GROUPID (can be overridden with --build-arg)
ARG USERID=10000
ARG GROUPID=10000
# Drop root: the container runs markitdown as an unprivileged user.
USER $USERID:$GROUPID
ENTRYPOINT [ "markitdown" ]

View File

@@ -59,19 +59,26 @@ You can pipe content to standard input by omitting the argument:
cat path-to-file.pdf | markitdown cat path-to-file.pdf | markitdown
``` ```
You can also configure markitdown to use Large Language Models to describe images. To do so you must provide `llm_client` and `llm_model` parameters to MarkItDown object, according to your specific client.
You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client.
```python ```python
from markitdown import MarkItDown from markitdown import MarkItDown
from openai import OpenAI from openai import OpenAI
client = OpenAI() client = OpenAI()
md = MarkItDown(mlm_client=client, mlm_model="gpt-4o") md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("example.jpg") result = md.convert("example.jpg")
print(result.text_content) print(result.text_content)
``` ```
You can also use the project as a Docker image:
```sh
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```
## Contributing ## Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a This project welcomes contributions and suggestions. Most contributions require you to agree to a
@@ -88,15 +95,18 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
### Running Tests ### Running Tests
To run the tests for this project, use the following command: To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install).
```sh ```sh
pip install hatch
hatch shell hatch shell
hatch test hatch test
``` ```
### Running Pre-commit Checks ### Running Pre-commit Checks
Please run the pre-commit checks before submitting a PR.
```sh ```sh
pre-commit run --all-files pre-commit run --all-files
``` ```

View File

@@ -2,21 +2,15 @@
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
import sys import sys
import argparse
from ._markitdown import MarkItDown from ._markitdown import MarkItDown
def main(): def main():
if len(sys.argv) == 1: parser = argparse.ArgumentParser(
markitdown = MarkItDown() description="Convert various file formats to markdown.",
result = markitdown.convert_stream(sys.stdin.buffer) formatter_class=argparse.RawDescriptionHelpFormatter,
print(result.text_content) usage="""
elif len(sys.argv) == 2:
markitdown = MarkItDown()
result = markitdown.convert(sys.argv[1])
print(result.text_content)
else:
sys.stderr.write(
"""
SYNTAX: SYNTAX:
markitdown <OPTIONAL: FILENAME> markitdown <OPTIONAL: FILENAME>
@@ -33,10 +27,21 @@ EXAMPLE:
OR OR
markitdown < example.pdf markitdown < example.pdf
""".strip() """.strip(),
+ "\n"
) )
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
if args.filename is None:
markitdown = MarkItDown()
result = markitdown.convert_stream(sys.stdin.buffer)
print(result.text_content)
else:
markitdown = MarkItDown()
result = markitdown.convert(args.filename)
print(result.text_content)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -351,8 +351,11 @@ class YouTubeConverter(DocumentConverter):
assert isinstance(params["v"][0], str) assert isinstance(params["v"][0], str)
video_id = str(params["v"][0]) video_id = str(params["v"][0])
try: try:
youtube_transcript_languages = kwargs.get(
"youtube_transcript_languages", ("en",)
)
# Must be a single transcript. # Must be a single transcript.
transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
# Alternative formatting: # Alternative formatting:
# formatter = TextFormatter() # formatter = TextFormatter()
@@ -851,7 +854,7 @@ class Mp3Converter(WavConverter):
class ImageConverter(MediaConverter): class ImageConverter(MediaConverter):
""" """
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured). Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
""" """
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
@@ -881,17 +884,17 @@ class ImageConverter(MediaConverter):
md_content += f"{f}: {metadata[f]}\n" md_content += f"{f}: {metadata[f]}\n"
# Try describing the image with GPTV # Try describing the image with GPTV
mlm_client = kwargs.get("mlm_client") llm_client = kwargs.get("llm_client")
mlm_model = kwargs.get("mlm_model") llm_model = kwargs.get("llm_model")
if mlm_client is not None and mlm_model is not None: if llm_client is not None and llm_model is not None:
md_content += ( md_content += (
"\n# Description:\n" "\n# Description:\n"
+ self._get_mlm_description( + self._get_llm_description(
local_path, local_path,
extension, extension,
mlm_client, llm_client,
mlm_model, llm_model,
prompt=kwargs.get("mlm_prompt"), prompt=kwargs.get("llm_prompt"),
).strip() ).strip()
+ "\n" + "\n"
) )
@@ -901,11 +904,11 @@ class ImageConverter(MediaConverter):
text_content=md_content, text_content=md_content,
) )
def _get_mlm_description(self, local_path, extension, client, model, prompt=None): def _get_llm_description(self, local_path, extension, client, model, prompt=None):
if prompt is None or prompt.strip() == "": if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image." prompt = "Write a detailed caption for this image."
sys.stderr.write(f"MLM Prompt:\n{prompt}\n") sys.stderr.write(f"llm Prompt:\n{prompt}\n")
data_uri = "" data_uri = ""
with open(local_path, "rb") as image_file: with open(local_path, "rb") as image_file:
@@ -1067,8 +1070,8 @@ class MarkItDown:
def __init__( def __init__(
self, self,
requests_session: Optional[requests.Session] = None, requests_session: Optional[requests.Session] = None,
mlm_client: Optional[Any] = None, llm_client: Optional[Any] = None,
mlm_model: Optional[Any] = None, llm_model: Optional[Any] = None,
style_map: Optional[str] = None, style_map: Optional[str] = None,
): ):
if requests_session is None: if requests_session is None:
@@ -1076,8 +1079,8 @@ class MarkItDown:
else: else:
self._requests_session = requests_session self._requests_session = requests_session
self._mlm_client = mlm_client self._llm_client = llm_client
self._mlm_model = mlm_model self._llm_model = llm_model
self._style_map = style_map self._style_map = style_map
self._page_converters: List[DocumentConverter] = [] self._page_converters: List[DocumentConverter] = []
@@ -1222,7 +1225,7 @@ class MarkItDown:
self._append_ext(extensions, g) self._append_ext(extensions, g)
# Convert # Convert
result = self._convert(temp_path, extensions, url=response.url) result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up # Clean up
finally: finally:
try: try:
@@ -1249,11 +1252,12 @@ class MarkItDown:
_kwargs.update({"file_extension": ext}) _kwargs.update({"file_extension": ext})
# Copy any additional global options # Copy any additional global options
if "mlm_client" not in _kwargs and self._mlm_client is not None: if "llm_client" not in _kwargs and self._llm_client is not None:
_kwargs["mlm_client"] = self._mlm_client _kwargs["llm_client"] = self._llm_client
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model
if "mlm_model" not in _kwargs and self._mlm_model is not None:
_kwargs["mlm_model"] = self._mlm_model
# Add the list of converters for nested processing # Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters _kwargs["_parent_converters"] = self._page_converters
@@ -1295,7 +1299,6 @@ class MarkItDown:
if ext == "": if ext == "":
return return
# if ext not in extensions: # if ext not in extensions:
if True:
extensions.append(ext) extensions.append(ext)
def _guess_ext_magic(self, path): def _guess_ext_magic(self, path):