Merge branch 'microsoft:main' into main
.dockerignore (new file)
@@ -0,0 +1 @@
+*
Dockerfile (new file)
@@ -0,0 +1,16 @@
+FROM python:3.13-alpine
+
+USER root
+
+# Runtime dependency
+RUN apk add --no-cache ffmpeg
+
+RUN pip install markitdown
+
+# Default USERID and GROUPID
+ARG USERID=10000
+ARG GROUPID=10000
+
+USER $USERID:$GROUPID
+
+ENTRYPOINT [ "markitdown" ]
README.md
@@ -59,19 +59,26 @@ You can pipe content to standard input by omitting the argument:
 cat path-to-file.pdf | markitdown
 ```
 
-You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client.
+You can also configure markitdown to use Large Language Models to describe images. To do so you must provide `llm_client` and `llm_model` parameters to MarkItDown object, according to your specific client.
 
 ```python
 from markitdown import MarkItDown
 from openai import OpenAI
 
 client = OpenAI()
-md = MarkItDown(mlm_client=client, mlm_model="gpt-4o")
+md = MarkItDown(llm_client=client, llm_model="gpt-4o")
 result = md.convert("example.jpg")
 print(result.text_content)
 ```
 
+You can also use the project as Docker Image:
+
+```sh
+docker build -t markitdown:latest .
+docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
+```
+
 ## Contributing
 
 This project welcomes contributions and suggestions. Most contributions require you to agree to a
@@ -88,15 +95,18 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
 
 ### Running Tests
 
-To run the tests for this project, use the following command:
+To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install).
 
 ```sh
+pip install hatch
 hatch shell
 hatch test
 ```
 
 ### Running Pre-commit Checks
 
 Please run the pre-commit checks before submitting a PR.
 
 ```sh
 pre-commit run --all-files
 ```
 
src/markitdown/__main__.py
@@ -2,21 +2,15 @@
 #
 # SPDX-License-Identifier: MIT
 import sys
+import argparse
 from ._markitdown import MarkItDown
 
 
 def main():
-    if len(sys.argv) == 1:
-        markitdown = MarkItDown()
-        result = markitdown.convert_stream(sys.stdin.buffer)
-        print(result.text_content)
-    elif len(sys.argv) == 2:
-        markitdown = MarkItDown()
-        result = markitdown.convert(sys.argv[1])
-        print(result.text_content)
-    else:
-        sys.stderr.write(
-            """
+    parser = argparse.ArgumentParser(
+        description="Convert various file formats to markdown.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        usage="""
 SYNTAX:
 
     markitdown <OPTIONAL: FILENAME>
@@ -33,9 +27,20 @@ EXAMPLE:
     OR
 
     markitdown < example.pdf
-""".strip()
-            + "\n"
-        )
+""".strip(),
+    )
+
+    parser.add_argument("filename", nargs="?")
+    args = parser.parse_args()
+
+    if args.filename is None:
+        markitdown = MarkItDown()
+        result = markitdown.convert_stream(sys.stdin.buffer)
+        print(result.text_content)
+    else:
+        markitdown = MarkItDown()
+        result = markitdown.convert(args.filename)
+        print(result.text_content)
 
 
 if __name__ == "__main__":
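For reference, a standalone sketch of the `nargs="?"` behavior the new CLI path relies on: the positional `filename` becomes optional, and its absence (`None`) is what routes conversion to stdin.

```python
import argparse

parser = argparse.ArgumentParser(usage="markitdown <OPTIONAL: FILENAME>")
parser.add_argument("filename", nargs="?")

print(parser.parse_args([]))               # Namespace(filename=None)  -> read from stdin
print(parser.parse_args(["example.pdf"]))  # Namespace(filename='example.pdf')
```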
src/markitdown/_markitdown.py
@@ -351,8 +351,11 @@ class YouTubeConverter(DocumentConverter):
         assert isinstance(params["v"][0], str)
         video_id = str(params["v"][0])
         try:
+            youtube_transcript_languages = kwargs.get(
+                "youtube_transcript_languages", ("en",)
+            )
             # Must be a single transcript.
-            transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
+            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages)  # type: ignore
             transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
             # Alternative formatting:
             # formatter = TextFormatter()
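A minimal usage sketch for the new option, relying on the `**kwargs` forwarding added further down in this diff; the video URL is a placeholder and the language codes are only examples:

```python
from markitdown import MarkItDown

md = MarkItDown()
# Prefer a German transcript, fall back to English (placeholder video URL).
result = md.convert(
    "https://www.youtube.com/watch?v=VIDEO_ID",
    youtube_transcript_languages=("de", "en"),
)
print(result.text_content)
```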
@@ -851,7 +854,7 @@ class Mp3Converter(WavConverter):
 
 class ImageConverter(MediaConverter):
     """
-    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
+    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
     """
 
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
@@ -881,17 +884,17 @@ class ImageConverter(MediaConverter):
                     md_content += f"{f}: {metadata[f]}\n"
 
         # Try describing the image with GPTV
-        mlm_client = kwargs.get("mlm_client")
-        mlm_model = kwargs.get("mlm_model")
-        if mlm_client is not None and mlm_model is not None:
+        llm_client = kwargs.get("llm_client")
+        llm_model = kwargs.get("llm_model")
+        if llm_client is not None and llm_model is not None:
             md_content += (
                 "\n# Description:\n"
-                + self._get_mlm_description(
+                + self._get_llm_description(
                     local_path,
                     extension,
-                    mlm_client,
-                    mlm_model,
-                    prompt=kwargs.get("mlm_prompt"),
+                    llm_client,
+                    llm_model,
+                    prompt=kwargs.get("llm_prompt"),
                 ).strip()
                 + "\n"
             )
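A short sketch of the per-call prompt override this enables; the image path and prompt text are placeholders, and an OpenAI client is assumed only because the README example above uses one:

```python
from markitdown import MarkItDown
from openai import OpenAI

md = MarkItDown(llm_client=OpenAI(), llm_model="gpt-4o")
# Per-call kwargs reach ImageConverter.convert, so the default caption prompt can be replaced.
result = md.convert("example.jpg", llm_prompt="List any visible text in this image.")
print(result.text_content)
```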
@@ -901,11 +904,11 @@ class ImageConverter(MediaConverter):
             text_content=md_content,
         )
 
-    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
+    def _get_llm_description(self, local_path, extension, client, model, prompt=None):
         if prompt is None or prompt.strip() == "":
             prompt = "Write a detailed caption for this image."
 
-        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
+        sys.stderr.write(f"llm Prompt:\n{prompt}\n")
 
         data_uri = ""
         with open(local_path, "rb") as image_file:
@@ -1067,8 +1070,8 @@ class MarkItDown:
     def __init__(
         self,
         requests_session: Optional[requests.Session] = None,
-        mlm_client: Optional[Any] = None,
-        mlm_model: Optional[Any] = None,
+        llm_client: Optional[Any] = None,
+        llm_model: Optional[Any] = None,
         style_map: Optional[str] = None,
     ):
         if requests_session is None:
@@ -1076,8 +1079,8 @@ class MarkItDown:
         else:
             self._requests_session = requests_session
 
-        self._mlm_client = mlm_client
-        self._mlm_model = mlm_model
+        self._llm_client = llm_client
+        self._llm_model = llm_model
         self._style_map = style_map
 
         self._page_converters: List[DocumentConverter] = []
@@ -1222,7 +1225,7 @@ class MarkItDown:
                 self._append_ext(extensions, g)
 
             # Convert
-            result = self._convert(temp_path, extensions, url=response.url)
+            result = self._convert(temp_path, extensions, url=response.url, **kwargs)
         # Clean up
         finally:
             try:
@@ -1249,11 +1252,12 @@
                 _kwargs.update({"file_extension": ext})
 
             # Copy any additional global options
-            if "mlm_client" not in _kwargs and self._mlm_client is not None:
-                _kwargs["mlm_client"] = self._mlm_client
-            if "mlm_model" not in _kwargs and self._mlm_model is not None:
-                _kwargs["mlm_model"] = self._mlm_model
+            if "llm_client" not in _kwargs and self._llm_client is not None:
+                _kwargs["llm_client"] = self._llm_client
+
+            if "llm_model" not in _kwargs and self._llm_model is not None:
+                _kwargs["llm_model"] = self._llm_model
+
             # Add the list of converters for nested processing
             _kwargs["_parent_converters"] = self._page_converters
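A sketch of how these instance-level defaults interact with per-call kwargs, assuming an OpenAI client as in the README example; the model names are illustrative. Options set on the MarkItDown instance are injected into each conversion unless the caller already supplied them:

```python
from markitdown import MarkItDown
from openai import OpenAI

md = MarkItDown(llm_client=OpenAI(), llm_model="gpt-4o")

# Uses the instance-level llm_client / llm_model copied in above.
md.convert("example.jpg")

# An explicit kwarg wins over the instance-level default, for this call only.
md.convert("example.jpg", llm_model="gpt-4o-mini")
```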
@@ -1295,8 +1299,7 @@
         if ext == "":
             return
-        # if ext not in extensions:
-        if True:
-            extensions.append(ext)
+        extensions.append(ext)
 
     def _guess_ext_magic(self, path):
         """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""