Merge branch 'main' into main
This commit is contained in:
1
.dockerignore
Normal file
1
.dockerignore
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*
|
||||||
16
Dockerfile
Normal file
16
Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
FROM python:3.13-alpine
|
||||||
|
|
||||||
|
USER root
|
||||||
|
|
||||||
|
# Runtime dependency
|
||||||
|
RUN apk add --no-cache ffmpeg
|
||||||
|
|
||||||
|
RUN pip install markitdown
|
||||||
|
|
||||||
|
# Default USERID and GROUPID
|
||||||
|
ARG USERID=10000
|
||||||
|
ARG GROUPID=10000
|
||||||
|
|
||||||
|
USER $USERID:$GROUPID
|
||||||
|
|
||||||
|
ENTRYPOINT [ "markitdown" ]
|
||||||
17
README.md
17
README.md
@@ -59,24 +59,24 @@ You can pipe content to standard input by omitting the argument:
|
|||||||
cat path-to-file.pdf | markitdown
|
cat path-to-file.pdf | markitdown
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the `llm_client` and `llm_model` parameters to the `MarkItDown` object, according to your specific client.
|
||||||
|
|
||||||
You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the `mlm_client` and `mlm_model` parameters to the `MarkItDown` object, according to your specific client.
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from markitdown import MarkItDown
|
from markitdown import MarkItDown
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
|
|
||||||
client = OpenAI()
|
client = OpenAI()
|
||||||
md = MarkItDown(mlm_client=client, mlm_model="gpt-4o")
|
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
|
||||||
result = md.convert("example.jpg")
|
result = md.convert("example.jpg")
|
||||||
print(result.text_content)
|
print(result.text_content)
|
||||||
```
|
```
|
||||||
|
|
||||||
The prompt used for describing images can be customized by providing the `mlm_prompt` parameter.
|
You can also use the project as a Docker image:
|
||||||
|
|
||||||
```python
|
```sh
|
||||||
# ...
|
docker build -t markitdown:latest .
|
||||||
result = md.convert("example.jpg", mlm_prompt="Customized prompt")
|
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
|
||||||
```
|
```
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
@@ -95,15 +95,18 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
|
|||||||
|
|
||||||
### Running Tests
|
### Running Tests
|
||||||
|
|
||||||
To run the tests for this project, use the following command:
|
To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install).
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
|
pip install hatch
|
||||||
hatch shell
|
hatch shell
|
||||||
hatch test
|
hatch test
|
||||||
```
|
```
|
||||||
|
|
||||||
### Running Pre-commit Checks
|
### Running Pre-commit Checks
|
||||||
|
|
||||||
|
Please run the pre-commit checks before submitting a PR.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
pre-commit run --all-files
|
pre-commit run --all-files
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -793,7 +793,7 @@ class Mp3Converter(WavConverter):
|
|||||||
|
|
||||||
class ImageConverter(MediaConverter):
|
class ImageConverter(MediaConverter):
|
||||||
"""
|
"""
|
||||||
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
|
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
|
||||||
@@ -823,17 +823,17 @@ class ImageConverter(MediaConverter):
|
|||||||
md_content += f"{f}: {metadata[f]}\n"
|
md_content += f"{f}: {metadata[f]}\n"
|
||||||
|
|
||||||
# Try describing the image with GPTV
|
# Try describing the image with GPTV
|
||||||
mlm_client = kwargs.get("mlm_client")
|
llm_client = kwargs.get("llm_client")
|
||||||
mlm_model = kwargs.get("mlm_model")
|
llm_model = kwargs.get("llm_model")
|
||||||
if mlm_client is not None and mlm_model is not None:
|
if llm_client is not None and llm_model is not None:
|
||||||
md_content += (
|
md_content += (
|
||||||
"\n# Description:\n"
|
"\n# Description:\n"
|
||||||
+ self._get_mlm_description(
|
+ self._get_llm_description(
|
||||||
local_path,
|
local_path,
|
||||||
extension,
|
extension,
|
||||||
mlm_client,
|
llm_client,
|
||||||
mlm_model,
|
llm_model,
|
||||||
prompt=kwargs.get("mlm_prompt"),
|
prompt=kwargs.get("llm_prompt"),
|
||||||
).strip()
|
).strip()
|
||||||
+ "\n"
|
+ "\n"
|
||||||
)
|
)
|
||||||
@@ -843,11 +843,11 @@ class ImageConverter(MediaConverter):
|
|||||||
text_content=md_content,
|
text_content=md_content,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
|
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
|
||||||
if prompt is None or prompt.strip() == "":
|
if prompt is None or prompt.strip() == "":
|
||||||
prompt = "Write a detailed caption for this image."
|
prompt = "Write a detailed caption for this image."
|
||||||
|
|
||||||
sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
|
sys.stderr.write(f"llm Prompt:\n{prompt}\n")
|
||||||
|
|
||||||
data_uri = ""
|
data_uri = ""
|
||||||
with open(local_path, "rb") as image_file:
|
with open(local_path, "rb") as image_file:
|
||||||
@@ -1009,8 +1009,8 @@ class MarkItDown:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
requests_session: Optional[requests.Session] = None,
|
requests_session: Optional[requests.Session] = None,
|
||||||
mlm_client: Optional[Any] = None,
|
llm_client: Optional[Any] = None,
|
||||||
mlm_model: Optional[Any] = None,
|
llm_model: Optional[Any] = None,
|
||||||
style_map: Optional[str] = None,
|
style_map: Optional[str] = None,
|
||||||
):
|
):
|
||||||
if requests_session is None:
|
if requests_session is None:
|
||||||
@@ -1018,8 +1018,8 @@ class MarkItDown:
|
|||||||
else:
|
else:
|
||||||
self._requests_session = requests_session
|
self._requests_session = requests_session
|
||||||
|
|
||||||
self._mlm_client = mlm_client
|
self._llm_client = llm_client
|
||||||
self._mlm_model = mlm_model
|
self._llm_model = llm_model
|
||||||
self._style_map = style_map
|
self._style_map = style_map
|
||||||
|
|
||||||
self._page_converters: List[DocumentConverter] = []
|
self._page_converters: List[DocumentConverter] = []
|
||||||
@@ -1190,11 +1190,12 @@ class MarkItDown:
|
|||||||
_kwargs.update({"file_extension": ext})
|
_kwargs.update({"file_extension": ext})
|
||||||
|
|
||||||
# Copy any additional global options
|
# Copy any additional global options
|
||||||
if "mlm_client" not in _kwargs and self._mlm_client is not None:
|
if "llm_client" not in _kwargs and self._llm_client is not None:
|
||||||
_kwargs["mlm_client"] = self._mlm_client
|
_kwargs["llm_client"] = self._llm_client
|
||||||
|
|
||||||
|
if "llm_model" not in _kwargs and self._llm_model is not None:
|
||||||
|
_kwargs["llm_model"] = self._llm_model
|
||||||
|
|
||||||
if "mlm_model" not in _kwargs and self._mlm_model is not None:
|
|
||||||
_kwargs["mlm_model"] = self._mlm_model
|
|
||||||
# Add the list of converters for nested processing
|
# Add the list of converters for nested processing
|
||||||
_kwargs["_parent_converters"] = self._page_converters
|
_kwargs["_parent_converters"] = self._page_converters
|
||||||
|
|
||||||
@@ -1236,8 +1237,7 @@ class MarkItDown:
|
|||||||
if ext == "":
|
if ext == "":
|
||||||
return
|
return
|
||||||
# if ext not in extensions:
|
# if ext not in extensions:
|
||||||
if True:
|
extensions.append(ext)
|
||||||
extensions.append(ext)
|
|
||||||
|
|
||||||
def _guess_ext_magic(self, path):
|
def _guess_ext_magic(self, path):
|
||||||
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
|
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
|
||||||
|
|||||||
Reference in New Issue
Block a user