1 Commits

Author SHA1 Message Date
Josh Bradley
33a0cd8efe small formatting change 2025-01-14 18:04:14 -05:00
7 changed files with 152 additions and 250 deletions

View File

@@ -17,7 +17,6 @@ RUN pip install markitdown
# Default USERID and GROUPID # Default USERID and GROUPID
ARG USERID=10000 ARG USERID=10000
ARG GROUPID=10000 ARG GROUPID=10000
USER $USERID:$GROUPID USER $USERID:$GROUPID
ENTRYPOINT [ "markitdown" ] ENTRYPOINT [ "markitdown" ]

View File

@@ -33,20 +33,12 @@ Or use `-o` to specify the output file:
markitdown path-to-file.pdf -o document.md markitdown path-to-file.pdf -o document.md
``` ```
To use Document Intelligence conversion:
```bash
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
```
You can also pipe content: You can also pipe content:
```bash ```bash
cat path-to-file.pdf | markitdown cat path-to-file.pdf | markitdown
``` ```
More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0)
### Python API ### Python API
Basic usage in Python: Basic usage in Python:
@@ -59,16 +51,6 @@ result = md.convert("test.xlsx")
print(result.text_content) print(result.text_content)
``` ```
Document Intelligence conversion in Python:
```python
from markitdown import MarkItDown
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
result = md.convert("test.pdf")
print(result.text_content)
```
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
```python ```python
@@ -87,6 +69,42 @@ print(result.text_content)
docker build -t markitdown:latest . docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
``` ```
<details>
<summary>Batch Processing Multiple Files</summary>
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
```python convert.py
from markitdown import MarkItDown
from openai import OpenAI
import os
client = OpenAI(api_key="your-api-key-here")
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
for file in files_to_convert:
print(f"\nConverting {file}...")
try:
md_file = os.path.splitext(file)[0] + '.md'
result = md.convert(file)
with open(md_file, 'w') as f:
f.write(result.text_content)
print(f"Successfully converted {file} to {md_file}")
except Exception as e:
print(f"Error converting {file}: {str(e)}")
print("\nAll conversions completed!")
```
1. Save the script above as `convert.py`.
2. Place the script in the same directory as your files.
3. Install the required packages, such as `openai`.
4. Run the script:
```bash
python convert.py
```
Note that the original files remain unchanged; new Markdown files are created with the same base name.
</details>
## Contributing ## Contributing

View File

@@ -42,8 +42,6 @@ dependencies = [
"pathvalidate", "pathvalidate",
"charset-normalizer", "charset-normalizer",
"openai", "openai",
"azure-ai-documentintelligence",
"azure-identity"
] ]
[project.urls] [project.urls]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.0.2" __version__ = "0.0.1a3"

View File

@@ -51,46 +51,22 @@ def main():
help="show the version number and exit", help="show the version number and exit",
) )
parser.add_argument("filename", nargs="?")
parser.add_argument( parser.add_argument(
"-o", "-o",
"--output", "--output",
help="Output file name. If not provided, output is written to stdout.", help="Output file name. If not provided, output is written to stdout.",
) )
parser.add_argument(
"-d",
"--use-docintel",
action="store_true",
help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
)
parser.add_argument(
"-e",
"--endpoint",
type=str,
help="Document Intelligence Endpoint. Required if using Document Intelligence.",
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args() args = parser.parse_args()
if args.use_docintel: if args.filename is None:
if args.endpoint is None: markitdown = MarkItDown()
raise ValueError( result = markitdown.convert_stream(sys.stdin.buffer)
"Document Intelligence Endpoint is required when using Document Intelligence." _handle_output(args, result)
)
elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
else: else:
markitdown = MarkItDown() markitdown = MarkItDown()
if args.filename is None:
result = markitdown.convert_stream(sys.stdin.buffer)
else:
result = markitdown.convert(args.filename) result = markitdown.convert(args.filename)
_handle_output(args, result)
_handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult): def _handle_output(args, result: DocumentConverterResult):

View File

@@ -17,7 +17,7 @@ from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from pathlib import Path from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, filterwarnings from warnings import warn, resetwarnings, catch_warnings
import mammoth import mammoth
import markdownify import markdownify
@@ -33,32 +33,23 @@ import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from charset_normalizer import from_path from charset_normalizer import from_path
# Azure imports
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest,
AnalyzeResult,
DocumentAnalysisFeature,
)
from azure.identity import DefaultAzureCredential
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown"
# Override mimetype for csv to fix issue on windows
mimetypes.add_type("text/csv", ".csv")
# Optional Transcription support # Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False IS_AUDIO_TRANSCRIPTION_CAPABLE = False
filterwarnings("ignore", message=r".*Couldn\'t find ffmpeg or avconv.*", module="pydub")
try: try:
import pydub # Using warnings' catch_warnings to catch
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
finally:
resetwarnings()
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
@@ -84,14 +75,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
# Explicitly cast options to the expected type if necessary # Explicitly cast options to the expected type if necessary
super().__init__(**options) super().__init__(**options)
def convert_hn( def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
self,
n: int,
el: Any,
text: str,
convert_as_inline: Optional[bool] = False,
**kwargs,
) -> str:
"""Same as usual, but be sure to start with a new line""" """Same as usual, but be sure to start with a new line"""
if not convert_as_inline: if not convert_as_inline:
if not re.search(r"^\n", text): if not re.search(r"^\n", text):
@@ -99,9 +83,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
def convert_a( def convert_a(self, el: Any, text: str, convert_as_inline: bool):
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
):
"""Same as usual converter, but removes Javascript links and escapes URIs.""" """Same as usual converter, but removes Javascript links and escapes URIs."""
prefix, suffix, text = markdownify.chomp(text) # type: ignore prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text: if not text:
@@ -137,9 +119,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
else text else text
) )
def convert_img( def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
) -> str:
"""Same as usual converter, but removes data URIs""" """Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or "" alt = el.attrs.get("alt", None) or ""
@@ -224,7 +204,7 @@ class HtmlConverter(DocumentConverter):
return result return result
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
"""Helper function that converts an HTML string.""" """Helper function that converts and HTML string."""
# Parse the string # Parse the string
soup = BeautifulSoup(html_content, "html.parser") soup = BeautifulSoup(html_content, "html.parser")
@@ -243,9 +223,6 @@ class HtmlConverter(DocumentConverter):
assert isinstance(webpage_text, str) assert isinstance(webpage_text, str)
# remove leading and trailing \n
webpage_text = webpage_text.strip()
return DocumentConverterResult( return DocumentConverterResult(
title=None if soup.title is None else soup.title.string, title=None if soup.title is None else soup.title.string,
text_content=webpage_text, text_content=webpage_text,
@@ -794,35 +771,6 @@ class PptxConverter(HtmlConverter):
Converts PPTX files to Markdown. Supports heading, tables and images with alt text. Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
""" """
def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
):
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed alt text for this image with less than 50 words."
image_base64 = base64.b64encode(image_blob).decode("utf-8")
data_uri = f"data:{content_type};base64,{image_base64}"
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": data_uri,
},
},
{"type": "text", "text": prompt},
],
}
]
response = llm_client.chat.completions.create(
model=llm_model, messages=messages
)
return response.choices[0].message.content
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PPTX # Bail if not a PPTX
extension = kwargs.get("file_extension", "") extension = kwargs.get("file_extension", "")
@@ -843,38 +791,17 @@ class PptxConverter(HtmlConverter):
# Pictures # Pictures
if self._is_picture(shape): if self._is_picture(shape):
# https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
alt_text = ""
llm_description = None try:
alt_text = None alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
except Exception:
llm_client = kwargs.get("llm_client") pass
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
try:
llm_description = self._get_llm_description(
llm_client,
llm_model,
shape.image.blob,
shape.image.content_type,
)
except Exception:
# Unable to describe with LLM
pass
if not llm_description:
try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
"descr", ""
)
except Exception:
# Unable to get alt text
pass
# A placeholder name # A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg" filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += ( md_content += (
"\n![" "\n!["
+ (llm_description or alt_text or shape.name) + (alt_text if alt_text else shape.name)
+ "](" + "]("
+ filename + filename
+ ")\n" + ")\n"
@@ -967,6 +894,18 @@ class MediaConverter(DocumentConverter):
def _get_metadata(self, local_path, exiftool_path=None): def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path: if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
return None return None
else: else:
try: try:
@@ -1069,14 +1008,6 @@ class Mp3Converter(WavConverter):
handle, temp_path = tempfile.mkstemp(suffix=".wav") handle, temp_path = tempfile.mkstemp(suffix=".wav")
os.close(handle) os.close(handle)
try: try:
# Check if pydub defaulted to ffmpeg
if pydub.AudioSegment.converter == "ffmpeg" and not shutil.which(
"ffmpeg"
):
warn(
"pydub: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work",
RuntimeWarning,
)
sound = pydub.AudioSegment.from_mp3(local_path) sound = pydub.AudioSegment.from_mp3(local_path)
sound.export(temp_path, format="wav") sound.export(temp_path, format="wav")
@@ -1387,74 +1318,6 @@ class ZipConverter(DocumentConverter):
) )
class DocumentIntelligenceConverter(DocumentConverter):
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
def __init__(
self,
endpoint: str,
api_version: str = "2024-07-31-preview",
):
self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(
endpoint=self.endpoint,
api_version=self.api_version,
credential=DefaultAzureCredential(),
)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if extension is not supported by Document Intelligence
extension = kwargs.get("file_extension", "")
docintel_extensions = [
".pdf",
".docx",
".xlsx",
".pptx",
".html",
".jpeg",
".jpg",
".png",
".bmp",
".tiff",
".heif",
]
if extension.lower() not in docintel_extensions:
return None
# Get the bytestring for the local path
with open(local_path, "rb") as f:
file_bytes = f.read()
# Certain document analysis features are not available for some file types (.xlsx, .pptx, .html)
if extension.lower() in [".xlsx", ".pptx", ".html"]:
analysis_features = []
else:
analysis_features = [
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
]
# Extract the text using Azure Document Intelligence
poller = self.doc_intel_client.begin_analyze_document(
model_id="prebuilt-layout",
body=AnalyzeDocumentRequest(bytes_source=file_bytes),
features=analysis_features,
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
)
result: AnalyzeResult = poller.result()
# remove comments from the markdown content generated by Doc Intelligence and append to markdown string
markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
return DocumentConverterResult(
title=None,
text_content=markdown_text,
)
class FileConversionException(BaseException): class FileConversionException(BaseException):
pass pass
@@ -1474,7 +1337,6 @@ class MarkItDown:
llm_model: Optional[str] = None, llm_model: Optional[str] = None,
style_map: Optional[str] = None, style_map: Optional[str] = None,
exiftool_path: Optional[str] = None, exiftool_path: Optional[str] = None,
docintel_endpoint: Optional[str] = None,
# Deprecated # Deprecated
mlm_client: Optional[Any] = None, mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None, mlm_model: Optional[str] = None,
@@ -1487,26 +1349,34 @@ class MarkItDown:
if exiftool_path is None: if exiftool_path is None:
exiftool_path = os.environ.get("EXIFTOOL_PATH") exiftool_path = os.environ.get("EXIFTOOL_PATH")
# Still none? Check well-known paths # Handle deprecation notices
if exiftool_path is None: #############################
candidate = shutil.which("exiftool") if mlm_client is not None:
if candidate: if llm_client is None:
candidate = os.path.abspath(candidate) warn(
if any( "'mlm_client' is deprecated, and was renamed 'llm_client'.",
d == os.path.dirname(candidate) DeprecationWarning,
for d in [ )
"/usr/bin", llm_client = mlm_client
"/usr/local/bin", mlm_client = None
"/opt", else:
"/opt/bin", raise ValueError(
"/opt/local/bin", "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
"/opt/homebrew/bin", )
"C:\\Windows\\System32",
"C:\\Program Files", if mlm_model is not None:
"C:\\Program Files (x86)", if llm_model is None:
] warn(
): "'mlm_model' is deprecated, and was renamed 'llm_model'.",
exiftool_path = candidate DeprecationWarning,
)
llm_model = mlm_model
mlm_model = None
else:
raise ValueError(
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
)
#############################
self._llm_client = llm_client self._llm_client = llm_client
self._llm_model = llm_model self._llm_model = llm_model
@@ -1536,12 +1406,6 @@ class MarkItDown:
self.register_page_converter(ZipConverter()) self.register_page_converter(ZipConverter())
self.register_page_converter(OutlookMsgConverter()) self.register_page_converter(OutlookMsgConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
if docintel_endpoint is not None:
self.register_page_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
)
def convert( def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs ) -> DocumentConverterResult: # TODO: deal with kwargs
@@ -1744,8 +1608,6 @@ class MarkItDown:
ext = ext.strip() ext = ext.strip()
if ext == "": if ext == "":
return return
if ext in extensions:
return
# if ext not in extensions: # if ext not in extensions:
extensions.append(ext) extensions.append(ext)

View File

@@ -6,6 +6,8 @@ import shutil
import pytest import pytest
import requests import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown from markitdown import MarkItDown
skip_remote = ( skip_remote = (
@@ -275,6 +277,18 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed", reason="do not run if exiftool is not installed",
) )
def test_markitdown_exiftool() -> None: def test_markitdown_exiftool() -> None:
# Test that automatic discovery of exiftool emits a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test explicitly setting the location of exiftool # Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool) markitdown = MarkItDown(exiftool_path=which_exiftool)
@@ -292,6 +306,40 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content assert target in result.text_content
def test_markitdown_deprecation() -> None:
try:
with catch_warnings(record=True) as w:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client)
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_client == test_client
finally:
resetwarnings()
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown(mlm_model="gpt-4o")
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_model == "gpt-4o"
finally:
resetwarnings()
try:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
assert False
except ValueError:
pass
try:
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
assert False
except ValueError:
pass
@pytest.mark.skipif( @pytest.mark.skipif(
skip_llm, skip_llm,
reason="do not run llm tests without a key", reason="do not run llm tests without a key",
@@ -316,4 +364,5 @@ if __name__ == "__main__":
# test_markitdown_remote() # test_markitdown_remote()
# test_markitdown_local() # test_markitdown_local()
test_markitdown_exiftool() test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm() # test_markitdown_llm()