15 Commits

Author SHA1 Message Date
Adam Fourney
e58bc486ee Added missing comma. 2025-03-07 16:18:47 -08:00
afourney
81ef601c09 Removed deprecation and other warnings. (#1105) 2025-03-07 16:17:03 -08:00
afourney
518b12c1fb Addresses #1068 (#1101) 2025-03-07 15:46:30 -08:00
Adam Fourney
8eaf5a1da9 Clean up README.md 2025-03-05 21:35:08 -08:00
afourney
38c924793c Bump version (#1095) 2025-03-05 21:30:56 -08:00
afourney
b9526d5e47 Bump version. (#1075) 2025-02-28 07:30:46 -08:00
Hieu Lam
519fe172aa Unable to convert HTML to Markdown (#1072)
* feat: issue where inherited function from `markdownify.MarkdownConverter` doesn't have `current_tags` leading to error using `kwargs`, also set default value for `convert_as_inline`
2025-02-28 00:57:41 -08:00
Adam Fourney
abe9752438 Bumped version 2025-02-10 16:01:17 -08:00
wunde005
73ba69d8cd For csv files mimetypes.guess_type is returning "application/vnd.ms-excel" on windows causing an invalid mime type in plaintextconverter. In reference to issue: https://github.com/microsoft/markitdown/issues/150 (#273) 2025-02-08 20:58:13 -08:00
Werner Robitza
2a4f7bb6a8 fix: argparse CLI option ordering, fixes #268 (#290)
* fix: argparse CLI option ordering, fixes #268
* Fixed formatting.
2025-02-08 20:50:38 -08:00
masquare
7cf5e0bb23 feat(pptx): support image description with LLM for pptx files (#306) 2025-02-08 20:37:34 -08:00
James Hickey
3090917a49 Typo fixed (#270) 2025-02-08 20:30:13 -08:00
ZeyuTeng96
7bea2672a0 remove leading and trailing \n for HtmlConverter (#262) 2025-02-08 20:28:35 -08:00
KennyZhang1
bf6a15e9b5 Kennyzhang/docintel docs (#312)
* updated docs to include doc intelligence

* include reference to doc intel setup docs
2025-01-31 22:23:26 -08:00
KennyZhang1
bfde857420 Add support for conversion via Document Intelligence (#303)
* added cli params for doc intel

* added DocumentIntelligenceConverter class implementation

* initialized doc intel client instance field

* added isolated doc_intel main conversion function

* temp fix for ContentFormat import bug

* ran tests for docintel and offline for many filetypes

* push doc intel converter to the top of the stack

* formatting changes

* modified project toml file
2025-01-24 14:09:32 -08:00
6 changed files with 249 additions and 152 deletions

View File

@@ -33,12 +33,20 @@ Or use `-o` to specify the output file:
 markitdown path-to-file.pdf -o document.md
 ```
 
+To use Document Intelligence conversion:
+
+```bash
+markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
+```
+
 You can also pipe content:
 
 ```bash
 cat path-to-file.pdf | markitdown
 ```
 
+More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0)
+
 ### Python API
 
 Basic usage in Python:
@@ -51,6 +59,16 @@ result = md.convert("test.xlsx")
 print(result.text_content)
 ```
 
+Document Intelligence conversion in Python:
+
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
+result = md.convert("test.pdf")
+print(result.text_content)
+```
+
 To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
 
 ```python
@@ -69,42 +87,6 @@ print(result.text_content)
 docker build -t markitdown:latest .
 docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 ```
 
-<details>
-
-<summary>Batch Processing Multiple Files</summary>
-
-This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
-
-```python convert.py
-from markitdown import MarkItDown
-from openai import OpenAI
-import os
-
-client = OpenAI(api_key="your-api-key-here")
-md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
-supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
-files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
-for file in files_to_convert:
-    print(f"\nConverting {file}...")
-    try:
-        md_file = os.path.splitext(file)[0] + '.md'
-        result = md.convert(file)
-        with open(md_file, 'w') as f:
-            f.write(result.text_content)
-        print(f"Successfully converted {file} to {md_file}")
-    except Exception as e:
-        print(f"Error converting {file}: {str(e)}")
-
-print("\nAll conversions completed!")
-```
-
-2. Place the script in the same directory as your files
-3. Install required packages: like openai
-4. Run script ```bash python convert.py ```
-
-Note that original files will remain unchanged and new markdown files are created with the same base name.
-
-</details>
 
 ## Contributing

View File

@@ -42,6 +42,8 @@ dependencies = [
"pathvalidate", "pathvalidate",
"charset-normalizer", "charset-normalizer",
"openai", "openai",
"azure-ai-documentintelligence",
"azure-identity"
] ]
[project.urls] [project.urls]

View File

@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.1a3"
+__version__ = "0.0.2"

View File

@@ -51,22 +51,46 @@ def main():
help="show the version number and exit", help="show the version number and exit",
) )
parser.add_argument("filename", nargs="?")
parser.add_argument( parser.add_argument(
"-o", "-o",
"--output", "--output",
help="Output file name. If not provided, output is written to stdout.", help="Output file name. If not provided, output is written to stdout.",
) )
parser.add_argument(
"-d",
"--use-docintel",
action="store_true",
help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
)
parser.add_argument(
"-e",
"--endpoint",
type=str,
help="Document Intelligence Endpoint. Required if using Document Intelligence.",
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args() args = parser.parse_args()
if args.filename is None: if args.use_docintel:
markitdown = MarkItDown() if args.endpoint is None:
result = markitdown.convert_stream(sys.stdin.buffer) raise ValueError(
_handle_output(args, result) "Document Intelligence Endpoint is required when using Document Intelligence."
)
elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
else: else:
markitdown = MarkItDown() markitdown = MarkItDown()
if args.filename is None:
result = markitdown.convert_stream(sys.stdin.buffer)
else:
result = markitdown.convert(args.filename) result = markitdown.convert(args.filename)
_handle_output(args, result)
_handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult): def _handle_output(args, result: DocumentConverterResult):

View File

@@ -17,7 +17,7 @@ from xml.dom import minidom
 from typing import Any, Dict, List, Optional, Union
 from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
-from warnings import warn, resetwarnings, catch_warnings
+from warnings import warn, filterwarnings
 
 import mammoth
 import markdownify
@@ -33,23 +33,32 @@ import requests
 from bs4 import BeautifulSoup
 from charset_normalizer import from_path
 
+# Azure imports
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import (
+    AnalyzeDocumentRequest,
+    AnalyzeResult,
+    DocumentAnalysisFeature,
+)
+from azure.identity import DefaultAzureCredential
+
+# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
+# This constant is a temporary fix until the bug is resolved.
+CONTENT_FORMAT = "markdown"
+
+# Override mimetype for csv to fix issue on windows
+mimetypes.add_type("text/csv", ".csv")
+
 # Optional Transcription support
 IS_AUDIO_TRANSCRIPTION_CAPABLE = False
+filterwarnings("ignore", message=r".*Couldn\'t find ffmpeg or avconv.*", module="pydub")
 try:
-    # Using warnings' catch_warnings to catch
-    # pydub's warning of ffmpeg or avconv missing
-    with catch_warnings(record=True) as w:
-        import pydub
-
-        if w:
-            raise ModuleNotFoundError
-
+    import pydub
     import speech_recognition as sr
 
     IS_AUDIO_TRANSCRIPTION_CAPABLE = True
 except ModuleNotFoundError:
     pass
-finally:
-    resetwarnings()
 
 # Optional YouTube transcription support
 try:
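The `mimetypes.add_type("text/csv", ".csv")` override added above is the fix from commit 73ba69d8cd: on Windows, the registry-backed defaults can report `application/vnd.ms-excel` for `.csv` files, which the plain-text converter then rejects. A minimal, standard-library-only sketch of the effect (the file name is illustrative):

```python
import mimetypes

# Register the override the same way the module does at import time.
mimetypes.add_type("text/csv", ".csv")

mime, _encoding = mimetypes.guess_type("report.csv")  # hypothetical file name
print(mime)  # "text/csv" on every platform once the mapping is registered
```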
@@ -75,7 +84,14 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
 
-    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
+    def convert_hn(
+        self,
+        n: int,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ) -> str:
         """Same as usual, but be sure to start with a new line"""
         if not convert_as_inline:
             if not re.search(r"^\n", text):
@@ -83,7 +99,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
         return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
 
-    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
+    def convert_a(
+        self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
+    ):
         """Same as usual converter, but removes Javascript links and escapes URIs."""
         prefix, suffix, text = markdownify.chomp(text)  # type: ignore
         if not text:
@@ -119,7 +137,9 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
             else text
         )
 
-    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
+    def convert_img(
+        self, el: Any, text: str, convert_as_inline: Optional[bool] = False, **kwargs
+    ) -> str:
         """Same as usual converter, but removes data URIs"""
 
         alt = el.attrs.get("alt", None) or ""
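The three signature changes above come from commit 519fe172aa: a defaulted `convert_as_inline` plus a `**kwargs` catch-all lets the overridden handlers accept whatever extra keyword arguments the installed markdownify version passes through. A small illustrative sketch (my example, not from the repo; `_CustomMarkdownify` is a private helper, shown only to demonstrate the behavior described in the docstrings):

```python
from markitdown._markitdown import _CustomMarkdownify

html = (
    "<h1>Title</h1>"
    '<p>See <a href="javascript:alert(1)">this</a> '
    'and <a href="https://example.com/a b">that</a>.</p>'
)

# Headings start on their own line, javascript: links are dropped, and URIs are escaped.
print(_CustomMarkdownify().convert(html))
```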
@@ -204,7 +224,7 @@ class HtmlConverter(DocumentConverter):
         return result
 
     def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
-        """Helper function that converts and HTML string."""
+        """Helper function that converts an HTML string."""
 
         # Parse the string
         soup = BeautifulSoup(html_content, "html.parser")
@@ -223,6 +243,9 @@ class HtmlConverter(DocumentConverter):
         assert isinstance(webpage_text, str)
 
+        # remove leading and trailing \n
+        webpage_text = webpage_text.strip()
+
         return DocumentConverterResult(
             title=None if soup.title is None else soup.title.string,
             text_content=webpage_text,
@@ -771,6 +794,35 @@ class PptxConverter(HtmlConverter):
     Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
     """
 
+    def _get_llm_description(
+        self, llm_client, llm_model, image_blob, content_type, prompt=None
+    ):
+        if prompt is None or prompt.strip() == "":
+            prompt = "Write a detailed alt text for this image with less than 50 words."
+
+        image_base64 = base64.b64encode(image_blob).decode("utf-8")
+        data_uri = f"data:{content_type};base64,{image_base64}"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": data_uri,
+                        },
+                    },
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+
+        response = llm_client.chat.completions.create(
+            model=llm_model, messages=messages
+        )
+        return response.choices[0].message.content
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a PPTX
         extension = kwargs.get("file_extension", "")
@@ -791,17 +843,38 @@ class PptxConverter(HtmlConverter):
                 # Pictures
                 if self._is_picture(shape):
                     # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
-                    alt_text = ""
-                    try:
-                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
-                    except Exception:
-                        pass
+
+                    llm_description = None
+                    alt_text = None
+
+                    llm_client = kwargs.get("llm_client")
+                    llm_model = kwargs.get("llm_model")
+                    if llm_client is not None and llm_model is not None:
+                        try:
+                            llm_description = self._get_llm_description(
+                                llm_client,
+                                llm_model,
+                                shape.image.blob,
+                                shape.image.content_type,
+                            )
+                        except Exception:
+                            # Unable to describe with LLM
+                            pass
+
+                    if not llm_description:
+                        try:
+                            alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
+                                "descr", ""
+                            )
+                        except Exception:
+                            # Unable to get alt text
+                            pass
 
                     # A placeholder name
                     filename = re.sub(r"\W", "", shape.name) + ".jpg"
                     md_content += (
                         "\n!["
-                        + (alt_text if alt_text else shape.name)
+                        + (llm_description or alt_text or shape.name)
                         + "]("
                         + filename
                         + ")\n"
@@ -894,18 +967,6 @@ class MediaConverter(DocumentConverter):
     def _get_metadata(self, local_path, exiftool_path=None):
         if not exiftool_path:
-            which_exiftool = shutil.which("exiftool")
-            if which_exiftool:
-                warn(
-                    f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,
-
-    md = MarkItDown(exiftool_path="{which_exiftool}")
-
-This warning will be removed in future releases.
-""",
-                    DeprecationWarning,
-                )
-
             return None
         else:
             try:
@@ -1008,6 +1069,14 @@ class Mp3Converter(WavConverter):
         handle, temp_path = tempfile.mkstemp(suffix=".wav")
         os.close(handle)
         try:
+            # Check if pydub defaulted to ffmpeg
+            if pydub.AudioSegment.converter == "ffmpeg" and not shutil.which(
+                "ffmpeg"
+            ):
+                warn(
+                    "pydub: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work",
+                    RuntimeWarning,
+                )
             sound = pydub.AudioSegment.from_mp3(local_path)
             sound.export(temp_path, format="wav")
@@ -1318,6 +1387,74 @@ class ZipConverter(DocumentConverter):
         )
 
 
+class DocumentIntelligenceConverter(DocumentConverter):
+    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
+
+    def __init__(
+        self,
+        endpoint: str,
+        api_version: str = "2024-07-31-preview",
+    ):
+        self.endpoint = endpoint
+        self.api_version = api_version
+        self.doc_intel_client = DocumentIntelligenceClient(
+            endpoint=self.endpoint,
+            api_version=self.api_version,
+            credential=DefaultAzureCredential(),
+        )
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if extension is not supported by Document Intelligence
+        extension = kwargs.get("file_extension", "")
+        docintel_extensions = [
+            ".pdf",
+            ".docx",
+            ".xlsx",
+            ".pptx",
+            ".html",
+            ".jpeg",
+            ".jpg",
+            ".png",
+            ".bmp",
+            ".tiff",
+            ".heif",
+        ]
+        if extension.lower() not in docintel_extensions:
+            return None
+
+        # Get the bytestring for the local path
+        with open(local_path, "rb") as f:
+            file_bytes = f.read()
+
+        # Certain document analysis features are not available for filetypes (.xlsx, .pptx, .html)
+        if extension.lower() in [".xlsx", ".pptx", ".html"]:
+            analysis_features = []
+        else:
+            analysis_features = [
+                DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
+                DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
+                DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
+            ]
+
+        # Extract the text using Azure Document Intelligence
+        poller = self.doc_intel_client.begin_analyze_document(
+            model_id="prebuilt-layout",
+            body=AnalyzeDocumentRequest(bytes_source=file_bytes),
+            features=analysis_features,
+            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
+        )
+        result: AnalyzeResult = poller.result()
+
+        # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
+        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
+
+        return DocumentConverterResult(
+            title=None,
+            text_content=markdown_text,
+        )
+
+
 class FileConversionException(BaseException):
     pass
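Because the client above authenticates with `DefaultAzureCredential`, credentials must already be resolvable in the environment (for example via `az login`, a managed identity, or the `AZURE_CLIENT_ID`/`AZURE_TENANT_ID`/`AZURE_CLIENT_SECRET` variables). A minimal sketch of reaching this converter through the public API; the endpoint and file name are placeholders:

```python
from markitdown import MarkItDown

md = MarkItDown(docintel_endpoint="https://<your-resource>.cognitiveservices.azure.com/")
result = md.convert("contract.pdf")  # handled by DocumentIntelligenceConverter per the registration shown in a later hunk
print(result.text_content)           # prebuilt-layout output with HTML comments stripped
```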
@@ -1337,6 +1474,7 @@ class MarkItDown:
         llm_model: Optional[str] = None,
         style_map: Optional[str] = None,
         exiftool_path: Optional[str] = None,
+        docintel_endpoint: Optional[str] = None,
         # Deprecated
         mlm_client: Optional[Any] = None,
         mlm_model: Optional[str] = None,
@@ -1349,34 +1487,26 @@ class MarkItDown:
         if exiftool_path is None:
             exiftool_path = os.environ.get("EXIFTOOL_PATH")
 
-        # Handle deprecation notices
-        #############################
-        if mlm_client is not None:
-            if llm_client is None:
-                warn(
-                    "'mlm_client' is deprecated, and was renamed 'llm_client'.",
-                    DeprecationWarning,
-                )
-                llm_client = mlm_client
-                mlm_client = None
-            else:
-                raise ValueError(
-                    "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
-                )
-
-        if mlm_model is not None:
-            if llm_model is None:
-                warn(
-                    "'mlm_model' is deprecated, and was renamed 'llm_model'.",
-                    DeprecationWarning,
-                )
-                llm_model = mlm_model
-                mlm_model = None
-            else:
-                raise ValueError(
-                    "'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
-                )
-        #############################
+        # Still none? Check well-known paths
+        if exiftool_path is None:
+            candidate = shutil.which("exiftool")
+            if candidate:
+                candidate = os.path.abspath(candidate)
+                if any(
+                    d == os.path.dirname(candidate)
+                    for d in [
+                        "/usr/bin",
+                        "/usr/local/bin",
+                        "/opt",
+                        "/opt/bin",
+                        "/opt/local/bin",
+                        "/opt/homebrew/bin",
+                        "C:\\Windows\\System32",
+                        "C:\\Program Files",
+                        "C:\\Program Files (x86)",
+                    ]
+                ):
+                    exiftool_path = candidate
 
         self._llm_client = llm_client
         self._llm_model = llm_model
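With the deprecation shim removed, exiftool discovery now resolves in a fixed order: explicit argument, then the `EXIFTOOL_PATH` environment variable, then a `shutil.which` hit that is only accepted if it lives in one of the well-known directories listed above. A short sketch of the first two options (the paths are illustrative):

```python
import os
from markitdown import MarkItDown

# 1) An explicit constructor argument always wins.
md = MarkItDown(exiftool_path="/opt/homebrew/bin/exiftool")

# 2) Otherwise the environment variable is consulted before the well-known-path scan.
os.environ["EXIFTOOL_PATH"] = "/usr/local/bin/exiftool"
md = MarkItDown()
```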
@@ -1406,6 +1536,12 @@ class MarkItDown:
         self.register_page_converter(ZipConverter())
         self.register_page_converter(OutlookMsgConverter())
 
+        # Register Document Intelligence converter at the top of the stack if endpoint is provided
+        if docintel_endpoint is not None:
+            self.register_page_converter(
+                DocumentIntelligenceConverter(endpoint=docintel_endpoint)
+            )
+
     def convert(
         self, source: Union[str, requests.Response, Path], **kwargs: Any
     ) -> DocumentConverterResult:  # TODO: deal with kwargs
@@ -1608,6 +1744,8 @@ class MarkItDown:
             ext = ext.strip()
             if ext == "":
                 return
+            if ext in extensions:
+                return
 
             # if ext not in extensions:
             extensions.append(ext)

View File

@@ -6,8 +6,6 @@ import shutil
 import pytest
 import requests
 
-from warnings import catch_warnings, resetwarnings
-
 from markitdown import MarkItDown
 
 skip_remote = (
@@ -277,18 +275,6 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed", reason="do not run if exiftool is not installed",
) )
def test_markitdown_exiftool() -> None: def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test explicitly setting the location of exiftool # Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool) markitdown = MarkItDown(exiftool_path=which_exiftool)
@@ -306,40 +292,6 @@ def test_markitdown_exiftool() -> None:
     assert target in result.text_content
 
 
-def test_markitdown_deprecation() -> None:
-    try:
-        with catch_warnings(record=True) as w:
-            test_client = object()
-            markitdown = MarkItDown(mlm_client=test_client)
-            assert len(w) == 1
-            assert w[0].category is DeprecationWarning
-            assert markitdown._llm_client == test_client
-    finally:
-        resetwarnings()
-
-    try:
-        with catch_warnings(record=True) as w:
-            markitdown = MarkItDown(mlm_model="gpt-4o")
-            assert len(w) == 1
-            assert w[0].category is DeprecationWarning
-            assert markitdown._llm_model == "gpt-4o"
-    finally:
-        resetwarnings()
-
-    try:
-        test_client = object()
-        markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
-        assert False
-    except ValueError:
-        pass
-
-    try:
-        markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
-        assert False
-    except ValueError:
-        pass
-
-
 @pytest.mark.skipif(
     skip_llm,
     reason="do not run llm tests without a key",
@@ -364,5 +316,4 @@ if __name__ == "__main__":
     # test_markitdown_remote()
     # test_markitdown_local()
     test_markitdown_exiftool()
-    # test_markitdown_deprecation()
     # test_markitdown_llm()