1 Commits

Author SHA1 Message Date
Adam Fourney
326d17b802 Bump version. 2025-02-28 07:29:12 -08:00
5 changed files with 143 additions and 34 deletions

View File

@@ -87,6 +87,42 @@ print(result.text_content)
docker build -t markitdown:latest . docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
``` ```
<details>
<summary>Batch Processing Multiple Files</summary>
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
```python convert.py
# Batch-convert every supported file in the current directory to Markdown.
# Each input "name.ext" produces "name.md" next to it; inputs are not modified.
import os

from markitdown import MarkItDown
from openai import OpenAI

client = OpenAI(api_key="your-api-key-here")
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")

supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
# str.endswith accepts a tuple, so one call checks all extensions (case-insensitively).
files_to_convert = [
    name for name in os.listdir('.') if name.lower().endswith(supported_extensions)
]

for file in files_to_convert:
    print(f"\nConverting {file}...")
    try:
        md_file = os.path.splitext(file)[0] + '.md'
        result = md.convert(file)
        # Pin UTF-8: the platform default encoding (e.g. cp1252 on Windows)
        # can fail on, or mangle, non-ASCII characters in the converted text.
        with open(md_file, 'w', encoding='utf-8') as out:
            out.write(result.text_content)
        print(f"Successfully converted {file} to {md_file}")
    except Exception as e:
        # Deliberately broad: one failing file must not stop the whole batch.
        print(f"Error converting {file}: {e}")

print("\nAll conversions completed!")
```
1. Save the script above as `convert.py`.
2. Place the script in the same directory as your files.
3. Install the required packages (e.g., `pip install markitdown openai`).
4. Run the script: `python convert.py`
Note that original files will remain unchanged and new markdown files are created with the same base name.
</details>
## Contributing ## Contributing

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.0.2" __version__ = "0.0.1a5"

View File

@@ -3,6 +3,7 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
import argparse import argparse
import sys import sys
import shutil
from textwrap import dedent from textwrap import dedent
from .__about__ import __version__ from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult from ._markitdown import MarkItDown, DocumentConverterResult
@@ -74,6 +75,8 @@ def main():
parser.add_argument("filename", nargs="?") parser.add_argument("filename", nargs="?")
args = parser.parse_args() args = parser.parse_args()
which_exiftool = shutil.which("exiftool")
if args.use_docintel: if args.use_docintel:
if args.endpoint is None: if args.endpoint is None:
raise ValueError( raise ValueError(
@@ -81,9 +84,11 @@ def main():
) )
elif args.filename is None: elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.") raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(docintel_endpoint=args.endpoint) markitdown = MarkItDown(
exiftool_path=which_exiftool, docintel_endpoint=args.endpoint
)
else: else:
markitdown = MarkItDown() markitdown = MarkItDown(exiftool_path=which_exiftool)
if args.filename is None: if args.filename is None:
result = markitdown.convert_stream(sys.stdin.buffer) result = markitdown.convert_stream(sys.stdin.buffer)

View File

@@ -17,7 +17,7 @@ from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from pathlib import Path from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, filterwarnings from warnings import warn, resetwarnings, catch_warnings
import mammoth import mammoth
import markdownify import markdownify
@@ -51,14 +51,21 @@ mimetypes.add_type("text/csv", ".csv")
# Optional Transcription support # Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False IS_AUDIO_TRANSCRIPTION_CAPABLE = False
filterwarnings("ignore", message=r".*Couldn\'t find ffmpeg or avconv.*", module="pydub")
try: try:
import pydub # Using warnings' catch_warnings to catch
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
finally:
resetwarnings()
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
@@ -967,6 +974,18 @@ class MediaConverter(DocumentConverter):
def _get_metadata(self, local_path, exiftool_path=None): def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path: if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
return None return None
else: else:
try: try:
@@ -1069,14 +1088,6 @@ class Mp3Converter(WavConverter):
handle, temp_path = tempfile.mkstemp(suffix=".wav") handle, temp_path = tempfile.mkstemp(suffix=".wav")
os.close(handle) os.close(handle)
try: try:
# Check if pydub defaulted to ffmpeg
if pydub.AudioSegment.converter == "ffmpeg" and not shutil.which(
"ffmpeg"
):
warn(
"pydub: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work",
RuntimeWarning,
)
sound = pydub.AudioSegment.from_mp3(local_path) sound = pydub.AudioSegment.from_mp3(local_path)
sound.export(temp_path, format="wav") sound.export(temp_path, format="wav")
@@ -1487,26 +1498,34 @@ class MarkItDown:
if exiftool_path is None: if exiftool_path is None:
exiftool_path = os.environ.get("EXIFTOOL_PATH") exiftool_path = os.environ.get("EXIFTOOL_PATH")
# Still none? Check well-known paths # Handle deprecation notices
if exiftool_path is None: #############################
candidate = shutil.which("exiftool") if mlm_client is not None:
if candidate: if llm_client is None:
candidate = os.path.abspath(candidate) warn(
if any( "'mlm_client' is deprecated, and was renamed 'llm_client'.",
d == os.path.dirname(candidate) DeprecationWarning,
for d in [ )
"/usr/bin", llm_client = mlm_client
"/usr/local/bin", mlm_client = None
"/opt", else:
"/opt/bin", raise ValueError(
"/opt/local/bin", "'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
"/opt/homebrew/bin", )
"C:\\Windows\\System32",
"C:\\Program Files", if mlm_model is not None:
"C:\\Program Files (x86)", if llm_model is None:
] warn(
): "'mlm_model' is deprecated, and was renamed 'llm_model'.",
exiftool_path = candidate DeprecationWarning,
)
llm_model = mlm_model
mlm_model = None
else:
raise ValueError(
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
)
#############################
self._llm_client = llm_client self._llm_client = llm_client
self._llm_model = llm_model self._llm_model = llm_model

View File

@@ -6,6 +6,8 @@ import shutil
import pytest import pytest
import requests import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown from markitdown import MarkItDown
skip_remote = ( skip_remote = (
@@ -275,6 +277,18 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed", reason="do not run if exiftool is not installed",
) )
def test_markitdown_exiftool() -> None: def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test explicitly setting the location of exiftool # Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool) markitdown = MarkItDown(exiftool_path=which_exiftool)
@@ -292,6 +306,40 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content assert target in result.text_content
def test_markitdown_deprecation() -> None:
    """Check handling of the deprecated ``mlm_client`` / ``mlm_model`` arguments.

    Passing only the deprecated name must emit exactly one
    ``DeprecationWarning`` and store the value under the new ``llm_*``
    attribute. Passing both the deprecated and the new name must raise
    ``ValueError``.
    """
    # Local import so this block does not depend on a change to the
    # file-level import list.
    from warnings import simplefilter

    test_client = object()

    # catch_warnings() restores the filter state on exit, so no global
    # resetwarnings() is needed (it would also wipe filters configured by
    # the test runner for every later test).
    with catch_warnings(record=True) as caught:
        # Make recording independent of whatever filters are active.
        simplefilter("always", DeprecationWarning)
        markitdown = MarkItDown(mlm_client=test_client)
    assert len(caught) == 1
    assert caught[0].category is DeprecationWarning
    assert markitdown._llm_client is test_client

    with catch_warnings(record=True) as caught:
        simplefilter("always", DeprecationWarning)
        markitdown = MarkItDown(mlm_model="gpt-4o")
    assert len(caught) == 1
    assert caught[0].category is DeprecationWarning
    assert markitdown._llm_model == "gpt-4o"

    # Supplying both the old and the new spelling is ambiguous and must fail.
    with pytest.raises(ValueError):
        MarkItDown(mlm_client=test_client, llm_client=test_client)

    with pytest.raises(ValueError):
        MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
@pytest.mark.skipif( @pytest.mark.skipif(
skip_llm, skip_llm,
reason="do not run llm tests without a key", reason="do not run llm tests without a key",
@@ -316,4 +364,5 @@ if __name__ == "__main__":
# test_markitdown_remote() # test_markitdown_remote()
# test_markitdown_local() # test_markitdown_local()
test_markitdown_exiftool() test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm() # test_markitdown_llm()