6 Commits

Author SHA1 Message Date
Adam Fourney
e58bc486ee Added missing comma. 2025-03-07 16:18:47 -08:00
afourney
81ef601c09 Removed deprecation and other warnings. (#1105) 2025-03-07 16:17:03 -08:00
afourney
518b12c1fb Addresses #1068 (#1101) 2025-03-07 15:46:30 -08:00
Adam Fourney
8eaf5a1da9 Clean up README.md 2025-03-05 21:35:08 -08:00
afourney
38c924793c Bump version (#1095) 2025-03-05 21:30:56 -08:00
afourney
b9526d5e47 Bump version. (#1075) 2025-02-28 07:30:46 -08:00
5 changed files with 34 additions and 143 deletions

View File

@@ -87,42 +87,6 @@ print(result.text_content)
docker build -t markitdown:latest . docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
``` ```
<details>
<summary>Batch Processing Multiple Files</summary>
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
```python convert.py
from markitdown import MarkItDown
from openai import OpenAI
import os
client = OpenAI(api_key="your-api-key-here")
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
for file in files_to_convert:
print(f"\nConverting {file}...")
try:
md_file = os.path.splitext(file)[0] + '.md'
result = md.convert(file)
with open(md_file, 'w') as f:
f.write(result.text_content)
print(f"Successfully converted {file} to {md_file}")
except Exception as e:
print(f"Error converting {file}: {str(e)}")
print("\nAll conversions completed!")
```
2. Place the script in the same directory as your files
3. Install the required packages, such as `openai`
4. Run the script: `python convert.py`
Note that the original files remain unchanged; new markdown files are created with the same base name.
</details>
## Contributing ## Contributing

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.0.1a4" __version__ = "0.0.2"

View File

@@ -3,7 +3,6 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
import argparse import argparse
import sys import sys
import shutil
from textwrap import dedent from textwrap import dedent
from .__about__ import __version__ from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult from ._markitdown import MarkItDown, DocumentConverterResult
@@ -75,8 +74,6 @@ def main():
parser.add_argument("filename", nargs="?") parser.add_argument("filename", nargs="?")
args = parser.parse_args() args = parser.parse_args()
which_exiftool = shutil.which("exiftool")
if args.use_docintel: if args.use_docintel:
if args.endpoint is None: if args.endpoint is None:
raise ValueError( raise ValueError(
@@ -84,11 +81,9 @@ def main():
) )
elif args.filename is None: elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.") raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown( markitdown = MarkItDown(docintel_endpoint=args.endpoint)
exiftool_path=which_exiftool, docintel_endpoint=args.endpoint
)
else: else:
markitdown = MarkItDown(exiftool_path=which_exiftool) markitdown = MarkItDown()
if args.filename is None: if args.filename is None:
result = markitdown.convert_stream(sys.stdin.buffer) result = markitdown.convert_stream(sys.stdin.buffer)

View File

@@ -17,7 +17,7 @@ from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from pathlib import Path from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings from warnings import warn, filterwarnings
import mammoth import mammoth
import markdownify import markdownify
@@ -51,21 +51,14 @@ mimetypes.add_type("text/csv", ".csv")
# Optional Transcription support # Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False IS_AUDIO_TRANSCRIPTION_CAPABLE = False
filterwarnings("ignore", message=r".*Couldn\'t find ffmpeg or avconv.*", module="pydub")
try: try:
# Using warnings' catch_warnings to catch import pydub
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
finally:
resetwarnings()
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
@@ -974,18 +967,6 @@ class MediaConverter(DocumentConverter):
def _get_metadata(self, local_path, exiftool_path=None): def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path: if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
return None return None
else: else:
try: try:
@@ -1088,6 +1069,14 @@ class Mp3Converter(WavConverter):
handle, temp_path = tempfile.mkstemp(suffix=".wav") handle, temp_path = tempfile.mkstemp(suffix=".wav")
os.close(handle) os.close(handle)
try: try:
# Check if pydub defaulted to ffmpeg
if pydub.AudioSegment.converter == "ffmpeg" and not shutil.which(
"ffmpeg"
):
warn(
"pydub: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work",
RuntimeWarning,
)
sound = pydub.AudioSegment.from_mp3(local_path) sound = pydub.AudioSegment.from_mp3(local_path)
sound.export(temp_path, format="wav") sound.export(temp_path, format="wav")
@@ -1498,34 +1487,26 @@ class MarkItDown:
if exiftool_path is None: if exiftool_path is None:
exiftool_path = os.environ.get("EXIFTOOL_PATH") exiftool_path = os.environ.get("EXIFTOOL_PATH")
# Handle deprecation notices # Still none? Check well-known paths
############################# if exiftool_path is None:
if mlm_client is not None: candidate = shutil.which("exiftool")
if llm_client is None: if candidate:
warn( candidate = os.path.abspath(candidate)
"'mlm_client' is deprecated, and was renamed 'llm_client'.", if any(
DeprecationWarning, d == os.path.dirname(candidate)
) for d in [
llm_client = mlm_client "/usr/bin",
mlm_client = None "/usr/local/bin",
else: "/opt",
raise ValueError( "/opt/bin",
"'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead." "/opt/local/bin",
) "/opt/homebrew/bin",
"C:\\Windows\\System32",
if mlm_model is not None: "C:\\Program Files",
if llm_model is None: "C:\\Program Files (x86)",
warn( ]
"'mlm_model' is deprecated, and was renamed 'llm_model'.", ):
DeprecationWarning, exiftool_path = candidate
)
llm_model = mlm_model
mlm_model = None
else:
raise ValueError(
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
)
#############################
self._llm_client = llm_client self._llm_client = llm_client
self._llm_model = llm_model self._llm_model = llm_model

View File

@@ -6,8 +6,6 @@ import shutil
import pytest import pytest
import requests import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown from markitdown import MarkItDown
skip_remote = ( skip_remote = (
@@ -277,18 +275,6 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed", reason="do not run if exiftool is not installed",
) )
def test_markitdown_exiftool() -> None: def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test explicitly setting the location of exiftool # Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool") which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool) markitdown = MarkItDown(exiftool_path=which_exiftool)
@@ -306,40 +292,6 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content assert target in result.text_content
def test_markitdown_deprecation() -> None:
try:
with catch_warnings(record=True) as w:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client)
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_client == test_client
finally:
resetwarnings()
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown(mlm_model="gpt-4o")
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_model == "gpt-4o"
finally:
resetwarnings()
try:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
assert False
except ValueError:
pass
try:
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
assert False
except ValueError:
pass
@pytest.mark.skipif( @pytest.mark.skipif(
skip_llm, skip_llm,
reason="do not run llm tests without a key", reason="do not run llm tests without a key",
@@ -364,5 +316,4 @@ if __name__ == "__main__":
# test_markitdown_remote() # test_markitdown_remote()
# test_markitdown_local() # test_markitdown_local()
test_markitdown_exiftool() test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm() # test_markitdown_llm()