3 Commits

Author SHA1 Message Date
Adam Fourney
e58bc486ee Added missing comma. 2025-03-07 16:18:47 -08:00
afourney
81ef601c09 Removed deprecation and other warnings. (#1105) 2025-03-07 16:17:03 -08:00
afourney
518b12c1fb Addresses #1068 (#1101) 2025-03-07 15:46:30 -08:00
4 changed files with 34 additions and 107 deletions

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.0.1"
__version__ = "0.0.2"

View File

@@ -3,7 +3,6 @@
# SPDX-License-Identifier: MIT
import argparse
import sys
import shutil
from textwrap import dedent
from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult
@@ -75,8 +74,6 @@ def main():
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
which_exiftool = shutil.which("exiftool")
if args.use_docintel:
if args.endpoint is None:
raise ValueError(
@@ -84,11 +81,9 @@ def main():
)
elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(
exiftool_path=which_exiftool, docintel_endpoint=args.endpoint
)
markitdown = MarkItDown(docintel_endpoint=args.endpoint)
else:
markitdown = MarkItDown(exiftool_path=which_exiftool)
markitdown = MarkItDown()
if args.filename is None:
result = markitdown.convert_stream(sys.stdin.buffer)

View File

@@ -17,7 +17,7 @@ from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
from warnings import warn, filterwarnings
import mammoth
import markdownify
@@ -51,21 +51,14 @@ mimetypes.add_type("text/csv", ".csv")
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
filterwarnings("ignore", message=r".*Couldn\'t find ffmpeg or avconv.*", module="pydub")
try:
# Using warnings' catch_warnings to catch
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import pydub
import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
pass
finally:
resetwarnings()
# Optional YouTube transcription support
try:
@@ -974,18 +967,6 @@ class MediaConverter(DocumentConverter):
def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
return None
else:
try:
@@ -1088,6 +1069,14 @@ class Mp3Converter(WavConverter):
handle, temp_path = tempfile.mkstemp(suffix=".wav")
os.close(handle)
try:
# Check if pydub defaulted to ffmpeg
if pydub.AudioSegment.converter == "ffmpeg" and not shutil.which(
"ffmpeg"
):
warn(
"pydub: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work",
RuntimeWarning,
)
sound = pydub.AudioSegment.from_mp3(local_path)
sound.export(temp_path, format="wav")
@@ -1498,34 +1487,26 @@ class MarkItDown:
if exiftool_path is None:
exiftool_path = os.environ.get("EXIFTOOL_PATH")
# Handle deprecation notices
#############################
if mlm_client is not None:
if llm_client is None:
warn(
"'mlm_client' is deprecated, and was renamed 'llm_client'.",
DeprecationWarning,
)
llm_client = mlm_client
mlm_client = None
else:
raise ValueError(
"'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
)
if mlm_model is not None:
if llm_model is None:
warn(
"'mlm_model' is deprecated, and was renamed 'llm_model'.",
DeprecationWarning,
)
llm_model = mlm_model
mlm_model = None
else:
raise ValueError(
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
)
#############################
# Still none? Check well-known paths
if exiftool_path is None:
candidate = shutil.which("exiftool")
if candidate:
candidate = os.path.abspath(candidate)
if any(
d == os.path.dirname(candidate)
for d in [
"/usr/bin",
"/usr/local/bin",
"/opt",
"/opt/bin",
"/opt/local/bin",
"/opt/homebrew/bin",
"C:\\Windows\\System32",
"C:\\Program Files",
"C:\\Program Files (x86)",
]
):
exiftool_path = candidate
self._llm_client = llm_client
self._llm_model = llm_model

View File

@@ -6,8 +6,6 @@ import shutil
import pytest
import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown
skip_remote = (
@@ -277,18 +275,6 @@ def test_markitdown_local() -> None:
reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool)
@@ -306,40 +292,6 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content
def test_markitdown_deprecation() -> None:
try:
with catch_warnings(record=True) as w:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client)
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_client == test_client
finally:
resetwarnings()
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown(mlm_model="gpt-4o")
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_model == "gpt-4o"
finally:
resetwarnings()
try:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
assert False
except ValueError:
pass
try:
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
assert False
except ValueError:
pass
@pytest.mark.skipif(
skip_llm,
reason="do not run llm tests without a key",
@@ -364,5 +316,4 @@ if __name__ == "__main__":
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm()