Set exiftool path explicitly. (#267)
This commit is contained in:
@@ -892,14 +892,25 @@ class MediaConverter(DocumentConverter):
|
|||||||
Abstract class for multi-modal media (e.g., images and audio)
|
Abstract class for multi-modal media (e.g., images and audio)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _get_metadata(self, local_path):
|
def _get_metadata(self, local_path, exiftool_path=None):
|
||||||
exiftool = shutil.which("exiftool")
|
if not exiftool_path:
|
||||||
if not exiftool:
|
which_exiftool = shutil.which("exiftool")
|
||||||
|
if which_exiftool:
|
||||||
|
warn(
|
||||||
|
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown constructor. E.g.,
|
||||||
|
|
||||||
|
md = MarkItDown(exiftool_path="{which_exiftool}")
|
||||||
|
|
||||||
|
This warning will be removed in future releases.
|
||||||
|
""",
|
||||||
|
DeprecationWarning,
|
||||||
|
)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[exiftool, "-json", local_path], capture_output=True, text=True
|
[exiftool_path, "-json", local_path], capture_output=True, text=True
|
||||||
).stdout
|
).stdout
|
||||||
return json.loads(result)[0]
|
return json.loads(result)[0]
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -920,7 +931,7 @@ class WavConverter(MediaConverter):
|
|||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
# Add metadata
|
||||||
metadata = self._get_metadata(local_path)
|
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
||||||
if metadata:
|
if metadata:
|
||||||
for f in [
|
for f in [
|
||||||
"Title",
|
"Title",
|
||||||
@@ -975,7 +986,7 @@ class Mp3Converter(WavConverter):
|
|||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
# Add metadata
|
||||||
metadata = self._get_metadata(local_path)
|
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
||||||
if metadata:
|
if metadata:
|
||||||
for f in [
|
for f in [
|
||||||
"Title",
|
"Title",
|
||||||
@@ -1036,7 +1047,7 @@ class ImageConverter(MediaConverter):
|
|||||||
md_content = ""
|
md_content = ""
|
||||||
|
|
||||||
# Add metadata
|
# Add metadata
|
||||||
metadata = self._get_metadata(local_path)
|
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
|
||||||
if metadata:
|
if metadata:
|
||||||
for f in [
|
for f in [
|
||||||
"ImageSize",
|
"ImageSize",
|
||||||
@@ -1325,6 +1336,7 @@ class MarkItDown:
|
|||||||
llm_client: Optional[Any] = None,
|
llm_client: Optional[Any] = None,
|
||||||
llm_model: Optional[str] = None,
|
llm_model: Optional[str] = None,
|
||||||
style_map: Optional[str] = None,
|
style_map: Optional[str] = None,
|
||||||
|
exiftool_path: Optional[str] = None,
|
||||||
# Deprecated
|
# Deprecated
|
||||||
mlm_client: Optional[Any] = None,
|
mlm_client: Optional[Any] = None,
|
||||||
mlm_model: Optional[str] = None,
|
mlm_model: Optional[str] = None,
|
||||||
@@ -1334,6 +1346,9 @@ class MarkItDown:
|
|||||||
else:
|
else:
|
||||||
self._requests_session = requests_session
|
self._requests_session = requests_session
|
||||||
|
|
||||||
|
if exiftool_path is None:
|
||||||
|
exiftool_path = os.environ.get("EXIFTOOL_PATH")
|
||||||
|
|
||||||
# Handle deprecation notices
|
# Handle deprecation notices
|
||||||
#############################
|
#############################
|
||||||
if mlm_client is not None:
|
if mlm_client is not None:
|
||||||
@@ -1366,6 +1381,7 @@ class MarkItDown:
|
|||||||
self._llm_client = llm_client
|
self._llm_client = llm_client
|
||||||
self._llm_model = llm_model
|
self._llm_model = llm_model
|
||||||
self._style_map = style_map
|
self._style_map = style_map
|
||||||
|
self._exiftool_path = exiftool_path
|
||||||
|
|
||||||
self._page_converters: List[DocumentConverter] = []
|
self._page_converters: List[DocumentConverter] = []
|
||||||
|
|
||||||
@@ -1549,12 +1565,15 @@ class MarkItDown:
|
|||||||
if "llm_model" not in _kwargs and self._llm_model is not None:
|
if "llm_model" not in _kwargs and self._llm_model is not None:
|
||||||
_kwargs["llm_model"] = self._llm_model
|
_kwargs["llm_model"] = self._llm_model
|
||||||
|
|
||||||
# Add the list of converters for nested processing
|
|
||||||
_kwargs["_parent_converters"] = self._page_converters
|
|
||||||
|
|
||||||
if "style_map" not in _kwargs and self._style_map is not None:
|
if "style_map" not in _kwargs and self._style_map is not None:
|
||||||
_kwargs["style_map"] = self._style_map
|
_kwargs["style_map"] = self._style_map
|
||||||
|
|
||||||
|
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
|
||||||
|
_kwargs["exiftool_path"] = self._exiftool_path
|
||||||
|
|
||||||
|
# Add the list of converters for nested processing
|
||||||
|
_kwargs["_parent_converters"] = self._page_converters
|
||||||
|
|
||||||
# If we hit an error log it and keep trying
|
# If we hit an error log it and keep trying
|
||||||
try:
|
try:
|
||||||
res = converter.convert(local_path, **_kwargs)
|
res = converter.convert(local_path, **_kwargs)
|
||||||
|
|||||||
@@ -277,9 +277,29 @@ def test_markitdown_local() -> None:
|
|||||||
reason="do not run if exiftool is not installed",
|
reason="do not run if exiftool is not installed",
|
||||||
)
|
)
|
||||||
def test_markitdown_exiftool() -> None:
|
def test_markitdown_exiftool() -> None:
|
||||||
markitdown = MarkItDown()
|
# Test that the automatic discovery of exiftool throws a warning
|
||||||
|
# and is disabled
|
||||||
|
try:
|
||||||
|
with catch_warnings(record=True) as w:
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
||||||
|
assert len(w) == 1
|
||||||
|
assert w[0].category is DeprecationWarning
|
||||||
|
assert result.text_content.strip() == ""
|
||||||
|
finally:
|
||||||
|
resetwarnings()
|
||||||
|
|
||||||
# Test JPG metadata processing
|
# Test explicitly setting the location of exiftool
|
||||||
|
which_exiftool = shutil.which("exiftool")
|
||||||
|
markitdown = MarkItDown(exiftool_path=which_exiftool)
|
||||||
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
||||||
|
for key in JPG_TEST_EXIFTOOL:
|
||||||
|
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
|
||||||
|
assert target in result.text_content
|
||||||
|
|
||||||
|
# Test setting the exiftool path through an environment variable
|
||||||
|
os.environ["EXIFTOOL_PATH"] = which_exiftool
|
||||||
|
markitdown = MarkItDown()
|
||||||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
|
||||||
for key in JPG_TEST_EXIFTOOL:
|
for key in JPG_TEST_EXIFTOOL:
|
||||||
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
|
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
|
||||||
@@ -341,8 +361,8 @@ def test_markitdown_llm() -> None:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
test_markitdown_remote()
|
# test_markitdown_remote()
|
||||||
test_markitdown_local()
|
# test_markitdown_local()
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
test_markitdown_deprecation()
|
# test_markitdown_deprecation()
|
||||||
test_markitdown_llm()
|
# test_markitdown_llm()
|
||||||
|
|||||||
Reference in New Issue
Block a user