Print and log better exceptions when file conversions fail. (#1080)
* Print and log better exceptions when file conversions fail. * Added unit tests for exceptions.
This commit is contained in:
@@ -7,6 +7,7 @@ from ._markitdown import MarkItDown
|
|||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
MarkItDownException,
|
MarkItDownException,
|
||||||
ConverterPrerequisiteException,
|
ConverterPrerequisiteException,
|
||||||
|
FailedConversionAttempt,
|
||||||
FileConversionException,
|
FileConversionException,
|
||||||
UnsupportedFormatException,
|
UnsupportedFormatException,
|
||||||
)
|
)
|
||||||
@@ -19,6 +20,7 @@ __all__ = [
|
|||||||
"DocumentConverterResult",
|
"DocumentConverterResult",
|
||||||
"MarkItDownException",
|
"MarkItDownException",
|
||||||
"ConverterPrerequisiteException",
|
"ConverterPrerequisiteException",
|
||||||
|
"FailedConversionAttempt",
|
||||||
"FileConversionException",
|
"FileConversionException",
|
||||||
"UnsupportedFormatException",
|
"UnsupportedFormatException",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,3 +1,6 @@
|
|||||||
|
from typing import Optional, List, Any
|
||||||
|
|
||||||
|
|
||||||
class MarkItDownException(BaseException):
|
class MarkItDownException(BaseException):
|
||||||
"""
|
"""
|
||||||
Base exception class for MarkItDown.
|
Base exception class for MarkItDown.
|
||||||
@@ -20,18 +23,43 @@ class ConverterPrerequisiteException(MarkItDownException):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class FileConversionException(MarkItDownException):
|
|
||||||
"""
|
|
||||||
Thrown when a suitable converter was found, but the conversion
|
|
||||||
process fails for any reason.
|
|
||||||
"""
|
|
||||||
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class UnsupportedFormatException(MarkItDownException):
|
class UnsupportedFormatException(MarkItDownException):
|
||||||
"""
|
"""
|
||||||
Thrown when no suitable converter was found for the given file.
|
Thrown when no suitable converter was found for the given file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class FailedConversionAttempt(object):
    """
    Represents a single failed attempt to convert a file.

    Instances are simple records: they pair the converter that was tried
    with the exception information captured when that converter raised.
    """

    def __init__(self, converter: Any, exc_info: Optional[tuple] = None):
        """
        Record one failed attempt.

        Args:
            converter: The converter object whose attempt failed.
            exc_info: The ``(type, value, traceback)`` triple as produced by
                ``sys.exc_info()``, or None when no exception details are
                available.
        """
        # The converter that was tried.
        self.converter = converter
        # Exception triple captured at the point of failure (may be None).
        self.exc_info = exc_info

    def __repr__(self) -> str:
        """Return a short, log-friendly description of the attempt."""
        # exc_info may be None, or a triple whose first item is None.
        exc_type = self.exc_info[0] if self.exc_info else None
        exc_name = getattr(exc_type, "__name__", None)
        return (
            f"{type(self).__name__}("
            f"converter={type(self.converter).__name__}, "
            f"exception={exc_name})"
        )
|
||||||
|
|
||||||
|
|
||||||
|
class FileConversionException(MarkItDownException):
    """
    Thrown when a suitable converter was found, but the conversion
    process fails for any reason.

    The ``attempts`` attribute holds the list of failed attempts that was
    passed to the constructor (or None if none was given).
    """

    def __init__(
        self,
        message: Optional[str] = None,
        attempts: Optional[List[FailedConversionAttempt]] = None,
    ):
        """
        Build the exception, generating a message when none is supplied.

        Args:
            message: Explicit message text. When None, a message is derived
                from ``attempts``.
            attempts: The failed conversion attempts, if any were recorded.
        """
        self.attempts = attempts

        if message is None:
            if attempts is None:
                message = "File conversion failed."
            else:
                parts = [f"File conversion failed after {len(attempts)} attempts:\n"]
                for attempt in attempts:
                    converter_name = type(attempt.converter).__name__
                    # exc_info is Optional, and sys.exc_info() yields
                    # (None, None, None) outside an except block; building
                    # the message must never raise in either case.
                    exc_info = attempt.exc_info
                    exc_type = exc_info[0] if exc_info else None
                    if exc_type is None:
                        parts.append(
                            f" - {converter_name} failed without exception details\n"
                        )
                    else:
                        parts.append(
                            f" - {converter_name} threw {exc_type.__name__} with message: {exc_info[1]}\n"
                        )
                message = "".join(parts)

        super().__init__(message)
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import copy
|
|||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
import traceback
|
import traceback
|
||||||
@@ -42,6 +43,7 @@ from ._exceptions import (
|
|||||||
FileConversionException,
|
FileConversionException,
|
||||||
UnsupportedFormatException,
|
UnsupportedFormatException,
|
||||||
ConverterPrerequisiteException,
|
ConverterPrerequisiteException,
|
||||||
|
FailedConversionAttempt,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Override mimetype for csv to fix issue on windows
|
# Override mimetype for csv to fix issue on windows
|
||||||
@@ -313,7 +315,9 @@ class MarkItDown:
|
|||||||
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
self, local_path: str, extensions: List[Union[str, None]], **kwargs
|
||||||
) -> DocumentConverterResult:
|
) -> DocumentConverterResult:
|
||||||
res: Union[None, DocumentConverterResult] = None
|
res: Union[None, DocumentConverterResult] = None
|
||||||
error_trace = ""
|
|
||||||
|
# Keep track of which converters throw exceptions
|
||||||
|
failed_attempts: List[FailedConversionAttempt] = []
|
||||||
|
|
||||||
# Create a copy of the page_converters list, sorted by priority.
|
# Create a copy of the page_converters list, sorted by priority.
|
||||||
# We do this with each call to _convert because the priority of converters may change between calls.
|
# We do this with each call to _convert because the priority of converters may change between calls.
|
||||||
@@ -351,7 +355,11 @@ class MarkItDown:
|
|||||||
try:
|
try:
|
||||||
res = converter.convert(local_path, **_kwargs)
|
res = converter.convert(local_path, **_kwargs)
|
||||||
except Exception:
|
except Exception:
|
||||||
error_trace = ("\n\n" + traceback.format_exc()).strip()
|
failed_attempts.append(
|
||||||
|
FailedConversionAttempt(
|
||||||
|
converter=converter, exc_info=sys.exc_info()
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if res is not None:
|
if res is not None:
|
||||||
# Normalize the content
|
# Normalize the content
|
||||||
@@ -364,14 +372,12 @@ class MarkItDown:
|
|||||||
return res
|
return res
|
||||||
|
|
||||||
# If we got this far without success, report any exceptions
|
# If we got this far without success, report any exceptions
|
||||||
if len(error_trace) > 0:
|
if len(failed_attempts) > 0:
|
||||||
raise FileConversionException(
|
raise FileConversionException(attempts=failed_attempts)
|
||||||
f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Nothing can handle it!
|
# Nothing can handle it!
|
||||||
raise UnsupportedFormatException(
|
raise UnsupportedFormatException(
|
||||||
f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
|
f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
|
||||||
)
|
)
|
||||||
|
|
||||||
def _append_ext(self, extensions, ext):
|
def _append_ext(self, extensions, ext):
|
||||||
|
|||||||
BIN
packages/markitdown/tests/test_files/random.bin
Normal file
BIN
packages/markitdown/tests/test_files/random.bin
Normal file
Binary file not shown.
@@ -8,7 +8,7 @@ import requests
|
|||||||
|
|
||||||
from warnings import catch_warnings, resetwarnings
|
from warnings import catch_warnings, resetwarnings
|
||||||
|
|
||||||
from markitdown import MarkItDown
|
from markitdown import MarkItDown, UnsupportedFormatException, FileConversionException
|
||||||
|
|
||||||
skip_remote = (
|
skip_remote = (
|
||||||
True if os.environ.get("GITHUB_ACTIONS") else False
|
True if os.environ.get("GITHUB_ACTIONS") else False
|
||||||
@@ -272,6 +272,21 @@ def test_markitdown_local() -> None:
|
|||||||
assert "# Test" in result.text_content
|
assert "# Test" in result.text_content
|
||||||
|
|
||||||
|
|
||||||
|
def test_exceptions() -> None:
    """Verify which exception type is raised for unsupported vs. failing input."""
    converter = MarkItDown()
    random_bin = os.path.join(TEST_FILES_DIR, "random.bin")

    # With no extension hint, the file should be reported as unsupported.
    with pytest.raises(UnsupportedFormatException):
        converter.convert(random_bin)

    # Forcing the .pptx extension on the same file should surface a
    # conversion failure instead.
    with pytest.raises(FileConversionException) as raised:
        converter.convert(random_bin, file_extension=".pptx")

    # Exactly one converter is expected to have tried and failed.
    attempts = raised.value.attempts
    assert len(attempts) == 1
    assert type(attempts[0].converter).__name__ == "PptxConverter"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
skip_exiftool,
|
skip_exiftool,
|
||||||
reason="do not run if exiftool is not installed",
|
reason="do not run if exiftool is not installed",
|
||||||
@@ -329,6 +344,7 @@ if __name__ == "__main__":
|
|||||||
"""Runs this file's tests from the command line."""
|
"""Runs this file's tests from the command line."""
|
||||||
test_markitdown_remote()
|
test_markitdown_remote()
|
||||||
test_markitdown_local()
|
test_markitdown_local()
|
||||||
|
test_exceptions()
|
||||||
test_markitdown_exiftool()
|
test_markitdown_exiftool()
|
||||||
# test_markitdown_llm()
|
# test_markitdown_llm()
|
||||||
print("All tests passed!")
|
print("All tests passed!")
|
||||||
|
|||||||
Reference in New Issue
Block a user