Print and log better exceptions when file conversions fail. (#1080)

* Print and log better exceptions when file conversions fail.
* Added unit tests for exceptions.
This commit is contained in:
afourney
2025-02-28 16:07:47 -08:00
committed by GitHub
parent 9182923375
commit 43bd79adc9
5 changed files with 69 additions and 17 deletions

View File

@@ -7,6 +7,7 @@ from ._markitdown import MarkItDown
from ._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FailedConversionAttempt,
FileConversionException,
UnsupportedFormatException,
)
@@ -19,6 +20,7 @@ __all__ = [
"DocumentConverterResult",
"MarkItDownException",
"ConverterPrerequisiteException",
"FailedConversionAttempt",
"FileConversionException",
"UnsupportedFormatException",
]

View File

@@ -1,3 +1,6 @@
from typing import Optional, List, Any
class MarkItDownException(BaseException):
"""
Base exception class for MarkItDown.
@@ -20,18 +23,43 @@ class ConverterPrerequisiteException(MarkItDownException):
pass
class FileConversionException(MarkItDownException):
    """
    Raised when a converter suitable for the file exists, but the
    conversion itself fails, whatever the cause.
    """
class UnsupportedFormatException(MarkItDownException):
    """
    Raised when no converter suitable for the given file could be found.
    """
class FailedConversionAttempt:
    """
    Record of a single attempt to convert a file.

    Attributes:
        converter: The converter object that was tried.
        exc_info: The exception information captured for the attempt,
            or None when none was supplied.
    """

    def __init__(self, converter: Any, exc_info: Optional[tuple] = None):
        # Store both values verbatim; no validation or copying is performed.
        self.converter, self.exc_info = converter, exc_info
class FileConversionException(MarkItDownException):
    """
    Thrown when a suitable converter was found, but the conversion
    process fails for any reason.

    Args:
        message: Explicit error message. When omitted, a message is
            generated from ``attempts`` (or a generic one is used when
            ``attempts`` is also omitted).
        attempts: The failed conversion attempts that led to this error.
            Stored unchanged on ``self.attempts``.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        attempts: Optional[List[FailedConversionAttempt]] = None,
    ):
        self.attempts = attempts

        if message is None:
            if attempts is None:
                message = "File conversion failed."
            else:
                lines = [f"File conversion failed after {len(attempts)} attempts:"]
                for attempt in attempts:
                    converter_name = type(attempt.converter).__name__
                    exc_info = attempt.exc_info
                    # exc_info is optional on FailedConversionAttempt, and
                    # sys.exc_info() yields (None, None, None) when no
                    # exception is being handled. Building the message must
                    # never raise, or it would mask the conversion failure.
                    if exc_info is None or exc_info[0] is None:
                        lines.append(
                            f" - {converter_name} failed, but no exception details were recorded."
                        )
                    else:
                        lines.append(
                            f" - {converter_name} threw {exc_info[0].__name__} with message: {exc_info[1]}"
                        )
                # Every line, including the last, ends with a newline.
                message = "\n".join(lines) + "\n"

        super().__init__(message)

View File

@@ -2,6 +2,7 @@ import copy
import mimetypes
import os
import re
import sys
import tempfile
import warnings
import traceback
@@ -42,6 +43,7 @@ from ._exceptions import (
FileConversionException,
UnsupportedFormatException,
ConverterPrerequisiteException,
FailedConversionAttempt,
)
# Override mimetype for csv to fix issue on windows
@@ -313,7 +315,9 @@ class MarkItDown:
self, local_path: str, extensions: List[Union[str, None]], **kwargs
) -> DocumentConverterResult:
res: Union[None, DocumentConverterResult] = None
error_trace = ""
# Keep track of which converters throw exceptions
failed_attempts: List[FailedConversionAttempt] = []
# Create a copy of the page_converters list, sorted by priority.
# We do this with each call to _convert because the priority of converters may change between calls.
@@ -351,7 +355,11 @@ class MarkItDown:
try:
res = converter.convert(local_path, **_kwargs)
except Exception:
error_trace = ("\n\n" + traceback.format_exc()).strip()
failed_attempts.append(
FailedConversionAttempt(
converter=converter, exc_info=sys.exc_info()
)
)
if res is not None:
# Normalize the content
@@ -364,14 +372,12 @@ class MarkItDown:
return res
# If we got this far without success, report any exceptions
if len(error_trace) > 0:
raise FileConversionException(
f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
)
if len(failed_attempts) > 0:
raise FileConversionException(attempts=failed_attempts)
# Nothing can handle it!
raise UnsupportedFormatException(
f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
f"Could not convert '{local_path}' to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
)
def _append_ext(self, extensions, ext):

Binary file not shown.

View File

@@ -8,7 +8,7 @@ import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown
from markitdown import MarkItDown, UnsupportedFormatException, FileConversionException
skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False
@@ -272,6 +272,21 @@ def test_markitdown_local() -> None:
assert "# Test" in result.text_content
def test_exceptions() -> None:
    """Check which MarkItDown exception is raised when a conversion cannot succeed."""
    converter = MarkItDown()
    garbage_path = os.path.join(TEST_FILES_DIR, "random.bin")

    # With no hint about the file type, the format is reported as unsupported.
    with pytest.raises(UnsupportedFormatException):
        converter.convert(garbage_path)

    # Presenting the same file as a .pptx makes a converter attempt it and
    # fail, which must surface as a FileConversionException.
    with pytest.raises(FileConversionException) as raised:
        converter.convert(garbage_path, file_extension=".pptx")

    # Exactly one converter should have tried, and it should be the PPTX one.
    failures = raised.value.attempts
    assert len(failures) == 1
    assert type(failures[0].converter).__name__ == "PptxConverter"
@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
@@ -329,6 +344,7 @@ if __name__ == "__main__":
"""Runs this file's tests from the command line."""
test_markitdown_remote()
test_markitdown_local()
test_exceptions()
test_markitdown_exiftool()
# test_markitdown_llm()
print("All tests passed!")