Fix: keep trying after a conversion error.

This commit is contained in:
Adam Fourney
2024-11-14 10:23:40 -08:00
parent 997c7af53c
commit 2eab564c4c

View File

@@ -11,6 +11,7 @@ import shutil
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
import traceback
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
@@ -913,7 +914,9 @@ class MarkItDown:
# Get extension alternatives from the path and puremagic # Get extension alternatives from the path and puremagic
base, ext = os.path.splitext(path) base, ext = os.path.splitext(path)
self._append_ext(extensions, ext) self._append_ext(extensions, ext)
self._append_ext(extensions, self._guess_ext_magic(path))
for g in self._guess_ext_magic(path):
self._append_ext(extensions, g)
# Convert # Convert
return self._convert(path, extensions, **kwargs) return self._convert(path, extensions, **kwargs)
@@ -940,7 +943,8 @@ class MarkItDown:
fh.close() fh.close()
# Use puremagic to check for more extension options # Use puremagic to check for more extension options
self._append_ext(extensions, self._guess_ext_magic(temp_path)) for g in self._guess_ext_magic(temp_path):
self._append_ext(extensions, g)
# Convert # Convert
result = self._convert(temp_path, extensions, **kwargs) result = self._convert(temp_path, extensions, **kwargs)
@@ -1032,10 +1036,10 @@ class MarkItDown:
_kwargs["mlm_model"] = self._mlm_model _kwargs["mlm_model"] = self._mlm_model
# If we hit an error log it and keep trying # If we hit an error log it and keep trying
# try: try:
res = converter.convert(local_path, **_kwargs) res = converter.convert(local_path, **_kwargs)
# except Exception: except Exception:
# error_trace = ("\n\n" + traceback.format_exc()).strip() error_trace = ("\n\n" + traceback.format_exc()).strip()
if res is not None: if res is not None:
# Normalize the content # Normalize the content
@@ -1074,10 +1078,15 @@ class MarkItDown:
# Use puremagic to guess # Use puremagic to guess
try: try:
guesses = puremagic.magic_file(path) guesses = puremagic.magic_file(path)
if len(guesses) > 0: extensions = list()
ext = guesses[0].extension.strip() for g in guesses:
ext = g.extension.strip()
if len(ext) > 0: if len(ext) > 0:
return ext if not ext.startswith("."):
ext = "." + ext
if ext not in extensions:
extensions.append(ext)
return extensions
except FileNotFoundError: except FileNotFoundError:
pass pass
except IsADirectoryError: except IsADirectoryError: