Enhance type guessing.
This commit is contained in:
@@ -569,6 +569,23 @@ class MarkItDown:
|
|||||||
"""
|
"""
|
||||||
guesses: List[StreamInfo] = []
|
guesses: List[StreamInfo] = []
|
||||||
|
|
||||||
|
# Enhance the base guess with information based on the extension or mimetype
|
||||||
|
enhanced_guess = base_guess.copy_and_update()
|
||||||
|
|
||||||
|
# If there's an extension and no mimetype, try to guess the mimetype
|
||||||
|
if base_guess.mimetype is None and base_guess.extension is not None:
|
||||||
|
_m, _ = mimetypes.guess_type(
|
||||||
|
"placeholder" + base_guess.extension, strict=False
|
||||||
|
)
|
||||||
|
if _m is not None:
|
||||||
|
enhanced_guess = enhanced_guess.copy_and_update(mimetype=_m)
|
||||||
|
|
||||||
|
# If there's a mimetype and no extension, try to guess the extension
|
||||||
|
if base_guess.mimetype is not None and base_guess.extension is None:
|
||||||
|
_e = mimetypes.guess_all_extensions(base_guess.mimetype, strict=False)
|
||||||
|
if len(_e) > 0:
|
||||||
|
enhanced_guess = enhanced_guess.copy_and_update(extension=_e[0])
|
||||||
|
|
||||||
# Call magika to guess from the stream
|
# Call magika to guess from the stream
|
||||||
cur_pos = file_stream.tell()
|
cur_pos = file_stream.tell()
|
||||||
try:
|
try:
|
||||||
@@ -624,7 +641,7 @@ class MarkItDown:
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# The magika guess was incompatible with the base guess, so add both guesses
|
# The magika guess was incompatible with the base guess, so add both guesses
|
||||||
guesses.append(base_guess)
|
guesses.append(enhanced_guess)
|
||||||
guesses.append(
|
guesses.append(
|
||||||
StreamInfo(
|
StreamInfo(
|
||||||
mimetype=result.prediction.output.mime_type,
|
mimetype=result.prediction.output.mime_type,
|
||||||
@@ -637,7 +654,7 @@ class MarkItDown:
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# There were no other guesses, so just add the base guess
|
# There were no other guesses, so just add the enhanced guess
|
||||||
guesses.append(base_guess)
|
guesses.append(enhanced_guess)
|
||||||
finally:
|
finally:
|
||||||
file_stream.seek(cur_pos)
|
file_stream.seek(cur_pos)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user