If puremagic has no guesses, try again after ltrim. (#260)

This commit is contained in:
afourney
2025-01-03 16:03:11 -08:00
committed by GitHub
parent 731b39e7f5
commit 436407288f
2 changed files with 20 additions and 1 deletion

View File

@@ -1594,6 +1594,25 @@ class MarkItDown:
# Use puremagic to guess
try:
guesses = puremagic.magic_file(path)
# Fix for: https://github.com/microsoft/markitdown/issues/222
# If there are no guesses, then try again after trimming leading ASCII whitespace.
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).
if len(guesses) == 0:
with open(path, "rb") as file:
while True:
char = file.read(1)
if not char: # End of file
break
if not char.isspace():
file.seek(file.tell() - 1)
break
try:
guesses = puremagic.magic_stream(file)
except puremagic.main.PureError:
pass
extensions = list()
for g in guesses:
ext = g.extension.strip()

View File

@@ -259,7 +259,7 @@ def test_markitdown_local() -> None:
# Test input with leading blank characters
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html")
result = markitdown.convert_stream(io.BytesIO(input_data))
assert "# Test" in result.text_content