If puremagic has no guesses, try again after ltrim. (#260)
This commit is contained in:
@@ -1594,6 +1594,25 @@ class MarkItDown:
|
|||||||
# Use puremagic to guess
|
# Use puremagic to guess
|
||||||
try:
|
try:
|
||||||
guesses = puremagic.magic_file(path)
|
guesses = puremagic.magic_file(path)
|
||||||
|
|
||||||
|
# Fix for: https://github.com/microsoft/markitdown/issues/222
|
||||||
|
# If there are no guesses, then try again after trimming leading ASCII whitespace.
|
||||||
|
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
|
||||||
|
# (space, tab, newline, carriage return, vertical tab, form feed).
|
||||||
|
if len(guesses) == 0:
|
||||||
|
with open(path, "rb") as file:
|
||||||
|
while True:
|
||||||
|
char = file.read(1)
|
||||||
|
if not char: # End of file
|
||||||
|
break
|
||||||
|
if not char.isspace():
|
||||||
|
file.seek(file.tell() - 1)
|
||||||
|
break
|
||||||
|
try:
|
||||||
|
guesses = puremagic.magic_stream(file)
|
||||||
|
except puremagic.main.PureError:
|
||||||
|
pass
|
||||||
|
|
||||||
extensions = list()
|
extensions = list()
|
||||||
for g in guesses:
|
for g in guesses:
|
||||||
ext = g.extension.strip()
|
ext = g.extension.strip()
|
||||||
|
|||||||
@@ -259,7 +259,7 @@ def test_markitdown_local() -> None:
|
|||||||
|
|
||||||
# Test input with leading blank characters
|
# Test input with leading blank characters
|
||||||
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
|
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
|
||||||
result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html")
|
result = markitdown.convert_stream(io.BytesIO(input_data))
|
||||||
assert "# Test" in result.text_content
|
assert "# Test" in result.text_content
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user