Have magika read from the stream. (#1136)
This commit is contained in:
@@ -610,14 +610,16 @@ class MarkItDown:
|
||||
# Call magika to guess from the stream
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
stream_bytes = file_stream.read()
|
||||
|
||||
result = self._magika.identify_bytes(stream_bytes)
|
||||
result = self._magika.identify_stream(file_stream)
|
||||
if result.status == "ok" and result.prediction.output.label != "unknown":
|
||||
# If it's text, also guess the charset
|
||||
charset = None
|
||||
if result.prediction.output.is_text:
|
||||
charset_result = charset_normalizer.from_bytes(stream_bytes).best()
|
||||
# Read the first 4k to guess the charset
|
||||
file_stream.seek(cur_pos)
|
||||
stream_page = file_stream.read(4096)
|
||||
charset_result = charset_normalizer.from_bytes(stream_page).best()
|
||||
|
||||
if charset_result is not None:
|
||||
charset = self._normalize_charset(charset_result.encoding)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user