Have magika read from the stream. (#1136)

This commit is contained in:
afourney
2025-03-17 07:39:19 -07:00
committed by GitHub
parent 53834fdd24
commit c5f70b904f
4 changed files with 8 additions and 16 deletions

View File

@@ -610,14 +610,16 @@ class MarkItDown:
# Call magika to guess from the stream
cur_pos = file_stream.tell()
try:
-stream_bytes = file_stream.read()
-result = self._magika.identify_bytes(stream_bytes)
+result = self._magika.identify_stream(file_stream)
if result.status == "ok" and result.prediction.output.label != "unknown":
# If it's text, also guess the charset
charset = None
if result.prediction.output.is_text:
-charset_result = charset_normalizer.from_bytes(stream_bytes).best()
+# Read the first 4k to guess the charset
+file_stream.seek(cur_pos)
+stream_page = file_stream.read(4096)
+charset_result = charset_normalizer.from_bytes(stream_page).best()
if charset_result is not None:
charset = self._normalize_charset(charset_result.encoding)