Have magika read from the stream. (#1136)

This commit is contained in:
afourney
2025-03-17 07:39:19 -07:00
committed by GitHub
parent 53834fdd24
commit c5f70b904f
4 changed files with 8 additions and 16 deletions

View File

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4",
"requests",
"markdownify",
-"magika>=0.6.1rc2",
+"magika>=0.6.1rc3",
"charset-normalizer",
]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
-__version__ = "0.1.0a3"
+__version__ = "0.1.0a4"

View File

@@ -610,14 +610,16 @@ class MarkItDown:
# Call magika to guess from the stream
cur_pos = file_stream.tell()
try:
-stream_bytes = file_stream.read()
-result = self._magika.identify_bytes(stream_bytes)
+result = self._magika.identify_stream(file_stream)
if result.status == "ok" and result.prediction.output.label != "unknown":
# If it's text, also guess the charset
charset = None
if result.prediction.output.is_text:
-charset_result = charset_normalizer.from_bytes(stream_bytes).best()
+# Read the first 4k to guess the charset
+file_stream.seek(cur_pos)
+stream_page = file_stream.read(4096)
+charset_result = charset_normalizer.from_bytes(stream_page).best()
if charset_result is not None:
charset = self._normalize_charset(charset_result.encoding)

View File

@@ -47,7 +47,6 @@ def test_guess_stream_info(test_vector):
# mimetype or extension, so we'll special-case them here.
if test_vector.filename in [
"test_outlook_msg.msg",
-"test_mskanji.csv", # See: https://github.com/google/magika/issues/983
]:
return
@@ -96,15 +95,6 @@ def test_convert_stream_without_hints(test_vector):
"""Test the conversion of a stream with no stream info."""
markitdown = MarkItDown()
-# For some limited exceptions, we can't guarantee the exact
-# mimetype or extension, so we'll special-case them here.
-if test_vector.filename in [
-# This appears to be a subtle bug in magika.
-# See: https://github.com/google/magika/issues/983
-"test_mskanji.csv",
-]:
-return
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(stream, url=test_vector.url)
for string in test_vector.must_include: