Have magika read directly from the stream instead of from a fully buffered copy of its bytes. (#1136)

This commit is contained in:
afourney
2025-03-17 07:39:19 -07:00
committed by GitHub
parent 53834fdd24
commit c5f70b904f
4 changed files with 8 additions and 16 deletions

View File

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4", "beautifulsoup4",
"requests", "requests",
"markdownify", "markdownify",
"magika>=0.6.1rc2", "magika>=0.6.1rc3",
"charset-normalizer", "charset-normalizer",
] ]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.1.0a3" __version__ = "0.1.0a4"

View File

@@ -610,14 +610,16 @@ class MarkItDown:
# Call magika to guess from the stream # Call magika to guess from the stream
cur_pos = file_stream.tell() cur_pos = file_stream.tell()
try: try:
stream_bytes = file_stream.read() result = self._magika.identify_stream(file_stream)
result = self._magika.identify_bytes(stream_bytes)
if result.status == "ok" and result.prediction.output.label != "unknown": if result.status == "ok" and result.prediction.output.label != "unknown":
# If it's text, also guess the charset # If it's text, also guess the charset
charset = None charset = None
if result.prediction.output.is_text: if result.prediction.output.is_text:
charset_result = charset_normalizer.from_bytes(stream_bytes).best() # Read the first 4k to guess the charset
file_stream.seek(cur_pos)
stream_page = file_stream.read(4096)
charset_result = charset_normalizer.from_bytes(stream_page).best()
if charset_result is not None: if charset_result is not None:
charset = self._normalize_charset(charset_result.encoding) charset = self._normalize_charset(charset_result.encoding)

View File

@@ -47,7 +47,6 @@ def test_guess_stream_info(test_vector):
# mimetype or extension, so we'll special-case them here. # mimetype or extension, so we'll special-case them here.
if test_vector.filename in [ if test_vector.filename in [
"test_outlook_msg.msg", "test_outlook_msg.msg",
"test_mskanji.csv", # See: https://github.com/google/magika/issues/983
]: ]:
return return
@@ -96,15 +95,6 @@ def test_convert_stream_without_hints(test_vector):
"""Test the conversion of a stream with no stream info.""" """Test the conversion of a stream with no stream info."""
markitdown = MarkItDown() markitdown = MarkItDown()
# For some limited exceptions, we can't guarantee the exact
# mimetype or extension, so we'll special-case them here.
if test_vector.filename in [
# This appears to be a subtle bug in magika.
# See: https://github.com/google/magika/issues/983
"test_mskanji.csv",
]:
return
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(stream, url=test_vector.url) result = markitdown.convert(stream, url=test_vector.url)
for string in test_vector.must_include: for string in test_vector.must_include: