Have magika read from the stream. (#1136)
This commit is contained in:
@@ -27,7 +27,7 @@ dependencies = [
|
||||
"beautifulsoup4",
|
||||
"requests",
|
||||
"markdownify",
|
||||
"magika>=0.6.1rc2",
|
||||
"magika>=0.6.1rc3",
|
||||
"charset-normalizer",
|
||||
]
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.0a3"
|
||||
__version__ = "0.1.0a4"
|
||||
|
||||
@@ -610,14 +610,16 @@ class MarkItDown:
|
||||
# Call magika to guess from the stream
|
||||
cur_pos = file_stream.tell()
|
||||
try:
|
||||
stream_bytes = file_stream.read()
|
||||
|
||||
result = self._magika.identify_bytes(stream_bytes)
|
||||
result = self._magika.identify_stream(file_stream)
|
||||
if result.status == "ok" and result.prediction.output.label != "unknown":
|
||||
# If it's text, also guess the charset
|
||||
charset = None
|
||||
if result.prediction.output.is_text:
|
||||
charset_result = charset_normalizer.from_bytes(stream_bytes).best()
|
||||
# Read the first 4k to guess the charset
|
||||
file_stream.seek(cur_pos)
|
||||
stream_page = file_stream.read(4096)
|
||||
charset_result = charset_normalizer.from_bytes(stream_page).best()
|
||||
|
||||
if charset_result is not None:
|
||||
charset = self._normalize_charset(charset_result.encoding)
|
||||
|
||||
|
||||
@@ -47,7 +47,6 @@ def test_guess_stream_info(test_vector):
|
||||
# mimetype or extension, so we'll special-case them here.
|
||||
if test_vector.filename in [
|
||||
"test_outlook_msg.msg",
|
||||
"test_mskanji.csv", # See: https://github.com/google/magika/issues/983
|
||||
]:
|
||||
return
|
||||
|
||||
@@ -96,15 +95,6 @@ def test_convert_stream_without_hints(test_vector):
|
||||
"""Test the conversion of a stream with no stream info."""
|
||||
markitdown = MarkItDown()
|
||||
|
||||
# For some limited exceptions, we can't guarantee the exact
|
||||
# mimetype or extension, so we'll special-case them here.
|
||||
if test_vector.filename in [
|
||||
# This appears to be a subtle bug in magika.
|
||||
# See: https://github.com/google/magika/issues/983
|
||||
"test_mskanji.csv",
|
||||
]:
|
||||
return
|
||||
|
||||
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
||||
result = markitdown.convert(stream, url=test_vector.url)
|
||||
for string in test_vector.must_include:
|
||||
|
||||
Reference in New Issue
Block a user