Have magika read from the stream. (#1136)
This commit is contained in:
@@ -27,7 +27,7 @@ dependencies = [
|
|||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
"requests",
|
"requests",
|
||||||
"markdownify",
|
"markdownify",
|
||||||
"magika>=0.6.1rc2",
|
"magika>=0.6.1rc3",
|
||||||
"charset-normalizer",
|
"charset-normalizer",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
__version__ = "0.1.0a3"
|
__version__ = "0.1.0a4"
|
||||||
|
|||||||
@@ -610,14 +610,16 @@ class MarkItDown:
|
|||||||
# Call magika to guess from the stream
|
# Call magika to guess from the stream
|
||||||
cur_pos = file_stream.tell()
|
cur_pos = file_stream.tell()
|
||||||
try:
|
try:
|
||||||
stream_bytes = file_stream.read()
|
result = self._magika.identify_stream(file_stream)
|
||||||
|
|
||||||
result = self._magika.identify_bytes(stream_bytes)
|
|
||||||
if result.status == "ok" and result.prediction.output.label != "unknown":
|
if result.status == "ok" and result.prediction.output.label != "unknown":
|
||||||
# If it's text, also guess the charset
|
# If it's text, also guess the charset
|
||||||
charset = None
|
charset = None
|
||||||
if result.prediction.output.is_text:
|
if result.prediction.output.is_text:
|
||||||
charset_result = charset_normalizer.from_bytes(stream_bytes).best()
|
# Read the first 4k to guess the charset
|
||||||
|
file_stream.seek(cur_pos)
|
||||||
|
stream_page = file_stream.read(4096)
|
||||||
|
charset_result = charset_normalizer.from_bytes(stream_page).best()
|
||||||
|
|
||||||
if charset_result is not None:
|
if charset_result is not None:
|
||||||
charset = self._normalize_charset(charset_result.encoding)
|
charset = self._normalize_charset(charset_result.encoding)
|
||||||
|
|
||||||
|
|||||||
@@ -47,7 +47,6 @@ def test_guess_stream_info(test_vector):
|
|||||||
# mimetype or extension, so we'll special-case them here.
|
# mimetype or extension, so we'll special-case them here.
|
||||||
if test_vector.filename in [
|
if test_vector.filename in [
|
||||||
"test_outlook_msg.msg",
|
"test_outlook_msg.msg",
|
||||||
"test_mskanji.csv", # See: https://github.com/google/magika/issues/983
|
|
||||||
]:
|
]:
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -96,15 +95,6 @@ def test_convert_stream_without_hints(test_vector):
|
|||||||
"""Test the conversion of a stream with no stream info."""
|
"""Test the conversion of a stream with no stream info."""
|
||||||
markitdown = MarkItDown()
|
markitdown = MarkItDown()
|
||||||
|
|
||||||
# For some limited exceptions, we can't guarantee the exact
|
|
||||||
# mimetype or extension, so we'll special-case them here.
|
|
||||||
if test_vector.filename in [
|
|
||||||
# This appears to be a subtle bug in magika.
|
|
||||||
# See: https://github.com/google/magika/issues/983
|
|
||||||
"test_mskanji.csv",
|
|
||||||
]:
|
|
||||||
return
|
|
||||||
|
|
||||||
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
|
||||||
result = markitdown.convert(stream, url=test_vector.url)
|
result = markitdown.convert(stream, url=test_vector.url)
|
||||||
for string in test_vector.must_include:
|
for string in test_vector.must_include:
|
||||||
|
|||||||
Reference in New Issue
Block a user