From c5f70b904ffc0f895fc33dbcb16cc9e04aaa3301 Mon Sep 17 00:00:00 2001 From: afourney Date: Mon, 17 Mar 2025 07:39:19 -0700 Subject: [PATCH] Have magika read from the stream. (#1136) --- packages/markitdown/pyproject.toml | 2 +- packages/markitdown/src/markitdown/__about__.py | 2 +- packages/markitdown/src/markitdown/_markitdown.py | 10 ++++++---- packages/markitdown/tests/test_module_vectors.py | 10 ---------- 4 files changed, 8 insertions(+), 16 deletions(-) diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 0324ed4..bd38193 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -27,7 +27,7 @@ dependencies = [ "beautifulsoup4", "requests", "markdownify", - "magika>=0.6.1rc2", + "magika>=0.6.1rc3", "charset-normalizer", ] diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index 463e0c7..e54f3bc 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.0a3" +__version__ = "0.1.0a4" diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 2e9965a..78319eb 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -610,14 +610,16 @@ class MarkItDown: # Call magika to guess from the stream cur_pos = file_stream.tell() try: - stream_bytes = file_stream.read() - - result = self._magika.identify_bytes(stream_bytes) + result = self._magika.identify_stream(file_stream) if result.status == "ok" and result.prediction.output.label != "unknown": # If it's text, also guess the charset charset = None if result.prediction.output.is_text: - charset_result = charset_normalizer.from_bytes(stream_bytes).best() + # Read the first 4k to guess the charset + file_stream.seek(cur_pos) + stream_page = file_stream.read(4096) + charset_result = charset_normalizer.from_bytes(stream_page).best() + if charset_result is not None: charset = self._normalize_charset(charset_result.encoding) diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py index 873be75..9afffa5 100644 --- a/packages/markitdown/tests/test_module_vectors.py +++ b/packages/markitdown/tests/test_module_vectors.py @@ -47,7 +47,6 @@ def test_guess_stream_info(test_vector): # mimetype or extension, so we'll special-case them here. if test_vector.filename in [ "test_outlook_msg.msg", - "test_mskanji.csv", # See: https://github.com/google/magika/issues/983 ]: return @@ -96,15 +95,6 @@ def test_convert_stream_without_hints(test_vector): """Test the conversion of a stream with no stream info.""" markitdown = MarkItDown() - # For some limited exceptions, we can't guarantee the exact - # mimetype or extension, so we'll special-case them here. - if test_vector.filename in [ - # This appears to be a subtle bug in magika. - # See: https://github.com/google/magika/issues/983 - "test_mskanji.csv", - ]: - return - with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: result = markitdown.convert(stream, url=test_vector.url) for string in test_vector.must_include: