From c5f70b904ffc0f895fc33dbcb16cc9e04aaa3301 Mon Sep 17 00:00:00 2001
From: afourney <adamfo@microsoft.com>
Date: Mon, 17 Mar 2025 07:39:19 -0700
Subject: [PATCH] Have magika read from the stream. (#1136)

---
 packages/markitdown/pyproject.toml                |  2 +-
 packages/markitdown/src/markitdown/__about__.py   |  2 +-
 packages/markitdown/src/markitdown/_markitdown.py | 10 ++++++----
 packages/markitdown/tests/test_module_vectors.py  | 10 ----------
 4 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index 0324ed4..bd38193 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
   "beautifulsoup4",
   "requests",
   "markdownify",
-  "magika>=0.6.1rc2",
+  "magika>=0.6.1rc3",
   "charset-normalizer",
 ]
 
diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
index 463e0c7..e54f3bc 100644
--- a/packages/markitdown/src/markitdown/__about__.py
+++ b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.0a3"
+__version__ = "0.1.0a4"
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index 2e9965a..78319eb 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -610,14 +610,16 @@ class MarkItDown:
         # Call magika to guess from the stream
         cur_pos = file_stream.tell()
         try:
-            stream_bytes = file_stream.read()
-
-            result = self._magika.identify_bytes(stream_bytes)
+            result = self._magika.identify_stream(file_stream)
             if result.status == "ok" and result.prediction.output.label != "unknown":
                 # If it's text, also guess the charset
                 charset = None
                 if result.prediction.output.is_text:
-                    charset_result = charset_normalizer.from_bytes(stream_bytes).best()
+                    # Read the first 4k to guess the charset
+                    file_stream.seek(cur_pos)
+                    stream_page = file_stream.read(4096)
+                    charset_result = charset_normalizer.from_bytes(stream_page).best()
+
                     if charset_result is not None:
                         charset = self._normalize_charset(charset_result.encoding)
 
diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py
index 873be75..9afffa5 100644
--- a/packages/markitdown/tests/test_module_vectors.py
+++ b/packages/markitdown/tests/test_module_vectors.py
@@ -47,7 +47,6 @@ def test_guess_stream_info(test_vector):
         # mimetype or extension, so we'll special-case them here.
         if test_vector.filename in [
             "test_outlook_msg.msg",
-            "test_mskanji.csv",  # See: https://github.com/google/magika/issues/983
         ]:
             return
 
@@ -96,15 +95,6 @@ def test_convert_stream_without_hints(test_vector):
     """Test the conversion of a stream with no stream info."""
     markitdown = MarkItDown()
 
-    # For some limited exceptions, we can't guarantee the exact
-    # mimetype or extension, so we'll special-case them here.
-    if test_vector.filename in [
-        # This appears to be a subtle bug in magika.
-        # See: https://github.com/google/magika/issues/983
-        "test_mskanji.csv",
-    ]:
-        return
-
     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
         result = markitdown.convert(stream, url=test_vector.url)
         for string in test_vector.must_include: