diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index d0f515e..1421852 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -27,8 +27,7 @@ dependencies = [ "beautifulsoup4", "requests", "markdownify", - "puremagic", - "pathvalidate", + "magika>=0.6.0rc1", "charset-normalizer", ] diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 079b65a..6a6957d 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -14,9 +14,6 @@ from typing import Any, List, Optional, Union, BinaryIO from pathlib import Path from urllib.parse import urlparse from warnings import warn - -# File-format detection -import puremagic import requests from ._stream_info import StreamInfo, _guess_stream_info_from_stream diff --git a/packages/markitdown/src/markitdown/_stream_info.py b/packages/markitdown/src/markitdown/_stream_info.py index 1eaa4d2..12c50f1 100644 --- a/packages/markitdown/src/markitdown/_stream_info.py +++ b/packages/markitdown/src/markitdown/_stream_info.py @@ -1,14 +1,10 @@ -import puremagic import mimetypes import os from dataclasses import dataclass, asdict from typing import Optional, BinaryIO, List, TypeVar, Type +from magika import Magika -# Mimetype substitutions table -MIMETYPE_SUBSTITUTIONS = { - "application/excel": "application/vnd.ms-excel", - "application/mspowerpoint": "application/vnd.ms-powerpoint", -} +magika = Magika() @dataclass(kw_only=True, frozen=True) @@ -59,6 +55,25 @@ def _guess_stream_info_from_stream( """ guesses: List[StreamInfo] = [] + # Call magika to guess from the stream + cur_pos = file_stream.tell() + try: + result = magika.identify_bytes(file_stream.read()) + if result.status == "ok" and result.prediction.output.label != "unknown": + extension = None + if len(result.prediction.output.extensions) > 0: + extension = result.prediction.output.extensions[0] + if extension and not extension.startswith("."): + extension = "." + extension + guesses.append( + StreamInfo( + mimetype=result.prediction.output.mime_type, + extension=extension, + ) + ) + finally: + file_stream.seek(cur_pos) + # Add a guess purely based on the filename hint if filename_hint: try: @@ -74,49 +89,4 @@ def _guess_stream_info_from_stream( ) ) - def _puremagic( - file_stream, filename_hint - ) -> List[puremagic.main.PureMagicWithConfidence]: - """Wrap guesses to handle exceptions.""" - try: - return puremagic.magic_stream(file_stream, filename=filename_hint) - except puremagic.main.PureError as e: - return [] - - cur_pos = file_stream.tell() - type_guesses = _puremagic(file_stream, filename_hint=filename_hint) - if len(type_guesses) == 0: - # Fix for: https://github.com/microsoft/markitdown/issues/222 - # If there are no guesses, then try again after trimming leading ASCII whitespaces. - # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f' - # (space, tab, newline, carriage return, vertical tab, form feed). - - # Eat all the leading whitespace - file_stream.seek(cur_pos) - while True: - char = file_stream.read(1) - if not char: # End of file - break - if not char.isspace(): - file_stream.seek(file_stream.tell() - 1) - break - - # Try again - type_guesses = _puremagic(file_stream, filename_hint=filename_hint) - file_stream.seek(cur_pos) - - # Convert and return the guesses - for guess in type_guesses: - kwargs: dict[str, str] = {} - if guess.extension: - kwargs["extension"] = guess.extension - if guess.mime_type: - kwargs["mimetype"] = MIMETYPE_SUBSTITUTIONS.get( - guess.mime_type, guess.mime_type - ) - if len(kwargs) > 0: - # We don't add the filename_hint, because sometimes it's just a placeholder, - # and, in any case, doesn't add new information. - guesses.append(StreamInfo(**kwargs)) - return guesses