diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index 91bf07b..5e345b6 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.0a1" +__version__ = "0.1.0a2" diff --git a/packages/markitdown/src/markitdown/_stream_info.py b/packages/markitdown/src/markitdown/_stream_info.py index 1eaa4d2..af98da0 100644 --- a/packages/markitdown/src/markitdown/_stream_info.py +++ b/packages/markitdown/src/markitdown/_stream_info.py @@ -1,8 +1,9 @@ import puremagic import mimetypes +import zipfile import os from dataclasses import dataclass, asdict -from typing import Optional, BinaryIO, List, TypeVar, Type +from typing import Optional, BinaryIO, List, Union # Mimetype substitutions table MIMETYPE_SUBSTITUTIONS = { @@ -74,6 +75,20 @@ def _guess_stream_info_from_stream( ) ) + # If it looks like a zip use _guess_stream_info_from_zip rather than puremagic + cur_pos = file_stream.tell() + try: + header = file_stream.read(4) + file_stream.seek(cur_pos) + if header == b"PK\x03\x04": + zip_guess = _guess_stream_info_from_zip(file_stream) + if zip_guess: + guesses.append(zip_guess) + return guesses + finally: + file_stream.seek(cur_pos) + + # Fall back to using puremagic def _puremagic( file_stream, filename_hint ) -> List[puremagic.main.PureMagicWithConfidence]: @@ -120,3 +135,74 @@ def _guess_stream_info_from_stream( guesses.append(StreamInfo(**kwargs)) return guesses + + +def _guess_stream_info_from_zip(file_stream: BinaryIO) -> Union[None, StreamInfo]: + """ + Guess StreamInfo properties (mostly mimetype and extension) from a zip stream. + + Args: + - stream: The stream to guess the StreamInfo from. + + Returns the single best guess, or None if no guess could be made. + """ + + cur_pos = file_stream.tell() + try: + with zipfile.ZipFile(file_stream) as z: + table_of_contents = z.namelist() + + # OpenPackageFormat (OPF) file + if "[Content_Types].xml" in table_of_contents: + # Word file + if "word/document.xml" in table_of_contents: + return StreamInfo( + mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + extension=".docx", + ) + + # Excel file + if "xl/workbook.xml" in table_of_contents: + return StreamInfo( + mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + extension=".xlsx", + ) + + # PowerPoint file + if "ppt/presentation.xml" in table_of_contents: + return StreamInfo( + mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation", + extension=".pptx", + ) + + # Visio file + if "visio/document.xml" in table_of_contents: + return StreamInfo( + mimetype="application/vnd.ms-visio.drawing", + extension=".vsd", + ) + + # XPS file + if "FixedDocSeq.fdseq" in table_of_contents: + return StreamInfo( + mimetype="application/vnd.ms-xpsdocument", + extension=".xps", + ) + + # EPUB, or similar + if "mimetype" in table_of_contents: + _mimetype = z.read("mimetype").decode("ascii").strip() + _extension = mimetypes.guess_extension(_mimetype) + return StreamInfo(mimetype=_mimetype, extension=_extension) + + # JAR + if "META-INF/MANIFEST.MF" in table_of_contents: + return StreamInfo(mimetype="application/java-archive", extension=".jar") + + # If we made it this far, we couldn't identify the file + return StreamInfo(mimetype="application/zip", extension=".zip") + + except zipfile.BadZipFile: + return None + finally: + file_stream.seek(cur_pos)