1 Commits

Author SHA1 Message Date
Adam Fourney
f17bc21c9d If files use zip packaging, be smarter about inspecting their types. 2025-03-07 23:06:56 -08:00
2 changed files with 88 additions and 2 deletions

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.0a1"
__version__ = "0.1.0a2"

View File

@@ -1,8 +1,9 @@
import puremagic
import mimetypes
import zipfile
import os
from dataclasses import dataclass, asdict
from typing import Optional, BinaryIO, List, TypeVar, Type
from typing import Optional, BinaryIO, List, Union
# Mimetype substitutions table
MIMETYPE_SUBSTITUTIONS = {
@@ -74,6 +75,20 @@ def _guess_stream_info_from_stream(
)
)
# If it looks like a zip use _guess_stream_info_from_zip rather than puremagic
cur_pos = file_stream.tell()
try:
header = file_stream.read(4)
file_stream.seek(cur_pos)
if header == b"PK\x03\x04":
zip_guess = _guess_stream_info_from_zip(file_stream)
if zip_guess:
guesses.append(zip_guess)
return guesses
finally:
file_stream.seek(cur_pos)
# Fall back to using puremagic
def _puremagic(
file_stream, filename_hint
) -> List[puremagic.main.PureMagicWithConfidence]:
@@ -120,3 +135,74 @@ def _guess_stream_info_from_stream(
guesses.append(StreamInfo(**kwargs))
return guesses
def _guess_stream_info_from_zip(file_stream: BinaryIO) -> Union[None, StreamInfo]:
"""
Guess StreamInfo properties (mostly mimetype and extension) from a zip stream.
Args:
- stream: The stream to guess the StreamInfo from.
Returns the single best guess, or None if no guess could be made.
"""
cur_pos = file_stream.tell()
try:
with zipfile.ZipFile(file_stream) as z:
table_of_contents = z.namelist()
# OpenPackageFormat (OPF) file
if "[Content_Types].xml" in table_of_contents:
# Word file
if "word/document.xml" in table_of_contents:
return StreamInfo(
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
extension=".docx",
)
# Excel file
if "xl/workbook.xml" in table_of_contents:
return StreamInfo(
mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
extension=".xlsx",
)
# PowerPoint file
if "ppt/presentation.xml" in table_of_contents:
return StreamInfo(
mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
extension=".pptx",
)
# Visio file
if "visio/document.xml" in table_of_contents:
return StreamInfo(
mimetype="application/vnd.ms-visio.drawing",
extension=".vsd",
)
# XPS file
if "FixedDocSeq.fdseq" in table_of_contents:
return StreamInfo(
mimetype="application/vnd.ms-xpsdocument",
extension=".xps",
)
# EPUB, or similar
if "mimetype" in table_of_contents:
_mimetype = z.read("mimetype").decode("ascii").strip()
_extension = mimetypes.guess_extension(_mimetype)
return StreamInfo(mimetype=_mimetype, extension=_extension)
# JAR
if "META-INF/MANIFEST.MF" in table_of_contents:
return StreamInfo(mimetype="application/java-archive", extension=".jar")
# If we made it this far, we couldn't identify the file
return StreamInfo(mimetype="application/zip", extension=".zip")
except zipfile.BadZipFile:
return None
finally:
file_stream.seek(cur_pos)