From 9182923375fb84892d88acaf3ca9d361bce53b0b Mon Sep 17 00:00:00 2001 From: afourney Date: Fri, 28 Feb 2025 09:54:19 -0800 Subject: [PATCH] Don't have ZipConverter accept OOXML files. This will never yield a good result. (#1078) --- .../markitdown/src/markitdown/converters/_zip_converter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index 026900d..e2b5fe6 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -77,6 +77,10 @@ class ZipConverter(DocumentConverter): try: # Extract the zip file safely with zipfile.ZipFile(local_path, "r") as zipObj: + # Bail if we discover it's an Office OOXML file + if "[Content_Types].xml" in zipObj.namelist(): + return None + # Safeguard against path traversal for member in zipObj.namelist(): member_path = os.path.normpath(os.path.join(extraction_dir, member))