Merge branch 'main' into patch-1

2024-12-19 10:30:10 -08:00
parent b28f380a47 7147bef7d5
commit 423a01844a
5 changed files with 52 additions and 31 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 .vscode
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -160,3 +162,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 src/.DS_Store
 .DS_Store
--- a/README.md
+++ b/README.md
@@ -1,6 +1,9 @@
 # MarkItDown
 [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
 ![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
 MarkItDown is a utility for converting various files to Markdown (e.g., for indexing or text analysis).
 It supports:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
 [project]
 name = "markitdown"
 dynamic = ["version"]
-description = ''
+description = 'Utility tool for converting various files to Markdown'
 readme = "README.md"
 requires-python = ">=3.10"
 license = "MIT"
--- a/src/markitdown/main.py
+++ b/src/markitdown/main.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: MIT
 import sys
 import argparse
 from textwrap import dedent
 from ._markitdown import MarkItDown
@@ -10,24 +11,26 @@ def main():
    parser = argparse.ArgumentParser(
        description="Convert various file formats to markdown.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
-        usage="""
+        usage=dedent(
-SYNTAX: 
+            """
-    
+            SYNTAX: 
-    markitdown <OPTIONAL: FILENAME>
+                
-    If FILENAME is empty, markitdown reads from stdin.
+                markitdown <OPTIONAL: FILENAME>
-
+                If FILENAME is empty, markitdown reads from stdin.
-EXAMPLE:
+            
-    
+            EXAMPLE:
-    markitdown example.pdf
+                
-    
+                markitdown example.pdf
-    OR
+                
-
+                OR
-    cat example.pdf | markitdown
+            
-
+                cat example.pdf | markitdown
-    OR 
+            
-
+                OR 
-    markitdown < example.pdf
+            
-""".strip(),
+                markitdown < example.pdf
            """
        ).strip(),
    )
    parser.add_argument("filename", nargs="?")
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -1133,27 +1133,33 @@ class ZipConverter(DocumentConverter):
        extracted_zip_folder_name = (
            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
        )
-        new_folder = os.path.normpath(
+        extraction_dir = os.path.normpath(
            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
        )
        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
        # Safety check: the extraction folder must be located under the zip file's own directory
        if not new_folder.startswith(os.path.dirname(local_path)):
            return DocumentConverterResult(
                title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
            )
        try:
-            # Extract the zip file
+            # Extract the zip file safely
            with zipfile.ZipFile(local_path, "r") as zipObj:
-                zipObj.extractall(path=new_folder)
+                # Safeguard against path traversal: every member must resolve inside extraction_dir
                for member in zipObj.namelist():
                    member_path = os.path.normpath(os.path.join(extraction_dir, member))
                    if (
                        not os.path.commonprefix([extraction_dir, member_path])
                        == extraction_dir
                    ):
                        raise ValueError(
                            f"Path traversal detected in zip file: {member}"
                        )
                # Extract all files safely
                zipObj.extractall(path=extraction_dir)
            # Process each extracted file
-            for root, dirs, files in os.walk(new_folder):
+            for root, dirs, files in os.walk(extraction_dir):
                for name in files:
                    file_path = os.path.join(root, name)
-                    relative_path = os.path.relpath(file_path, new_folder)
+                    relative_path = os.path.relpath(file_path, extraction_dir)
                    # Get file extension
                    _, file_extension = os.path.splitext(name)
@@ -1177,7 +1183,7 @@ class ZipConverter(DocumentConverter):
            # Clean up extracted files if specified
            if kwargs.get("cleanup_extracted", True):
-                shutil.rmtree(new_folder)
+                shutil.rmtree(extraction_dir)
            return DocumentConverterResult(title=None, text_content=md_content.strip())
@@ -1186,6 +1192,11 @@ class ZipConverter(DocumentConverter):
                title=None,
                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except ValueError as ve:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
            )
        except Exception as e:
            return DocumentConverterResult(
                title=None,