From 6e4caac70d63c87a532be773b2dc3f330f9fdbda Mon Sep 17 00:00:00 2001 From: Joel Esler Date: Wed, 18 Dec 2024 13:12:55 -0500 Subject: [PATCH 1/8] Safeguard against path traversal for ZipConverter fix: prevent path traversal vulnerabilities in ZipConverter Added a secure check for path traversal vulnerabilities in the ZipConverter class. Now validates extracted file paths using `os.path.commonprefix` to ensure all files remain within the intended extraction directory. Raises a `ValueError` if a path traversal attempt is detected. - Normalized file paths using `os.path.normpath`. - Added specific exception handling for `zipfile.BadZipFile` and traversal errors. - Ensured cleanup of extracted files after processing when `cleanup_extracted` is enabled. --- .gitignore | 2 ++ src/markitdown/_markitdown.py | 30 ++++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 82f9275..b6139eb 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +src/.DS_Store +.DS_Store diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 2e7e5ff..28770f4 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1133,27 +1133,28 @@ class ZipConverter(DocumentConverter): extracted_zip_folder_name = ( f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}" ) - new_folder = os.path.normpath( + extraction_dir = os.path.normpath( os.path.join(os.path.dirname(local_path), extracted_zip_folder_name) ) md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n" - # Safety check for path traversal - if not new_folder.startswith(os.path.dirname(local_path)): - return DocumentConverterResult( - title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}" - ) - try: - # Extract the zip file + # Extract the zip file safely with zipfile.ZipFile(local_path, "r") as zipObj: - zipObj.extractall(path=new_folder) + # Safeguard against path traversal + for member in zipObj.namelist(): + member_path = os.path.normpath(os.path.join(extraction_dir, member)) + if not os.path.commonprefix([extraction_dir, member_path]) == extraction_dir: + raise ValueError(f"Path traversal detected in zip file: {member}") + + # Extract all files safely + zipObj.extractall(path=extraction_dir) # Process each extracted file - for root, dirs, files in os.walk(new_folder): + for root, dirs, files in os.walk(extraction_dir): for name in files: file_path = os.path.join(root, name) - relative_path = os.path.relpath(file_path, new_folder) + relative_path = os.path.relpath(file_path, extraction_dir) # Get file extension _, file_extension = os.path.splitext(name) @@ -1177,7 +1178,7 @@ class ZipConverter(DocumentConverter): # Clean up extracted files if specified if kwargs.get("cleanup_extracted", True): - shutil.rmtree(new_folder) + shutil.rmtree(extraction_dir) return DocumentConverterResult(title=None, text_content=md_content.strip()) @@ -1186,6 +1187,11 @@ class ZipConverter(DocumentConverter): title=None, text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}", ) + except ValueError as ve: + return DocumentConverterResult( + title=None, + text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}", + ) except Exception as e: return DocumentConverterResult( title=None, From 39410d01df6ecb42a81c4219bdbd3ff6e21b8bfd Mon Sep 17 00:00:00 2001 From: Sugato Ray Date: Wed, 18 Dec 2024 14:22:58 -0500 Subject: [PATCH 2/8] Update CLI helpdoc formatting to allow indentation in code Use `textwrap.dedent()` to allow indented cli-helpdoc in `__main__.py` file. The indentation increases readability, while `textwrap.dedent` helps maintain the same functionality without breaking code. --- src/markitdown/__main__.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 2d53173..9c48cd4 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: MIT import sys import argparse +from textwrap import dedent from ._markitdown import MarkItDown @@ -10,24 +11,24 @@ def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", formatter_class=argparse.RawDescriptionHelpFormatter, - usage=""" -SYNTAX: - - markitdown - If FILENAME is empty, markitdown reads from stdin. - -EXAMPLE: - - markitdown example.pdf - - OR - - cat example.pdf | markitdown - - OR - - markitdown < example.pdf -""".strip(), + usage=dedent(""" + SYNTAX: + + markitdown + If FILENAME is empty, markitdown reads from stdin. + + EXAMPLE: + + markitdown example.pdf + + OR + + cat example.pdf | markitdown + + OR + + markitdown < example.pdf + """).strip(), ) parser.add_argument("filename", nargs="?") From 5fc70864f23c75ea315bfb1c011a4ed82a76ccf0 Mon Sep 17 00:00:00 2001 From: gagb Date: Wed, 18 Dec 2024 11:46:39 -0800 Subject: [PATCH 3/8] Run pre-commit --- src/markitdown/_markitdown.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 28770f4..040a586 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1144,9 +1144,14 @@ class ZipConverter(DocumentConverter): # Safeguard against path traversal for member in zipObj.namelist(): member_path = os.path.normpath(os.path.join(extraction_dir, member)) - if not os.path.commonprefix([extraction_dir, member_path]) == extraction_dir: - raise ValueError(f"Path traversal detected in zip file: {member}") - + if ( + not os.path.commonprefix([extraction_dir, member_path]) + == extraction_dir + ): + raise ValueError( + f"Path traversal detected in zip file: {member}" + ) + # Extract all files safely zipObj.extractall(path=extraction_dir) From 356e895306baf01633ebacd5888487321c940f6a Mon Sep 17 00:00:00 2001 From: Sugato Ray Date: Wed, 18 Dec 2024 21:25:23 +0000 Subject: [PATCH 4/8] update formatting with pre-commit --- src/markitdown/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 9c48cd4..be2a0f2 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -11,7 +11,8 @@ def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", formatter_class=argparse.RawDescriptionHelpFormatter, - usage=dedent(""" + usage=dedent( + """ SYNTAX: markitdown @@ -28,7 +29,8 @@ def main(): OR markitdown < example.pdf - """).strip(), + """ + ).strip(), ) parser.add_argument("filename", nargs="?") From 1384e8072578278977ec6d67c852f9c2f79d799e Mon Sep 17 00:00:00 2001 From: Sugato Ray Date: Wed, 18 Dec 2024 21:24:57 +0000 Subject: [PATCH 5/8] update .gitignore to exclude .vscode folder --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 82f9275..e6c8f2e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.vscode + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] From 6e1b9a7402f425b3a740051a35db0fcd336ce549 Mon Sep 17 00:00:00 2001 From: gagb Date: Wed, 18 Dec 2024 13:46:10 -0800 Subject: [PATCH 6/8] Run precommit --- src/markitdown/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py index 9c48cd4..be2a0f2 100644 --- a/src/markitdown/__main__.py +++ b/src/markitdown/__main__.py @@ -11,7 +11,8 @@ def main(): parser = argparse.ArgumentParser( description="Convert various file formats to markdown.", formatter_class=argparse.RawDescriptionHelpFormatter, - usage=dedent(""" + usage=dedent( + """ SYNTAX: markitdown @@ -28,7 +29,8 @@ def main(): OR markitdown < example.pdf - """).strip(), + """ + ).strip(), ) parser.add_argument("filename", nargs="?") From a2743a5314936fdfb83e17978323a463e2111bda Mon Sep 17 00:00:00 2001 From: gagb Date: Wed, 18 Dec 2024 14:26:36 -0800 Subject: [PATCH 7/8] Add downloads badge --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 75c2ba0..978327c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ # MarkItDown [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) +![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown) + + MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). It supports: From c86287b7e3c6b41138c9b8e5e9097c359ea32fbc Mon Sep 17 00:00:00 2001 From: lumin Date: Wed, 18 Dec 2024 18:22:41 +0900 Subject: [PATCH 8/8] feat: add project description in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c5bd58b..3e14cec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "markitdown" dynamic = ["version"] -description = '' +description = 'Utility tool for converting various files to Markdown' readme = "README.md" requires-python = ">=3.10" license = "MIT"