Merge branch 'main' into patch-1

This commit is contained in:
gagb
2024-12-19 10:30:10 -08:00
committed by GitHub
5 changed files with 52 additions and 31 deletions

4
.gitignore vendored
View File

@@ -1,3 +1,5 @@
.vscode
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
*.py[cod] *.py[cod]
@@ -160,3 +162,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear # and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder. # option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/ #.idea/
src/.DS_Store
.DS_Store

View File

@@ -1,6 +1,9 @@
# MarkItDown # MarkItDown
[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
![PyPI - Downloads](https://img.shields.io/pypi/dd/markitdown)
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
It supports: It supports:

View File

@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
[project] [project]
name = "markitdown" name = "markitdown"
dynamic = ["version"] dynamic = ["version"]
description = '' description = 'Utility tool for converting various files to Markdown'
readme = "README.md" readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"
license = "MIT" license = "MIT"

View File

@@ -3,6 +3,7 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
import sys import sys
import argparse import argparse
from textwrap import dedent
from ._markitdown import MarkItDown from ._markitdown import MarkItDown
@@ -10,24 +11,26 @@ def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Convert various file formats to markdown.", description="Convert various file formats to markdown.",
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
usage=""" usage=dedent(
SYNTAX: """
SYNTAX:
markitdown <OPTIONAL: FILENAME>
If FILENAME is empty, markitdown reads from stdin. markitdown <OPTIONAL: FILENAME>
If FILENAME is empty, markitdown reads from stdin.
EXAMPLE:
EXAMPLE:
markitdown example.pdf
markitdown example.pdf
OR
OR
cat example.pdf | markitdown
cat example.pdf | markitdown
OR
OR
markitdown < example.pdf
""".strip(), markitdown < example.pdf
"""
).strip(),
) )
parser.add_argument("filename", nargs="?") parser.add_argument("filename", nargs="?")

View File

@@ -1133,27 +1133,33 @@ class ZipConverter(DocumentConverter):
extracted_zip_folder_name = ( extracted_zip_folder_name = (
f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}" f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
) )
new_folder = os.path.normpath( extraction_dir = os.path.normpath(
os.path.join(os.path.dirname(local_path), extracted_zip_folder_name) os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
) )
md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n" md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
# Safety check for path traversal
if not new_folder.startswith(os.path.dirname(local_path)):
return DocumentConverterResult(
title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
)
try: try:
# Extract the zip file # Extract the zip file safely
with zipfile.ZipFile(local_path, "r") as zipObj: with zipfile.ZipFile(local_path, "r") as zipObj:
zipObj.extractall(path=new_folder) # Safeguard against path traversal
for member in zipObj.namelist():
member_path = os.path.normpath(os.path.join(extraction_dir, member))
if (
not os.path.commonprefix([extraction_dir, member_path])
== extraction_dir
):
raise ValueError(
f"Path traversal detected in zip file: {member}"
)
# Extract all files safely
zipObj.extractall(path=extraction_dir)
# Process each extracted file # Process each extracted file
for root, dirs, files in os.walk(new_folder): for root, dirs, files in os.walk(extraction_dir):
for name in files: for name in files:
file_path = os.path.join(root, name) file_path = os.path.join(root, name)
relative_path = os.path.relpath(file_path, new_folder) relative_path = os.path.relpath(file_path, extraction_dir)
# Get file extension # Get file extension
_, file_extension = os.path.splitext(name) _, file_extension = os.path.splitext(name)
@@ -1177,7 +1183,7 @@ class ZipConverter(DocumentConverter):
# Clean up extracted files if specified # Clean up extracted files if specified
if kwargs.get("cleanup_extracted", True): if kwargs.get("cleanup_extracted", True):
shutil.rmtree(new_folder) shutil.rmtree(extraction_dir)
return DocumentConverterResult(title=None, text_content=md_content.strip()) return DocumentConverterResult(title=None, text_content=md_content.strip())
@@ -1186,6 +1192,11 @@ class ZipConverter(DocumentConverter):
title=None, title=None,
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}", text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
) )
except ValueError as ve:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
)
except Exception as e: except Exception as e:
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,