Merge branch 'main' into patch-1
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,3 +1,5 @@
|
|||||||
|
.vscode
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
@@ -160,3 +162,5 @@ cython_debug/
|
|||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
src/.DS_Store
|
||||||
|
.DS_Store
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
# MarkItDown
|
# MarkItDown
|
||||||
|
|
||||||
[](https://pypi.org/project/markitdown/)
|
[](https://pypi.org/project/markitdown/)
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
||||||
It supports:
|
It supports:
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
|
|||||||
[project]
|
[project]
|
||||||
name = "markitdown"
|
name = "markitdown"
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
description = ''
|
description = 'Utility tool for converting various files to Markdown'
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
import sys
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
|
from textwrap import dedent
|
||||||
from ._markitdown import MarkItDown
|
from ._markitdown import MarkItDown
|
||||||
|
|
||||||
|
|
||||||
@@ -10,24 +11,26 @@ def main():
|
|||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Convert various file formats to markdown.",
|
description="Convert various file formats to markdown.",
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
usage="""
|
usage=dedent(
|
||||||
SYNTAX:
|
"""
|
||||||
|
SYNTAX:
|
||||||
markitdown <OPTIONAL: FILENAME>
|
|
||||||
If FILENAME is empty, markitdown reads from stdin.
|
markitdown <OPTIONAL: FILENAME>
|
||||||
|
If FILENAME is empty, markitdown reads from stdin.
|
||||||
EXAMPLE:
|
|
||||||
|
EXAMPLE:
|
||||||
markitdown example.pdf
|
|
||||||
|
markitdown example.pdf
|
||||||
OR
|
|
||||||
|
OR
|
||||||
cat example.pdf | markitdown
|
|
||||||
|
cat example.pdf | markitdown
|
||||||
OR
|
|
||||||
|
OR
|
||||||
markitdown < example.pdf
|
|
||||||
""".strip(),
|
markitdown < example.pdf
|
||||||
|
"""
|
||||||
|
).strip(),
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument("filename", nargs="?")
|
parser.add_argument("filename", nargs="?")
|
||||||
|
|||||||
@@ -1133,27 +1133,33 @@ class ZipConverter(DocumentConverter):
|
|||||||
extracted_zip_folder_name = (
|
extracted_zip_folder_name = (
|
||||||
f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
|
f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
|
||||||
)
|
)
|
||||||
new_folder = os.path.normpath(
|
extraction_dir = os.path.normpath(
|
||||||
os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
|
os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
|
||||||
)
|
)
|
||||||
md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
|
md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
|
||||||
|
|
||||||
# Safety check for path traversal
|
|
||||||
if not new_folder.startswith(os.path.dirname(local_path)):
|
|
||||||
return DocumentConverterResult(
|
|
||||||
title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Extract the zip file
|
# Extract the zip file safely
|
||||||
with zipfile.ZipFile(local_path, "r") as zipObj:
|
with zipfile.ZipFile(local_path, "r") as zipObj:
|
||||||
zipObj.extractall(path=new_folder)
|
# Safeguard against path traversal
|
||||||
|
for member in zipObj.namelist():
|
||||||
|
member_path = os.path.normpath(os.path.join(extraction_dir, member))
|
||||||
|
if (
|
||||||
|
not os.path.commonprefix([extraction_dir, member_path])
|
||||||
|
== extraction_dir
|
||||||
|
):
|
||||||
|
raise ValueError(
|
||||||
|
f"Path traversal detected in zip file: {member}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extract all files safely
|
||||||
|
zipObj.extractall(path=extraction_dir)
|
||||||
|
|
||||||
# Process each extracted file
|
# Process each extracted file
|
||||||
for root, dirs, files in os.walk(new_folder):
|
for root, dirs, files in os.walk(extraction_dir):
|
||||||
for name in files:
|
for name in files:
|
||||||
file_path = os.path.join(root, name)
|
file_path = os.path.join(root, name)
|
||||||
relative_path = os.path.relpath(file_path, new_folder)
|
relative_path = os.path.relpath(file_path, extraction_dir)
|
||||||
|
|
||||||
# Get file extension
|
# Get file extension
|
||||||
_, file_extension = os.path.splitext(name)
|
_, file_extension = os.path.splitext(name)
|
||||||
@@ -1177,7 +1183,7 @@ class ZipConverter(DocumentConverter):
|
|||||||
|
|
||||||
# Clean up extracted files if specified
|
# Clean up extracted files if specified
|
||||||
if kwargs.get("cleanup_extracted", True):
|
if kwargs.get("cleanup_extracted", True):
|
||||||
shutil.rmtree(new_folder)
|
shutil.rmtree(extraction_dir)
|
||||||
|
|
||||||
return DocumentConverterResult(title=None, text_content=md_content.strip())
|
return DocumentConverterResult(title=None, text_content=md_content.strip())
|
||||||
|
|
||||||
@@ -1186,6 +1192,11 @@ class ZipConverter(DocumentConverter):
|
|||||||
title=None,
|
title=None,
|
||||||
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
|
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
|
||||||
)
|
)
|
||||||
|
except ValueError as ve:
|
||||||
|
return DocumentConverterResult(
|
||||||
|
title=None,
|
||||||
|
text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return DocumentConverterResult(
|
return DocumentConverterResult(
|
||||||
title=None,
|
title=None,
|
||||||
|
|||||||
Reference in New Issue
Block a user