@@ -14,6 +14,7 @@ It presently supports:
|
|||||||
- Audio (EXIF metadata, and speech transcription)
|
- Audio (EXIF metadata, and speech transcription)
|
||||||
- HTML (special handling of Wikipedia, etc.)
|
- HTML (special handling of Wikipedia, etc.)
|
||||||
- Various other text-based formats (csv, json, xml, etc.)
|
- Various other text-based formats (csv, json, xml, etc.)
|
||||||
|
- ZIP (Iterates over contents and converts each file)
|
||||||
|
|
||||||
# Installation
|
# Installation
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import subprocess
|
|||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import traceback
|
import traceback
|
||||||
|
import zipfile
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
||||||
|
|
||||||
@@ -836,6 +837,124 @@ class ImageConverter(MediaConverter):
|
|||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
|
|
||||||
|
|
||||||
|
class ZipConverter(DocumentConverter):
|
||||||
|
"""Converts ZIP files to markdown by extracting and converting all contained files.
|
||||||
|
|
||||||
|
The converter extracts the ZIP contents to a temporary directory, processes each file
|
||||||
|
using appropriate converters based on file extensions, and then combines the results
|
||||||
|
into a single markdown document. The temporary directory is cleaned up after processing.
|
||||||
|
|
||||||
|
Example output format:
|
||||||
|
```markdown
|
||||||
|
Content from the zip file `example.zip`:
|
||||||
|
|
||||||
|
## File: docs/readme.txt
|
||||||
|
|
||||||
|
This is the content of readme.txt
|
||||||
|
Multiple lines are preserved
|
||||||
|
|
||||||
|
## File: images/example.jpg
|
||||||
|
|
||||||
|
ImageSize: 1920x1080
|
||||||
|
DateTimeOriginal: 2024-02-15 14:30:00
|
||||||
|
Description: A beautiful landscape photo
|
||||||
|
|
||||||
|
## File: data/report.xlsx
|
||||||
|
|
||||||
|
## Sheet1
|
||||||
|
| Column1 | Column2 | Column3 |
|
||||||
|
|---------|---------|---------|
|
||||||
|
| data1 | data2 | data3 |
|
||||||
|
| data4 | data5 | data6 |
|
||||||
|
```
|
||||||
|
|
||||||
|
Key features:
|
||||||
|
- Maintains original file structure in headings
|
||||||
|
- Processes nested files recursively
|
||||||
|
- Uses appropriate converters for each file type
|
||||||
|
- Preserves formatting of converted content
|
||||||
|
- Cleans up temporary files after processing
|
||||||
|
"""
|
||||||
|
|
||||||
|
def convert(
|
||||||
|
self, local_path: str, **kwargs: Any
|
||||||
|
) -> Union[None, DocumentConverterResult]:
|
||||||
|
# Bail if not a ZIP
|
||||||
|
extension = kwargs.get("file_extension", "")
|
||||||
|
if extension.lower() != ".zip":
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Get parent converters list if available
|
||||||
|
parent_converters = kwargs.get("_parent_converters", [])
|
||||||
|
if not parent_converters:
|
||||||
|
return DocumentConverterResult(
|
||||||
|
title=None,
|
||||||
|
text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
|
||||||
|
)
|
||||||
|
|
||||||
|
extracted_zip_folder_name = (
|
||||||
|
f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
|
||||||
|
)
|
||||||
|
new_folder = os.path.normpath(
|
||||||
|
os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
|
||||||
|
)
|
||||||
|
md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
|
||||||
|
|
||||||
|
# Safety check for path traversal
|
||||||
|
if not new_folder.startswith(os.path.dirname(local_path)):
|
||||||
|
return DocumentConverterResult(
|
||||||
|
title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Extract the zip file
|
||||||
|
with zipfile.ZipFile(local_path, "r") as zipObj:
|
||||||
|
zipObj.extractall(path=new_folder)
|
||||||
|
|
||||||
|
# Process each extracted file
|
||||||
|
for root, dirs, files in os.walk(new_folder):
|
||||||
|
for name in files:
|
||||||
|
file_path = os.path.join(root, name)
|
||||||
|
relative_path = os.path.relpath(file_path, new_folder)
|
||||||
|
|
||||||
|
# Get file extension
|
||||||
|
_, file_extension = os.path.splitext(name)
|
||||||
|
|
||||||
|
# Update kwargs for the file
|
||||||
|
file_kwargs = kwargs.copy()
|
||||||
|
file_kwargs["file_extension"] = file_extension
|
||||||
|
file_kwargs["_parent_converters"] = parent_converters
|
||||||
|
|
||||||
|
# Try converting the file using available converters
|
||||||
|
for converter in parent_converters:
|
||||||
|
# Skip the zip converter to avoid infinite recursion
|
||||||
|
if isinstance(converter, ZipConverter):
|
||||||
|
continue
|
||||||
|
|
||||||
|
result = converter.convert(file_path, **file_kwargs)
|
||||||
|
if result is not None:
|
||||||
|
md_content += f"\n## File: {relative_path}\n\n"
|
||||||
|
md_content += result.text_content + "\n\n"
|
||||||
|
break
|
||||||
|
|
||||||
|
# Clean up extracted files if specified
|
||||||
|
if kwargs.get("cleanup_extracted", True):
|
||||||
|
shutil.rmtree(new_folder)
|
||||||
|
|
||||||
|
return DocumentConverterResult(title=None, text_content=md_content.strip())
|
||||||
|
|
||||||
|
except zipfile.BadZipFile:
|
||||||
|
return DocumentConverterResult(
|
||||||
|
title=None,
|
||||||
|
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return DocumentConverterResult(
|
||||||
|
title=None,
|
||||||
|
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class FileConversionException(BaseException):
|
class FileConversionException(BaseException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -879,6 +998,7 @@ class MarkItDown:
|
|||||||
self.register_page_converter(Mp3Converter())
|
self.register_page_converter(Mp3Converter())
|
||||||
self.register_page_converter(ImageConverter())
|
self.register_page_converter(ImageConverter())
|
||||||
self.register_page_converter(PdfConverter())
|
self.register_page_converter(PdfConverter())
|
||||||
|
self.register_page_converter(ZipConverter())
|
||||||
|
|
||||||
def convert(
|
def convert(
|
||||||
self, source: Union[str, requests.Response], **kwargs: Any
|
self, source: Union[str, requests.Response], **kwargs: Any
|
||||||
@@ -1034,6 +1154,8 @@ class MarkItDown:
|
|||||||
|
|
||||||
if "mlm_model" not in _kwargs and self._mlm_model is not None:
|
if "mlm_model" not in _kwargs and self._mlm_model is not None:
|
||||||
_kwargs["mlm_model"] = self._mlm_model
|
_kwargs["mlm_model"] = self._mlm_model
|
||||||
|
# Add the list of converters for nested processing
|
||||||
|
_kwargs["_parent_converters"] = self._page_converters
|
||||||
|
|
||||||
# If we hit an error log it and keep trying
|
# If we hit an error log it and keep trying
|
||||||
try:
|
try:
|
||||||
|
|||||||
BIN
tests/test_files/test_files.zip
Normal file
BIN
tests/test_files/test_files.zip
Normal file
Binary file not shown.
@@ -151,6 +151,12 @@ def test_markitdown_local() -> None:
|
|||||||
text_content = result.text_content.replace("\\", "")
|
text_content = result.text_content.replace("\\", "")
|
||||||
assert test_string in text_content
|
assert test_string in text_content
|
||||||
|
|
||||||
|
# Test ZIP file processing
|
||||||
|
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
|
||||||
|
for test_string in DOCX_TEST_STRINGS:
|
||||||
|
text_content = result.text_content.replace("\\", "")
|
||||||
|
assert test_string in text_content
|
||||||
|
|
||||||
# Test Wikipedia processing
|
# Test Wikipedia processing
|
||||||
result = markitdown.convert(
|
result = markitdown.convert(
|
||||||
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
|
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
|
||||||
|
|||||||
Reference in New Issue
Block a user