diff --git a/README.md b/README.md
index df7189d..f0f2e83 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # MarkItDown
 
+[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
+
 The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)
 
 It presently supports:
@@ -12,6 +14,7 @@ It presently supports:
 - Audio (EXIF metadata, and speech transcription)
 - HTML (special handling of Wikipedia, etc.)
 - Various other text-based formats (csv, json, xml, etc.)
+- ZIP (Iterates over contents and converts each file)
 
 # Installation
 
@@ -27,7 +30,6 @@ or from the source
 pip install -e .
 ```
 
-
 # Usage
 
 The API is simple:
@@ -39,6 +41,25 @@ result = markitdown.convert("test.xlsx")
 print(result.text_content)
 ```
 
+To use this as a command-line utility, install it and then run it like this:
+
+```bash
+markitdown path-to-file.pdf
+```
+
+This will output Markdown to standard output. You can save it like this:
+
+```bash
+markitdown path-to-file.pdf > document.md
+```
+
+You can pipe content to standard input by omitting the argument:
+
+```bash
+cat path-to-file.pdf | markitdown
+```
+
+
 You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client.
 
 ```python
diff --git a/pyproject.toml b/pyproject.toml
index 74df032..756380a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
   "youtube-transcript-api",
   "SpeechRecognition",
   "pathvalidate",
+  "charset-normalizer",
 ]
 
 [project.urls]
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 63f7c9d..714911b 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -12,6 +12,7 @@ import subprocess
 import sys
 import tempfile
 import traceback
+import zipfile
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 
@@ -26,6 +27,7 @@ import pptx
 import puremagic
 import requests
 from bs4 import BeautifulSoup
+from charset_normalizer import from_path
 
 # Optional Transcription support
 try:
@@ -161,9 +163,7 @@ class PlainTextConverter(DocumentConverter):
         elif "text/" not in content_type.lower():
             return None
 
-        text_content = ""
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            text_content = fh.read()
+        text_content = str(from_path(local_path).best())
         return DocumentConverterResult(
             title=None,
             text_content=text_content,
@@ -864,6 +864,124 @@ class ImageConverter(MediaConverter):
         return response.choices[0].message.content
 
 
+class ZipConverter(DocumentConverter):
+    """Converts ZIP files to markdown by extracting and converting all contained files.
+
+    The converter extracts the ZIP contents to a temporary directory, processes each file
+    using appropriate converters based on file extensions, and then combines the results
+    into a single markdown document. The temporary directory is cleaned up after processing.
+
+    Example output format:
+    ```markdown
+    Content from the zip file `example.zip`:
+
+    ## File: docs/readme.txt
+
+    This is the content of readme.txt
+    Multiple lines are preserved
+
+    ## File: images/example.jpg
+
+    ImageSize: 1920x1080
+    DateTimeOriginal: 2024-02-15 14:30:00
+    Description: A beautiful landscape photo
+
+    ## File: data/report.xlsx
+
+    ## Sheet1
+    | Column1 | Column2 | Column3 |
+    |---------|---------|---------|
+    | data1 | data2 | data3 |
+    | data4 | data5 | data6 |
+    ```
+
+    Key features:
+    - Maintains original file structure in headings
+    - Processes nested files recursively
+    - Uses appropriate converters for each file type
+    - Preserves formatting of converted content
+    - Cleans up temporary files after processing
+    """
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        # Bail if not a ZIP
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".zip":
+            return None
+
+        # Get parent converters list if available
+        parent_converters = kwargs.get("_parent_converters", [])
+        if not parent_converters:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
+            )
+
+        extracted_zip_folder_name = (
+            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
+        )
+        new_folder = os.path.normpath(
+            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
+        )
+        md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
+
+        # Safety check for path traversal
+        if not new_folder.startswith(os.path.dirname(local_path)):
+            return DocumentConverterResult(
+                title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
+            )
+
+        try:
+            # Extract the zip file
+            with zipfile.ZipFile(local_path, "r") as zipObj:
+                zipObj.extractall(path=new_folder)
+
+            # Process each extracted file
+            for root, dirs, files in os.walk(new_folder):
+                for name in files:
+                    file_path = os.path.join(root, name)
+                    relative_path = os.path.relpath(file_path, new_folder)
+
+                    # Get file extension
+                    _, file_extension = os.path.splitext(name)
+
+                    # Update kwargs for the file
+                    file_kwargs = kwargs.copy()
+                    file_kwargs["file_extension"] = file_extension
+                    file_kwargs["_parent_converters"] = parent_converters
+
+                    # Try converting the file using available converters
+                    for converter in parent_converters:
+                        # Skip the zip converter to avoid infinite recursion
+                        if isinstance(converter, ZipConverter):
+                            continue
+
+                        result = converter.convert(file_path, **file_kwargs)
+                        if result is not None:
+                            md_content += f"\n## File: {relative_path}\n\n"
+                            md_content += result.text_content + "\n\n"
+                            break
+
+            # Clean up extracted files if specified
+            if kwargs.get("cleanup_extracted", True):
+                shutil.rmtree(new_folder)
+
+            return DocumentConverterResult(title=None, text_content=md_content.strip())
+
+        except zipfile.BadZipFile:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
+            )
+        except Exception as e:
+            return DocumentConverterResult(
+                title=None,
+                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
+            )
+
+
 class FileConversionException(BaseException):
     pass
 
@@ -907,6 +1025,7 @@ class MarkItDown:
         self.register_page_converter(Mp3Converter())
         self.register_page_converter(ImageConverter())
         self.register_page_converter(PdfConverter())
+        self.register_page_converter(ZipConverter())
 
     def convert(
         self, source: Union[str, requests.Response], **kwargs: Any
@@ -1062,6 +1181,8 @@ class MarkItDown:
             if "mlm_model" not in _kwargs and self._mlm_model is not None:
                 _kwargs["mlm_model"] = self._mlm_model
 
+            # Add the list of converters for nested processing
+            _kwargs["_parent_converters"] = self._page_converters
 
             # If we hit an error log it and keep trying
             try:
diff --git a/tests/test_files/test_files.zip b/tests/test_files/test_files.zip
new file mode 100644
index 0000000..ef49dc0
Binary files /dev/null and b/tests/test_files/test_files.zip differ
diff --git a/tests/test_files/test_mskanji.csv b/tests/test_files/test_mskanji.csv
new file mode 100644
index 0000000..d67f5a3
--- /dev/null
+++ b/tests/test_files/test_mskanji.csv
@@ -0,0 +1,4 @@
+名前,年齢,住所
+佐藤太郎,30,東京
+三木英子,25,大阪
+煖エ淳,35,名古屋
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 4922c49..5c3d79f 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -89,6 +89,13 @@ SERP_TEST_EXCLUDES = [
     "data:image/svg+xml,%3Csvg%20width%3D",
 ]
 
+CSV_CP932_TEST_STRINGS = [
+    "蜷榊燕,蟷エ鮨「,菴乗園",
+    "菴占陸螟ェ驛,30,譚ア莠ャ",
+    "荳画惠闍ア蟄,25,螟ァ髦ェ",
+    "鬮呎ゥ区キウ,35,蜷榊商螻",
+]
+
 
 @pytest.mark.skipif(
     skip_remote,
@@ -146,6 +153,12 @@ def test_markitdown_local() -> None:
         text_content = result.text_content.replace("\\", "")
         assert test_string in text_content
 
+    # Test ZIP file processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
+    for test_string in DOCX_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
     # Test Wikipedia processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
@@ -166,6 +179,12 @@ def test_markitdown_local() -> None:
     for test_string in SERP_TEST_STRINGS:
         assert test_string in text_content
 
+    ## Test non-UTF-8 encoding
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
+    text_content = result.text_content.replace("\\", "")
+    for test_string in CSV_CP932_TEST_STRINGS:
+        assert test_string in text_content
+
 
 @pytest.mark.skipif(
     skip_exiftool,
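
For reviewers who want to exercise both changes by hand, here is a minimal sketch. Assumptions: this branch is installed locally (`pip install -e .`) and the commands are run from the repository root, so the `tests/test_files/test_files.zip` and `tests/test_files/test_mskanji.csv` fixtures added above are present.

```python
from markitdown import MarkItDown

markitdown = MarkItDown()

# ZIP handling: each archive member is converted and appended
# under a "## File: <path>" heading, as described in the ZipConverter docstring.
zip_result = markitdown.convert("tests/test_files/test_files.zip")
print(zip_result.text_content)

# Non-UTF-8 text: PlainTextConverter now detects the encoding via charset-normalizer
# (this fixture is CP932-encoded) instead of assuming UTF-8.
csv_result = markitdown.convert("tests/test_files/test_mskanji.csv")
print(csv_result.text_content)
```

Note that the ZipConverter depends on the `_parent_converters` list that `MarkItDown.convert` now injects into `_kwargs`, so nested files are processed with whatever converters are registered on the instance; the same behavior is covered by the new assertions in `tests/test_markitdown.py`.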