diff --git a/README.md b/README.md index df7189d..f0f2e83 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # MarkItDown +[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) + The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.) It presently supports: @@ -12,6 +14,7 @@ It presently supports: - Audio (EXIF metadata, and speech transcription) - HTML (special handling of Wikipedia, etc.) - Various other text-based formats (csv, json, xml, etc.) +- ZIP (Iterates over contents and converts each file) # Installation @@ -27,7 +30,6 @@ or from the source pip install -e . ``` - # Usage The API is simple: @@ -39,6 +41,25 @@ result = markitdown.convert("test.xlsx") print(result.text_content) ``` +To use this as a command-line utility, install it and then run it like this: + +```bash +markitdown path-to-file.pdf +``` + +This will output Markdown to standard output. You can save it like this: + +```bash +markitdown path-to-file.pdf > document.md +``` + +You can pipe content to standard input by omitting the argument: + +```bash +cat path-to-file.pdf | markitdown +``` + + You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the mlm_client and mlm_model parameters to the MarkItDown object, according to your specific client. 
```python diff --git a/pyproject.toml b/pyproject.toml index 74df032..756380a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "youtube-transcript-api", "SpeechRecognition", "pathvalidate", + "charset-normalizer", ] [project.urls] diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 5789679..9517a32 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -12,6 +12,7 @@ import subprocess import sys import tempfile import traceback +import zipfile from typing import Any, Dict, List, Optional, Union from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from warnings import catch_warnings @@ -27,6 +28,7 @@ import pptx import puremagic import requests from bs4 import BeautifulSoup +from charset_normalizer import from_path # Optional Transcription support try: @@ -168,9 +170,7 @@ class PlainTextConverter(DocumentConverter): elif "text/" not in content_type.lower(): return None - text_content = "" - with open(local_path, "rt", encoding="utf-8") as fh: - text_content = fh.read() + text_content = str(from_path(local_path).best()) return DocumentConverterResult( title=None, text_content=text_content, @@ -499,7 +499,9 @@ class DocxConverter(HtmlConverter): result = None with open(local_path, "rb") as docx_file: - result = mammoth.convert_to_html(docx_file) + style_map = kwargs.get("style_map", None) + + result = mammoth.convert_to_html(docx_file, style_map=style_map) html_content = result.value result = self._convert(html_content) @@ -589,6 +591,10 @@ class PptxConverter(HtmlConverter): "\n" + self._convert(html_table).text_content.strip() + "\n" ) + # Charts + if shape.has_chart: + md_content += self._convert_chart_to_markdown(shape.chart) + # Text areas elif shape.has_text_frame: if shape == title: @@ -623,6 +629,29 @@ class PptxConverter(HtmlConverter): return True return False + def _convert_chart_to_markdown(self, chart): + md = "\n\n### Chart" + if chart.has_title: 
+ md += f": {chart.chart_title.text_frame.text}" + md += "\n\n" + data = [] + category_names = [c.label for c in chart.plots[0].categories] + series_names = [s.name for s in chart.series] + data.append(["Category"] + series_names) + + for idx, category in enumerate(category_names): + row = [category] + for series in chart.series: + row.append(series.values[idx]) + data.append(row) + + markdown_table = [] + for row in data: + markdown_table.append("| " + " | ".join(map(str, row)) + " |") + header = markdown_table[0] + separator = "|" + "|".join(["---"] * len(data[0])) + "|" + return md + "\n".join([header, separator] + markdown_table[1:]) + + class MediaConverter(DocumentConverter): """ @@ -844,6 +873,124 @@ class ImageConverter(MediaConverter): return response.choices[0].message.content +class ZipConverter(DocumentConverter): + """Converts ZIP files to markdown by extracting and converting all contained files. + + The converter extracts the ZIP contents to a directory created alongside the archive, processes each file + using appropriate converters based on file extensions, and then combines the results + into a single markdown document. The extracted directory is cleaned up after processing. 
+ + Example output format: + ```markdown + Content from the zip file `example.zip`: + + ## File: docs/readme.txt + + This is the content of readme.txt + Multiple lines are preserved + + ## File: images/example.jpg + + ImageSize: 1920x1080 + DateTimeOriginal: 2024-02-15 14:30:00 + Description: A beautiful landscape photo + + ## File: data/report.xlsx + + ## Sheet1 + | Column1 | Column2 | Column3 | + |---------|---------|---------| + | data1 | data2 | data3 | + | data4 | data5 | data6 | + ``` + + Key features: + - Maintains original file structure in headings + - Processes nested files recursively + - Uses appropriate converters for each file type + - Preserves formatting of converted content + - Cleans up temporary files after processing + """ + + def convert( + self, local_path: str, **kwargs: Any + ) -> Union[None, DocumentConverterResult]: + # Bail if not a ZIP + extension = kwargs.get("file_extension", "") + if extension.lower() != ".zip": + return None + + # Get parent converters list if available + parent_converters = kwargs.get("_parent_converters", []) + if not parent_converters: + return DocumentConverterResult( + title=None, + text_content=f"[ERROR] No converters available to process zip contents from: {local_path}", + ) + + extracted_zip_folder_name = ( + f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}" + ) + new_folder = os.path.normpath( + os.path.join(os.path.dirname(local_path), extracted_zip_folder_name) + ) + md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n" + + # Safety check for path traversal + if not new_folder.startswith(os.path.dirname(local_path)): + return DocumentConverterResult( + title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}" + ) + + try: + # Extract the zip file + with zipfile.ZipFile(local_path, "r") as zipObj: + zipObj.extractall(path=new_folder) + + # Process each extracted file + for root, dirs, files in os.walk(new_folder): + for name in files: + file_path = 
os.path.join(root, name) + relative_path = os.path.relpath(file_path, new_folder) + + # Get file extension + _, file_extension = os.path.splitext(name) + + # Update kwargs for the file + file_kwargs = kwargs.copy() + file_kwargs["file_extension"] = file_extension + file_kwargs["_parent_converters"] = parent_converters + + # Try converting the file using available converters + for converter in parent_converters: + # Skip the zip converter to avoid infinite recursion + if isinstance(converter, ZipConverter): + continue + + result = converter.convert(file_path, **file_kwargs) + if result is not None: + md_content += f"\n## File: {relative_path}\n\n" + md_content += result.text_content + "\n\n" + break + + # Clean up extracted files if specified + if kwargs.get("cleanup_extracted", True): + shutil.rmtree(new_folder) + + return DocumentConverterResult(title=None, text_content=md_content.strip()) + + except zipfile.BadZipFile: + return DocumentConverterResult( + title=None, + text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}", + ) + except Exception as e: + return DocumentConverterResult( + title=None, + text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}", + ) + + class FileConversionException(BaseException): pass @@ -861,6 +1008,7 @@ class MarkItDown: requests_session: Optional[requests.Session] = None, mlm_client: Optional[Any] = None, mlm_model: Optional[Any] = None, + style_map: Optional[str] = None, ): if requests_session is None: self._requests_session = requests.Session() @@ -869,6 +1017,7 @@ class MarkItDown: self._mlm_client = mlm_client self._mlm_model = mlm_model + self._style_map = style_map self._page_converters: List[DocumentConverter] = [] @@ -887,6 +1036,7 @@ class MarkItDown: self.register_page_converter(Mp3Converter()) self.register_page_converter(ImageConverter()) self.register_page_converter(PdfConverter()) + self.register_page_converter(ZipConverter()) def convert( self, source: Union[str, requests.Response], 
**kwargs: Any @@ -1042,6 +1192,11 @@ class MarkItDown: if "mlm_model" not in _kwargs and self._mlm_model is not None: _kwargs["mlm_model"] = self._mlm_model + # Add the list of converters for nested processing + _kwargs["_parent_converters"] = self._page_converters + + if "style_map" not in _kwargs and self._style_map is not None: + _kwargs["style_map"] = self._style_map # If we hit an error log it and keep trying try: diff --git a/tests/test_files/test.pptx b/tests/test_files/test.pptx old mode 100755 new mode 100644 index 35eabf4..ea1bbcb Binary files a/tests/test_files/test.pptx and b/tests/test_files/test.pptx differ diff --git a/tests/test_files/test_files.zip b/tests/test_files/test_files.zip new file mode 100644 index 0000000..ef49dc0 Binary files /dev/null and b/tests/test_files/test_files.zip differ diff --git a/tests/test_files/test_mskanji.csv b/tests/test_files/test_mskanji.csv new file mode 100644 index 0000000..d67f5a3 --- /dev/null +++ b/tests/test_files/test_mskanji.csv @@ -0,0 +1,4 @@ +名前,年齢,住所 +佐藤太郎,30,東京 +三木英子,25,大阪 +髙橋淳,35,名古屋 diff --git a/tests/test_files/test_with_comment.docx b/tests/test_files/test_with_comment.docx new file mode 100755 index 0000000..8fc1745 Binary files /dev/null and b/tests/test_files/test_with_comment.docx differ diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 94fd886..76bd302 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -51,12 +51,25 @@ DOCX_TEST_STRINGS = [ "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", ] +DOCX_COMMENT_TEST_STRINGS = [ + "314b0a30-5b04-470b-b9f7-eed2c2bec74a", + "49e168b7-d2ae-407f-a055-2167576f39a1", + "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", + "# Abstract", + "# Introduction", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "This is a test comment. 12df-321a", + "Yet another comment in the doc. 
55yiyi-asd09", +] + PPTX_TEST_STRINGS = [ "2cdda5c8-e50e-4db4-b5f0-9722a649f455", "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", "1b92870d-e3b5-4e65-8153-919f4ff45592", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title + "2003", # chart value ] BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" @@ -87,6 +100,13 @@ SERP_TEST_EXCLUDES = [ "data:image/svg+xml,%3Csvg%20width%3D", ] +CSV_CP932_TEST_STRINGS = [ + "名前,年齢,住所", + "佐藤太郎,30,東京", + "三木英子,25,大阪", + "髙橋淳,35,名古屋", +] + @pytest.mark.skipif( skip_remote, @@ -130,6 +150,24 @@ def test_markitdown_local() -> None: text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Test DOCX processing, with comments + result = markitdown.convert( + os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), + style_map="comment-reference => ", + ) + for test_string in DOCX_COMMENT_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + + # Test DOCX processing, with comments and setting style_map on init + markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") + result = markitdown_with_style_map.convert( + os.path.join(TEST_FILES_DIR, "test_with_comment.docx") + ) + for test_string in DOCX_COMMENT_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + # Test PPTX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) for test_string in PPTX_TEST_STRINGS: @@ -144,6 +182,12 @@ text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Test ZIP file processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) + for test_string in DOCX_TEST_STRINGS: + text_content = 
result.text_content.replace("\\", "") + assert test_string in text_content + # Test Wikipedia processing result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL @@ -164,6 +208,12 @@ def test_markitdown_local() -> None: for test_string in SERP_TEST_STRINGS: assert test_string in text_content + ## Test non-UTF-8 encoding + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) + text_content = result.text_content.replace("\\", "") + for test_string in CSV_CP932_TEST_STRINGS: + assert test_string in text_content + @pytest.mark.skipif( skip_exiftool,