Merge branch 'main' into main

This commit is contained in:
gagb
2024-12-16 16:35:50 -08:00
committed by GitHub
9 changed files with 251 additions and 9 deletions

1
.gitattributes vendored Normal file
View File

@@ -0,0 +1 @@
tests/test_files/** linguist-vendored

View File

@@ -1,5 +1,7 @@
# MarkItDown # MarkItDown
[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.) The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)
It presently supports: It presently supports:
@@ -12,6 +14,7 @@ It presently supports:
- Audio (EXIF metadata, and speech transcription) - Audio (EXIF metadata, and speech transcription)
- HTML (special handling of Wikipedia, etc.) - HTML (special handling of Wikipedia, etc.)
- Various other text-based formats (csv, json, xml, etc.) - Various other text-based formats (csv, json, xml, etc.)
- ZIP (Iterates over contents and converts each file)
# Installation # Installation
@@ -27,7 +30,6 @@ or from the source
pip install -e . pip install -e .
``` ```
# Usage # Usage
The API is simple: The API is simple:
@@ -39,7 +41,26 @@ result = markitdown.convert("test.xlsx")
print(result.text_content) print(result.text_content)
``` ```
You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client. To use this as a command-line utility, install it and then run it like this:
```bash
markitdown path-to-file.pdf
```
This will output Markdown to standard output. You can save it like this:
```bash
markitdown path-to-file.pdf > document.md
```
You can pipe content to standard input by omitting the argument:
```bash
cat path-to-file.pdf | markitdown
```
You can also configure markitdown to use Large Language Models to describe images. To do so you must provide `mlm_client` and `mlm_model` parameters to MarkItDown object, according to your specific client.
```python ```python
from markitdown import MarkItDown from markitdown import MarkItDown

View File

@@ -38,6 +38,7 @@ dependencies = [
"youtube-transcript-api", "youtube-transcript-api",
"SpeechRecognition", "SpeechRecognition",
"pathvalidate", "pathvalidate",
"charset-normalizer",
] ]
[project.urls] [project.urls]

View File

@@ -12,8 +12,10 @@ import subprocess
import sys import sys
import tempfile import tempfile
import traceback import traceback
import zipfile
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import catch_warnings
import mammoth import mammoth
import markdownify import markdownify
@@ -26,10 +28,17 @@ import pptx
import puremagic import puremagic
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from charset_normalizer import from_path
# Optional Transcription support # Optional Transcription support
try: try:
import pydub # Using warnings' catch_warnings to catch
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True IS_AUDIO_TRANSCRIPTION_CAPABLE = True
@@ -161,9 +170,7 @@ class PlainTextConverter(DocumentConverter):
elif "text/" not in content_type.lower(): elif "text/" not in content_type.lower():
return None return None
text_content = "" text_content = str(from_path(local_path).best())
with open(local_path, "rt", encoding="utf-8") as fh:
text_content = fh.read()
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=text_content, text_content=text_content,
@@ -344,8 +351,11 @@ class YouTubeConverter(DocumentConverter):
assert isinstance(params["v"][0], str) assert isinstance(params["v"][0], str)
video_id = str(params["v"][0]) video_id = str(params["v"][0])
try: try:
youtube_transcript_languages = kwargs.get(
"youtube_transcript_languages", ("en",)
)
# Must be a single transcript. # Must be a single transcript.
transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
# Alternative formatting: # Alternative formatting:
# formatter = TextFormatter() # formatter = TextFormatter()
@@ -492,7 +502,9 @@ class DocxConverter(HtmlConverter):
result = None result = None
with open(local_path, "rb") as docx_file: with open(local_path, "rb") as docx_file:
result = mammoth.convert_to_html(docx_file) style_map = kwargs.get("style_map", None)
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value html_content = result.value
result = self._convert(html_content) result = self._convert(html_content)
@@ -582,6 +594,10 @@ class PptxConverter(HtmlConverter):
"\n" + self._convert(html_table).text_content.strip() + "\n" "\n" + self._convert(html_table).text_content.strip() + "\n"
) )
# Charts
if shape.has_chart:
md_content += self._convert_chart_to_markdown(shape.chart)
# Text areas # Text areas
elif shape.has_text_frame: elif shape.has_text_frame:
if shape == title: if shape == title:
@@ -616,6 +632,29 @@ class PptxConverter(HtmlConverter):
return True return True
return False return False
def _convert_chart_to_markdown(self, chart):
md = "\n\n### Chart"
if chart.has_title:
md += f": {chart.chart_title.text_frame.text}"
md += "\n\n"
data = []
category_names = [c.label for c in chart.plots[0].categories]
series_names = [s.name for s in chart.series]
data.append(["Category"] + series_names)
for idx, category in enumerate(category_names):
row = [category]
for series in chart.series:
row.append(series.values[idx])
data.append(row)
markdown_table = []
for row in data:
markdown_table.append("| " + " | ".join(map(str, row)) + " |")
header = markdown_table[0]
separator = "|" + "|".join(["---"] * len(data[0])) + "|"
return md + "\n".join([header, separator] + markdown_table[1:])
class MediaConverter(DocumentConverter): class MediaConverter(DocumentConverter):
""" """
@@ -837,6 +876,124 @@ class ImageConverter(MediaConverter):
return response.choices[0].message.content return response.choices[0].message.content
class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.
The converter extracts the ZIP contents to a temporary directory, processes each file
using appropriate converters based on file extensions, and then combines the results
into a single markdown document. The temporary directory is cleaned up after processing.
Example output format:
```markdown
Content from the zip file `example.zip`:
## File: docs/readme.txt
This is the content of readme.txt
Multiple lines are preserved
## File: images/example.jpg
ImageSize: 1920x1080
DateTimeOriginal: 2024-02-15 14:30:00
Description: A beautiful landscape photo
## File: data/report.xlsx
## Sheet1
| Column1 | Column2 | Column3 |
|---------|---------|---------|
| data1 | data2 | data3 |
| data4 | data5 | data6 |
```
Key features:
- Maintains original file structure in headings
- Processes nested files recursively
- Uses appropriate converters for each file type
- Preserves formatting of converted content
- Cleans up temporary files after processing
"""
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not a ZIP
extension = kwargs.get("file_extension", "")
if extension.lower() != ".zip":
return None
# Get parent converters list if available
parent_converters = kwargs.get("_parent_converters", [])
if not parent_converters:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
)
extracted_zip_folder_name = (
f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
)
new_folder = os.path.normpath(
os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
)
md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
# Safety check for path traversal
if not new_folder.startswith(os.path.dirname(local_path)):
return DocumentConverterResult(
title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
)
try:
# Extract the zip file
with zipfile.ZipFile(local_path, "r") as zipObj:
zipObj.extractall(path=new_folder)
# Process each extracted file
for root, dirs, files in os.walk(new_folder):
for name in files:
file_path = os.path.join(root, name)
relative_path = os.path.relpath(file_path, new_folder)
# Get file extension
_, file_extension = os.path.splitext(name)
# Update kwargs for the file
file_kwargs = kwargs.copy()
file_kwargs["file_extension"] = file_extension
file_kwargs["_parent_converters"] = parent_converters
# Try converting the file using available converters
for converter in parent_converters:
# Skip the zip converter to avoid infinite recursion
if isinstance(converter, ZipConverter):
continue
result = converter.convert(file_path, **file_kwargs)
if result is not None:
md_content += f"\n## File: {relative_path}\n\n"
md_content += result.text_content + "\n\n"
break
# Clean up extracted files if specified
if kwargs.get("cleanup_extracted", True):
shutil.rmtree(new_folder)
return DocumentConverterResult(title=None, text_content=md_content.strip())
except zipfile.BadZipFile:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
)
except Exception as e:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
)
class FileConversionException(BaseException): class FileConversionException(BaseException):
pass pass
@@ -854,6 +1011,7 @@ class MarkItDown:
requests_session: Optional[requests.Session] = None, requests_session: Optional[requests.Session] = None,
mlm_client: Optional[Any] = None, mlm_client: Optional[Any] = None,
mlm_model: Optional[Any] = None, mlm_model: Optional[Any] = None,
style_map: Optional[str] = None,
): ):
if requests_session is None: if requests_session is None:
self._requests_session = requests.Session() self._requests_session = requests.Session()
@@ -862,6 +1020,7 @@ class MarkItDown:
self._mlm_client = mlm_client self._mlm_client = mlm_client
self._mlm_model = mlm_model self._mlm_model = mlm_model
self._style_map = style_map
self._page_converters: List[DocumentConverter] = [] self._page_converters: List[DocumentConverter] = []
@@ -880,6 +1039,7 @@ class MarkItDown:
self.register_page_converter(Mp3Converter()) self.register_page_converter(Mp3Converter())
self.register_page_converter(ImageConverter()) self.register_page_converter(ImageConverter())
self.register_page_converter(PdfConverter()) self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter())
def convert( def convert(
self, source: Union[str, requests.Response], **kwargs: Any self, source: Union[str, requests.Response], **kwargs: Any
@@ -1003,7 +1163,7 @@ class MarkItDown:
self._append_ext(extensions, g) self._append_ext(extensions, g)
# Convert # Convert
result = self._convert(temp_path, extensions, url=response.url) result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up # Clean up
finally: finally:
try: try:
@@ -1035,6 +1195,11 @@ class MarkItDown:
if "mlm_model" not in _kwargs and self._mlm_model is not None: if "mlm_model" not in _kwargs and self._mlm_model is not None:
_kwargs["mlm_model"] = self._mlm_model _kwargs["mlm_model"] = self._mlm_model
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
# If we hit an error log it and keep trying # If we hit an error log it and keep trying
try: try:

BIN
tests/test_files/test.pptx vendored Executable file → Normal file

Binary file not shown.

BIN
tests/test_files/test_files.zip vendored Normal file

Binary file not shown.

4
tests/test_files/test_mskanji.csv vendored Normal file
View File

@@ -0,0 +1,4 @@
<EFBFBD><EFBFBD><EFBFBD>O,<EFBFBD>N<EFBFBD><EFBFBD>,<EFBFBD>Z<EFBFBD><EFBFBD>
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Y,30,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD>O<EFBFBD>؉p<EFBFBD>q,25,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>~,35,<EFBFBD><EFBFBD><EFBFBD>É<EFBFBD>
1 –¼‘O ”N—î �Z�Š
2 �²“¡‘¾˜Y 30 “Œ‹ž
3 ŽO–؉pŽq 25 ‘å�ã
4 îà‹´�~ 35 –¼ŒÃ‰®

BIN
tests/test_files/test_with_comment.docx vendored Executable file

Binary file not shown.

View File

@@ -51,12 +51,25 @@ DOCX_TEST_STRINGS = [
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
] ]
DOCX_COMMENT_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"This is a test comment. 12df-321a",
"Yet another comment in the doc. 55yiyi-asd09",
]
PPTX_TEST_STRINGS = [ PPTX_TEST_STRINGS = [
"2cdda5c8-e50e-4db4-b5f0-9722a649f455", "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12", "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592", "1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
] ]
BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
@@ -87,6 +100,13 @@ SERP_TEST_EXCLUDES = [
"data:image/svg+xml,%3Csvg%20width%3D", "data:image/svg+xml,%3Csvg%20width%3D",
] ]
CSV_CP932_TEST_STRINGS = [
"名前,年齢,住所",
"佐藤太郎,30,東京",
"三木英子,25,大阪",
"髙橋淳,35,名古屋",
]
@pytest.mark.skipif( @pytest.mark.skipif(
skip_remote, skip_remote,
@@ -130,6 +150,24 @@ def test_markitdown_local() -> None:
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
assert test_string in text_content assert test_string in text_content
# Test DOCX processing, with comments
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
style_map="comment-reference => ",
)
for test_string in DOCX_COMMENT_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
result = markitdown_with_style_map.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
)
for test_string in DOCX_COMMENT_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test PPTX processing # Test PPTX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
for test_string in PPTX_TEST_STRINGS: for test_string in PPTX_TEST_STRINGS:
@@ -144,6 +182,12 @@ def test_markitdown_local() -> None:
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
assert test_string in text_content assert test_string in text_content
# Test ZIP file processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
for test_string in DOCX_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test Wikipedia processing # Test Wikipedia processing
result = markitdown.convert( result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
@@ -164,6 +208,12 @@ def test_markitdown_local() -> None:
for test_string in SERP_TEST_STRINGS: for test_string in SERP_TEST_STRINGS:
assert test_string in text_content assert test_string in text_content
## Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
text_content = result.text_content.replace("\\", "")
for test_string in CSV_CP932_TEST_STRINGS:
assert test_string in text_content
@pytest.mark.skipif( @pytest.mark.skipif(
skip_exiftool, skip_exiftool,