Merge branch 'main' into main

This commit is contained in:
gagb
2024-12-16 17:24:47 -08:00
committed by GitHub
11 changed files with 296 additions and 30 deletions

1
.dockerignore Normal file
View File

@@ -0,0 +1 @@
*

1
.gitattributes vendored Normal file
View File

@@ -0,0 +1 @@
tests/test_files/** linguist-vendored

16
Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
FROM python:3.13-alpine
# Package installation below is done as root; privileges are dropped afterwards.
USER root
# Runtime dependency
RUN apk add --no-cache ffmpeg
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir markitdown
# Default USERID and GROUPID (override with --build-arg USERID=... / GROUPID=...)
ARG USERID=10000
ARG GROUPID=10000
# Run the tool as the unprivileged user/group selected above.
USER $USERID:$GROUPID
ENTRYPOINT [ "markitdown" ]

View File

@@ -1,5 +1,7 @@
# MarkItDown # MarkItDown
[![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/)
The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.) The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.)
It presently supports: It presently supports:
@@ -12,6 +14,7 @@ It presently supports:
- Audio (EXIF metadata, and speech transcription) - Audio (EXIF metadata, and speech transcription)
- HTML (special handling of Wikipedia, etc.) - HTML (special handling of Wikipedia, etc.)
- Various other text-based formats (csv, json, xml, etc.) - Various other text-based formats (csv, json, xml, etc.)
- ZIP (Iterates over contents and converts each file)
# Installation # Installation
@@ -27,7 +30,6 @@ or from the source
pip install -e . pip install -e .
``` ```
# Usage # Usage
The API is simple: The API is simple:
@@ -39,18 +41,44 @@ result = markitdown.convert("test.xlsx")
print(result.text_content) print(result.text_content)
``` ```
You can also configure markitdown to use Large Language Models to describe images. To do so you must provide mlm_client and mlm_model parameters to MarkItDown object, according to your specific client. To use this as a command-line utility, install it and then run it like this:
```bash
markitdown path-to-file.pdf
```
This will output Markdown to standard output. You can save it like this:
```bash
markitdown path-to-file.pdf > document.md
```
You can pipe content to standard input by omitting the argument:
```bash
cat path-to-file.pdf | markitdown
```
You can also configure markitdown to use Large Language Models to describe images. To do so, you must provide the `llm_client` and `llm_model` parameters to the MarkItDown object, according to your specific client.
```python ```python
from markitdown import MarkItDown from markitdown import MarkItDown
from openai import OpenAI from openai import OpenAI
client = OpenAI() client = OpenAI()
md = MarkItDown(mlm_client=client, mlm_model="gpt-4o") md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("example.jpg") result = md.convert("example.jpg")
print(result.text_content) print(result.text_content)
``` ```
You can also use the project as a Docker image:
```sh
docker build -t markitdown:latest .
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
```
## Contributing ## Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a This project welcomes contributions and suggestions. Most contributions require you to agree to a

View File

@@ -38,6 +38,7 @@ dependencies = [
"youtube-transcript-api", "youtube-transcript-api",
"SpeechRecognition", "SpeechRecognition",
"pathvalidate", "pathvalidate",
"charset-normalizer",
] ]
[project.urls] [project.urls]

View File

@@ -12,8 +12,10 @@ import subprocess
import sys import sys
import tempfile import tempfile
import traceback import traceback
import zipfile
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import catch_warnings
import mammoth import mammoth
import markdownify import markdownify
@@ -26,10 +28,17 @@ import pptx
import puremagic import puremagic
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from charset_normalizer import from_path
# Optional Transcription support # Optional Transcription support
try: try:
# Using warnings' catch_warnings to catch
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True IS_AUDIO_TRANSCRIPTION_CAPABLE = True
@@ -161,9 +170,7 @@ class PlainTextConverter(DocumentConverter):
elif "text/" not in content_type.lower(): elif "text/" not in content_type.lower():
return None return None
text_content = "" text_content = str(from_path(local_path).best())
with open(local_path, "rt", encoding="utf-8") as fh:
text_content = fh.read()
return DocumentConverterResult( return DocumentConverterResult(
title=None, title=None,
text_content=text_content, text_content=text_content,
@@ -344,8 +351,11 @@ class YouTubeConverter(DocumentConverter):
assert isinstance(params["v"][0], str) assert isinstance(params["v"][0], str)
video_id = str(params["v"][0]) video_id = str(params["v"][0])
try: try:
youtube_transcript_languages = kwargs.get(
"youtube_transcript_languages", ("en",)
)
# Must be a single transcript. # Must be a single transcript.
transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
# Alternative formatting: # Alternative formatting:
# formatter = TextFormatter() # formatter = TextFormatter()
@@ -492,7 +502,9 @@ class DocxConverter(HtmlConverter):
result = None result = None
with open(local_path, "rb") as docx_file: with open(local_path, "rb") as docx_file:
result = mammoth.convert_to_html(docx_file) style_map = kwargs.get("style_map", None)
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value html_content = result.value
result = self._convert(html_content) result = self._convert(html_content)
@@ -582,6 +594,10 @@ class PptxConverter(HtmlConverter):
"\n" + self._convert(html_table).text_content.strip() + "\n" "\n" + self._convert(html_table).text_content.strip() + "\n"
) )
# Charts
if shape.has_chart:
md_content += self._convert_chart_to_markdown(shape.chart)
# Text areas # Text areas
elif shape.has_text_frame: elif shape.has_text_frame:
if shape == title: if shape == title:
@@ -616,6 +632,29 @@ class PptxConverter(HtmlConverter):
return True return True
return False return False
def _convert_chart_to_markdown(self, chart):
    """Render a chart as a Markdown heading followed by a data table.

    The heading is "### Chart", extended with ": <title>" when the chart
    has a title. The table has a leading "Category" column, one column per
    series, and one row per category of the chart's first plot.

    Args:
        chart: Object exposing ``has_title``, ``chart_title``, ``plots`` and
            ``series`` (presumably a python-pptx chart -- confirm against
            the caller).

    Returns:
        The Markdown text. When the first plot's categories or the series
        cannot be read, the heading followed by "[unsupported chart]" is
        returned instead of raising, so the failure does not propagate to
        the caller.
    """

    def cell(value):
        # A literal pipe would otherwise be read as a column separator.
        return str(value).replace("|", "\\|")

    md = "\n\n### Chart"
    if chart.has_title:
        md += f": {chart.chart_title.text_frame.text}"
    md += "\n\n"

    try:
        category_names = [c.label for c in chart.plots[0].categories]
        series_list = list(chart.series)
    except (IndexError, ValueError):
        # IndexError: the chart has no plots at all.
        # ValueError: NOTE(review) python-pptx is expected to raise this for
        # plot types it does not support -- confirm against its docs.
        return md + "[unsupported chart]"

    rows = [["Category"] + [s.name for s in series_list]]
    # Read each series' values once rather than once per table row.
    series_values = [s.values for s in series_list]
    for idx, category in enumerate(category_names):
        row = [category]
        for values in series_values:
            # A series may hold fewer points than there are categories;
            # leave the cell empty instead of raising IndexError.
            row.append(values[idx] if idx < len(values) else "")
        rows.append(row)

    markdown_table = ["| " + " | ".join(map(cell, row)) + " |" for row in rows]
    header = markdown_table[0]
    separator = "|" + "|".join(["---"] * len(rows[0])) + "|"
    return md + "\n".join([header, separator] + markdown_table[1:])
class MediaConverter(DocumentConverter): class MediaConverter(DocumentConverter):
""" """
@@ -754,7 +793,7 @@ class Mp3Converter(WavConverter):
class ImageConverter(MediaConverter): class ImageConverter(MediaConverter):
""" """
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured). Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
""" """
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
@@ -784,17 +823,17 @@ class ImageConverter(MediaConverter):
md_content += f"{f}: {metadata[f]}\n" md_content += f"{f}: {metadata[f]}\n"
# Try describing the image with GPTV # Try describing the image with GPTV
mlm_client = kwargs.get("mlm_client") llm_client = kwargs.get("llm_client")
mlm_model = kwargs.get("mlm_model") llm_model = kwargs.get("llm_model")
if mlm_client is not None and mlm_model is not None: if llm_client is not None and llm_model is not None:
md_content += ( md_content += (
"\n# Description:\n" "\n# Description:\n"
+ self._get_mlm_description( + self._get_llm_description(
local_path, local_path,
extension, extension,
mlm_client, llm_client,
mlm_model, llm_model,
prompt=kwargs.get("mlm_prompt"), prompt=kwargs.get("llm_prompt"),
).strip() ).strip()
+ "\n" + "\n"
) )
@@ -804,11 +843,11 @@ class ImageConverter(MediaConverter):
text_content=md_content, text_content=md_content,
) )
def _get_mlm_description(self, local_path, extension, client, model, prompt=None): def _get_llm_description(self, local_path, extension, client, model, prompt=None):
if prompt is None or prompt.strip() == "": if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image." prompt = "Write a detailed caption for this image."
sys.stderr.write(f"MLM Prompt:\n{prompt}\n") sys.stderr.write(f"llm Prompt:\n{prompt}\n")
data_uri = "" data_uri = ""
with open(local_path, "rb") as image_file: with open(local_path, "rb") as image_file:
@@ -837,6 +876,124 @@ class ImageConverter(MediaConverter):
return response.choices[0].message.content return response.choices[0].message.content
class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.

    The converter extracts the ZIP contents to a folder next to the archive,
    offers each extracted file to the converters passed in via the
    `_parent_converters` keyword argument (the first one that returns a
    result wins), and combines the results into a single markdown document.
    Unless `cleanup_extracted=False` is passed, the extracted folder is
    removed afterwards, whether or not processing succeeded.

    Example output format:
    ```markdown
    Content from the zip file `example.zip`:

    ## File: docs/readme.txt

    This is the content of readme.txt
    Multiple lines are preserved

    ## File: images/example.jpg

    ImageSize: 1920x1080
    DateTimeOriginal: 2024-02-15 14:30:00
    Description: A beautiful landscape photo

    ## File: data/report.xlsx

    ## Sheet1
    | Column1 | Column2 | Column3 |
    |---------|---------|---------|
    | data1   | data2   | data3   |
    | data4   | data5   | data6   |
    ```

    Key features:
    - Maintains original file structure in headings
    - Walks nested directories inside the archive (nested ZIP files are
      skipped, because this converter excludes itself when delegating)
    - Uses appropriate converters for each file type
    - Preserves formatting of converted content
    - Cleans up temporary files after processing
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the ZIP archive at `local_path` to markdown.

        Args:
            local_path: Path of the archive on disk.
            **kwargs: Conversion options. Read here: `file_extension`,
                `_parent_converters` and `cleanup_extracted` (default True).
                All options are forwarded to the per-file converters with
                `file_extension` replaced by each file's own extension.

        Returns:
            None when `file_extension` is not ".zip"; otherwise a
            DocumentConverterResult holding either the combined markdown or
            an "[ERROR] ..." message describing why processing failed.
        """
        # Bail if not a ZIP (tolerate an explicit file_extension=None)
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".zip":
            return None

        # Get parent converters list if available
        parent_converters = kwargs.get("_parent_converters", [])
        if not parent_converters:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
            )

        extracted_zip_folder_name = (
            f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
        )
        new_folder = os.path.normpath(
            os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
        )

        # Safety check for path traversal
        if not new_folder.startswith(os.path.dirname(local_path)):
            return DocumentConverterResult(
                title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
            )

        # NOTE(review): if a folder named like new_folder already exists, its
        # contents are processed and removed together with the extracted
        # files -- consider extracting into a fresh temporary directory.
        sections = [f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"]

        try:
            # Extract the zip file
            with zipfile.ZipFile(local_path, "r") as zipObj:
                zipObj.extractall(path=new_folder)

            # Process each extracted file
            for root, _dirs, files in os.walk(new_folder):
                for name in files:
                    file_path = os.path.join(root, name)
                    relative_path = os.path.relpath(file_path, new_folder)

                    # Update kwargs for the file
                    file_kwargs = kwargs.copy()
                    file_kwargs["file_extension"] = os.path.splitext(name)[1]
                    file_kwargs["_parent_converters"] = parent_converters

                    # Try converting the file using available converters
                    for converter in parent_converters:
                        # Skip the zip converter to avoid infinite recursion
                        if isinstance(converter, ZipConverter):
                            continue

                        result = converter.convert(file_path, **file_kwargs)
                        if result is not None:
                            sections.append(f"\n## File: {relative_path}\n\n")
                            sections.append(result.text_content + "\n\n")
                            break

            return DocumentConverterResult(
                title=None, text_content="".join(sections).strip()
            )

        except zipfile.BadZipFile:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except Exception as e:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )
        finally:
            # Clean up extracted files if specified. This runs on the error
            # paths too, so a failing nested converter no longer leaves the
            # extracted folder behind. Best-effort: a cleanup problem (e.g.
            # the folder was never created) must not mask the result above.
            if kwargs.get("cleanup_extracted", True):
                shutil.rmtree(new_folder, ignore_errors=True)
class FileConversionException(BaseException): class FileConversionException(BaseException):
pass pass
@@ -852,16 +1009,18 @@ class MarkItDown:
def __init__( def __init__(
self, self,
requests_session: Optional[requests.Session] = None, requests_session: Optional[requests.Session] = None,
mlm_client: Optional[Any] = None, llm_client: Optional[Any] = None,
mlm_model: Optional[Any] = None, llm_model: Optional[Any] = None,
style_map: Optional[str] = None,
): ):
if requests_session is None: if requests_session is None:
self._requests_session = requests.Session() self._requests_session = requests.Session()
else: else:
self._requests_session = requests_session self._requests_session = requests_session
self._mlm_client = mlm_client self._llm_client = llm_client
self._mlm_model = mlm_model self._llm_model = llm_model
self._style_map = style_map
self._page_converters: List[DocumentConverter] = [] self._page_converters: List[DocumentConverter] = []
@@ -880,6 +1039,7 @@ class MarkItDown:
self.register_page_converter(Mp3Converter()) self.register_page_converter(Mp3Converter())
self.register_page_converter(ImageConverter()) self.register_page_converter(ImageConverter())
self.register_page_converter(PdfConverter()) self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter())
def convert( def convert(
self, source: Union[str, requests.Response], **kwargs: Any self, source: Union[str, requests.Response], **kwargs: Any
@@ -1003,7 +1163,7 @@ class MarkItDown:
self._append_ext(extensions, g) self._append_ext(extensions, g)
# Convert # Convert
result = self._convert(temp_path, extensions, url=response.url) result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up # Clean up
finally: finally:
try: try:
@@ -1030,11 +1190,17 @@ class MarkItDown:
_kwargs.update({"file_extension": ext}) _kwargs.update({"file_extension": ext})
# Copy any additional global options # Copy any additional global options
if "mlm_client" not in _kwargs and self._mlm_client is not None: if "llm_client" not in _kwargs and self._llm_client is not None:
_kwargs["mlm_client"] = self._mlm_client _kwargs["llm_client"] = self._llm_client
if "mlm_model" not in _kwargs and self._mlm_model is not None: if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["mlm_model"] = self._mlm_model _kwargs["llm_model"] = self._llm_model
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
# If we hit an error log it and keep trying # If we hit an error log it and keep trying
try: try:
@@ -1071,7 +1237,6 @@ class MarkItDown:
if ext == "": if ext == "":
return return
# if ext not in extensions: # if ext not in extensions:
if True:
extensions.append(ext) extensions.append(ext)
def _guess_ext_magic(self, path): def _guess_ext_magic(self, path):

BIN
tests/test_files/test.pptx vendored Executable file → Normal file

Binary file not shown.

BIN
tests/test_files/test_files.zip vendored Normal file

Binary file not shown.

4
tests/test_files/test_mskanji.csv vendored Normal file
View File

@@ -0,0 +1,4 @@
<EFBFBD><EFBFBD><EFBFBD>O,<EFBFBD>N<EFBFBD><EFBFBD>,<EFBFBD>Z<EFBFBD><EFBFBD>
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Y,30,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD>O<EFBFBD>؉p<EFBFBD>q,25,<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>~,35,<EFBFBD><EFBFBD><EFBFBD>É<EFBFBD>
1 –¼‘O ”N—î �Z�Š
2 �²“¡‘¾˜Y 30 “Œ‹ž
3 ŽO–؉pŽq 25 ‘å�ã
4 îà‹´�~ 35 –¼ŒÃ‰®

BIN
tests/test_files/test_with_comment.docx vendored Executable file

Binary file not shown.

View File

@@ -51,12 +51,25 @@ DOCX_TEST_STRINGS = [
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
] ]
# Strings that must appear in the Markdown produced from
# test_with_comment.docx; the last two entries are the text of comments
# embedded in the document.
DOCX_COMMENT_TEST_STRINGS = [
    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
    "49e168b7-d2ae-407f-a055-2167576f39a1",
    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
    "# Abstract",
    "# Introduction",
    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    "This is a test comment. 12df-321a",
    "Yet another comment in the doc. 55yiyi-asd09",
]
PPTX_TEST_STRINGS = [ PPTX_TEST_STRINGS = [
"2cdda5c8-e50e-4db4-b5f0-9722a649f455", "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12", "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592", "1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
] ]
BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
@@ -87,6 +100,13 @@ SERP_TEST_EXCLUDES = [
"data:image/svg+xml,%3Csvg%20width%3D", "data:image/svg+xml,%3Csvg%20width%3D",
] ]
# Rows expected in the converted output of test_mskanji.csv, the fixture
# used for the non-UTF-8 encoding test.
CSV_CP932_TEST_STRINGS = [
    "名前,年齢,住所",
    "佐藤太郎,30,東京",
    "三木英子,25,大阪",
    "髙橋淳,35,名古屋",
]
@pytest.mark.skipif( @pytest.mark.skipif(
skip_remote, skip_remote,
@@ -130,6 +150,24 @@ def test_markitdown_local() -> None:
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
assert test_string in text_content assert test_string in text_content
# Test DOCX processing, with comments
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
style_map="comment-reference => ",
)
for test_string in DOCX_COMMENT_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
result = markitdown_with_style_map.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
)
for test_string in DOCX_COMMENT_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test PPTX processing # Test PPTX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
for test_string in PPTX_TEST_STRINGS: for test_string in PPTX_TEST_STRINGS:
@@ -144,6 +182,12 @@ def test_markitdown_local() -> None:
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
assert test_string in text_content assert test_string in text_content
# Test ZIP file processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
for test_string in DOCX_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test Wikipedia processing # Test Wikipedia processing
result = markitdown.convert( result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
@@ -164,6 +208,12 @@ def test_markitdown_local() -> None:
for test_string in SERP_TEST_STRINGS: for test_string in SERP_TEST_STRINGS:
assert test_string in text_content assert test_string in text_content
## Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
text_content = result.text_content.replace("\\", "")
for test_string in CSV_CP932_TEST_STRINGS:
assert test_string in text_content
@pytest.mark.skipif( @pytest.mark.skipif(
skip_exiftool, skip_exiftool,