12 Commits

Author SHA1 Message Date
afourney
2ffe6ea591 Bump version. (#1150) 2025-03-22 11:21:32 -07:00
afourney
efc55b260d Bump version and resolve a console encoding error. (#1149) 2025-03-21 09:27:25 -07:00
Yuzhong Zhang
52432bd228 Add support for preserving base64 encoded images (#1140)
* Optionally preserve base64 strings in markdown via _CustomMarkdownify and the pptx converter
* Add the parameter to the other converters
* Fix linter errors
* Use **kwargs to pass the keep_data_uris parameter
* Add module and CLI test vectors
* Fixed formatting, and adjusted tests.
2025-03-20 18:50:23 -07:00
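In practice the new option looks like this; a minimal sketch assuming the `keep_data_uris` flag and API shown in the diffs below (the input file name is hypothetical):

```python
from markitdown import MarkItDown

# Keep base64-encoded images inline rather than truncating them to
# "data:image/png;base64...". "slides.pptx" is a hypothetical input file.
md = MarkItDown()
result = md.convert("slides.pptx", keep_data_uris=True)
print(result.markdown)

# CLI equivalent (flag added in this change set):
#   python -m markitdown --keep-data-uris slides.pptx
```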
afourney
c0a511ecff Updated docx file to include an image. (#1146) 2025-03-20 12:25:56 -07:00
afourney
cd6aa41361 Adjust warning filters and update dependencies (#1143)
Adjusts warning filters to be more contextual
Updates dependencies for magika and youtube-transcript-api
Updates the version to 0.1.0a5 in __about__.py
2025-03-19 22:09:14 -07:00
afourney
716f74dcb9 Consider anything with a charset as plain text-convertible. (#1142) 2025-03-19 20:46:35 -07:00
afourney
a93e0567e6 EPub Support. Adapted #123 to not use epublib. (#1131)
* Adapted #123 to not use epublib.
* Updated README.md
2025-03-17 07:48:15 -07:00
afourney
c5f70b904f Have magika read from the stream. (#1136) 2025-03-17 07:39:19 -07:00
afourney
53834fdd24 Investigate and silence warnings. (#1133) 2025-03-15 23:41:35 -07:00
afourney
5c565b7d79 Fix remaining mypy errors. (#1132) 2025-03-15 23:12:48 -07:00
afourney
a78857bd43 Added epub test file. (#1130) 2025-03-15 18:34:51 -07:00
afourney
09df7fe8df Small fixes for autogen integration. (#1124) 2025-03-12 19:18:11 -07:00
24 changed files with 490 additions and 93 deletions

View File

@@ -6,7 +6,8 @@
> [!IMPORTANT]
> Breaking changes between 0.0.1 to 0.1.0:
- > * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]~=0.1.0a1'` to have backward-compatible behavior.
+ > * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior.
> * convert\_stream() now requires a binary file-like object (e.g., a file opened in binary mode, or an io.BytesIO object). This is a breaking change from the previous version, where it previously also accepted text file-like objects, like io.StringIO.
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
@@ -14,7 +15,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
At present, MarkItDown supports:
- PDF
- - PowerPoint (reading in top-to-bottom, left-to-right order)
+ - PowerPoint
- Word
- Excel
- Images (EXIF metadata and OCR)
@@ -23,6 +24,7 @@ At present, MarkItDown supports:
- Text-based formats (CSV, JSON, XML)
- ZIP files (iterates over contents)
- YouTube URLs
+ - EPubs
- ... and more!
## Why Markdown?
@@ -36,7 +38,7 @@ are also highly token-efficient.
## Installation
- To install MarkItDown, use pip: `pip install 'markitdown[all]~=0.1.0a1'`. Alternatively, you can install it from the source:
+ To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
```bash
git clone git@github.com:microsoft/markitdown.git

View File

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4",
"requests",
"markdownify",
"magika>=0.6.1rc2",
"magika~=0.6.1",
"charset-normalizer",
]
@@ -42,7 +42,7 @@ all = [
"olefile",
"pydub",
"SpeechRecognition",
"youtube-transcript-api",
"youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence",
"azure-identity"
]
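Both new pins use the `~=` compatible-release operator. A quick illustration of its semantics using the third-party `packaging` library (an assumption for demonstration; it is not a markitdown dependency):

```python
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=0.6.1")  # equivalent to >=0.6.1, ==0.6.*
print("0.6.9" in spec)  # True: patch releases are accepted
print("0.7.0" in spec)  # False: the next minor release is excluded
```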

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.0a2"
__version__ = "0.1.0"

View File

@@ -4,6 +4,7 @@
import argparse
import sys
import codecs
+ import locale
from textwrap import dedent
from importlib.metadata import entry_points
from .__about__ import __version__
@@ -104,6 +105,12 @@ def main():
help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
)
+ parser.add_argument(
+ "--keep-data-uris",
+ action="store_true",
+ help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
+ )
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
@@ -139,7 +146,7 @@ def main():
else:
charset_hint = None
- stream_info: str | None = None
+ stream_info = None
if (
extension_hint is not None
or mime_type_hint is not None
@@ -181,9 +188,15 @@ def main():
markitdown = MarkItDown(enable_plugins=args.use_plugins)
if args.filename is None:
- result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
+ result = markitdown.convert_stream(
+ sys.stdin.buffer,
+ stream_info=stream_info,
+ keep_data_uris=args.keep_data_uris,
+ )
else:
- result = markitdown.convert(args.filename, stream_info=stream_info)
+ result = markitdown.convert(
+ args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
+ )
_handle_output(args, result)
@@ -192,9 +205,14 @@ def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
- f.write(result.text_content)
+ f.write(result.markdown)
else:
- print(result.text_content)
+ # Handle stdout encoding errors more gracefully
+ print(
+ result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
+ sys.stdout.encoding
+ )
+ )
def _exit_with_error(message: str):
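The encode/decode round-trip above is what resolves the console encoding error from #1149: characters the console cannot represent are replaced instead of raising `UnicodeEncodeError`. A standalone sketch of the same idea:

```python
import sys

def safe_print(text: str) -> None:
    # Characters unsupported by the console encoding (e.g., cp1252 on
    # some Windows terminals) become "?" instead of crashing the CLI.
    encoding = sys.stdout.encoding or "utf-8"
    print(text.encode(encoding, errors="replace").decode(encoding))

safe_print("Résumé ☃")
```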

View File

@@ -38,6 +38,7 @@ from .converters import (
AudioConverter,
OutlookMsgConverter,
ZipConverter,
+ EpubConverter,
DocumentIntelligenceConverter,
)
@@ -191,6 +192,7 @@ class MarkItDown:
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
+ self.register_converter(EpubConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
@@ -610,14 +612,16 @@ class MarkItDown:
# Call magika to guess from the stream
cur_pos = file_stream.tell()
try:
- stream_bytes = file_stream.read()
- result = self._magika.identify_bytes(stream_bytes)
+ result = self._magika.identify_stream(file_stream)
if result.status == "ok" and result.prediction.output.label != "unknown":
# If it's text, also guess the charset
charset = None
if result.prediction.output.is_text:
- charset_result = charset_normalizer.from_bytes(stream_bytes).best()
+ # Read the first 4k to guess the charset
+ file_stream.seek(cur_pos)
+ stream_page = file_stream.read(4096)
+ charset_result = charset_normalizer.from_bytes(stream_page).best()
if charset_result is not None:
charset = self._normalize_charset(charset_result.encoding)
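The change swaps `identify_bytes` on a full read for `identify_stream`, and limits charset detection to a 4 KiB prefix. A self-contained sketch of the prefix-sniffing pattern (the sample data is made up):

```python
import io
import charset_normalizer

stream = io.BytesIO("héllo wörld\n".encode("utf-8") * 500)
cur_pos = stream.tell()
sample = stream.read(4096)  # sniff only the first 4 KiB
stream.seek(cur_pos)        # rewind so converters see the full stream
best = charset_normalizer.from_bytes(sample).best()
print(best.encoding if best else None)  # e.g., "utf_8"
```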

View File

@@ -18,6 +18,7 @@ from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter
+ from ._epub_converter import EpubConverter
__all__ = [
"PlainTextConverter",
@@ -37,4 +38,5 @@ __all__ = [
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",
"EpubConverter",
]

View File

@@ -1,6 +1,7 @@
import io
import re
import base64
import binascii
from urllib.parse import parse_qs, urlparse
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
@@ -60,6 +61,8 @@ class BingSerpConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
+ assert stream_info.url is not None
# Parse the query parameters
parsed_params = parse_qs(urlparse(stream_info.url).query)
query = parsed_params.get("q", [""])[0]
@@ -76,9 +79,12 @@ class BingSerpConverter(DocumentConverter):
slug.extract()
# Parse the algorithmic results
- _markdownify = _CustomMarkdownify()
+ _markdownify = _CustomMarkdownify(**kwargs)
results = list()
for result in soup.find_all(class_="b_algo"):
+ if not hasattr(result, "find_all"):
+ continue
# Rewrite redirect urls
for a in result.find_all("a", href=True):
parsed_href = urlparse(a["href"])

View File

@@ -73,5 +73,5 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None)
return self._html_converter.convert_string(
- mammoth.convert_to_html(file_stream, style_map=style_map).value
+ mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
)

View File

@@ -0,0 +1,147 @@
import os
import zipfile
import xml.dom.minidom as minidom
from typing import BinaryIO, Any, Dict, List
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/epub",
"application/epub+zip",
"application/x-epub+zip",
]
ACCEPTED_FILE_EXTENSIONS = [".epub"]
MIME_TYPE_MAPPING = {
".html": "text/html",
".xhtml": "application/xhtml+xml",
}
class EpubConverter(HtmlConverter):
"""
Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
with zipfile.ZipFile(file_stream, "r") as z:
# Extracts metadata (title, authors, language, publisher, date, description, cover) from an EPUB file.
# Locate content.opf
container_dom = minidom.parse(z.open("META-INF/container.xml"))
opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
"full-path"
)
# Parse content.opf
opf_dom = minidom.parse(z.open(opf_path))
metadata: Dict[str, Any] = {
"title": self._get_text_from_node(opf_dom, "dc:title"),
"authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
"language": self._get_text_from_node(opf_dom, "dc:language"),
"publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
"date": self._get_text_from_node(opf_dom, "dc:date"),
"description": self._get_text_from_node(opf_dom, "dc:description"),
"identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
}
# Extract manifest items (ID → href mapping)
manifest = {
item.getAttribute("id"): item.getAttribute("href")
for item in opf_dom.getElementsByTagName("item")
}
# Extract spine order (ID refs)
spine_items = opf_dom.getElementsByTagName("itemref")
spine_order = [item.getAttribute("idref") for item in spine_items]
# Convert spine order to actual file paths
base_path = "/".join(
opf_path.split("/")[:-1]
) # Get base directory of content.opf
spine = [
f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
for item_id in spine_order
if item_id in manifest
]
# Extract and convert the content
markdown_content: List[str] = []
for file in spine:
if file in z.namelist():
with z.open(file) as f:
filename = os.path.basename(file)
extension = os.path.splitext(filename)[1].lower()
mimetype = MIME_TYPE_MAPPING.get(extension)
converted_content = self._html_converter.convert(
f,
StreamInfo(
mimetype=mimetype,
extension=extension,
filename=filename,
),
)
markdown_content.append(converted_content.markdown.strip())
# Format and add the metadata
metadata_markdown = []
for key, value in metadata.items():
if isinstance(value, list):
value = ", ".join(value)
if value:
metadata_markdown.append(f"**{key.capitalize()}:** {value}")
markdown_content.insert(0, "\n".join(metadata_markdown))
return DocumentConverterResult(
markdown="\n\n".join(markdown_content), title=metadata["title"]
)
def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
if len(texts) > 0:
return texts[0]
else:
return None
def _get_all_texts_from_nodes(
self, dom: minidom.Document, tag_name: str
) -> List[str]:
"""Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
texts: List[str] = []
for node in dom.getElementsByTagName(tag_name):
if node.firstChild and hasattr(node.firstChild, "nodeValue"):
texts.append(node.firstChild.nodeValue.strip())
return texts
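With the converter registered (see the `_markitdown.py` hunk above), EPUB conversion goes through the normal entry points; a minimal usage sketch with a hypothetical file name:

```python
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("book.epub")  # "book.epub" is a hypothetical input
print(result.title)               # taken from the dc:title metadata
print(result.markdown[:500])      # metadata block, then spine content
```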

View File

@@ -56,9 +56,9 @@ class HtmlConverter(DocumentConverter):
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
- webpage_text = _CustomMarkdownify().convert_soup(body_elm)
+ webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
else:
- webpage_text = _CustomMarkdownify().convert_soup(soup)
+ webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
assert isinstance(webpage_text, str)

View File

@@ -17,6 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX)
options["keep_data_uris"] = options.get("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
@@ -101,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return alt
# Remove dataURIs
if src.startswith("data:"):
if src.startswith("data:") and not self.options["keep_data_uris"]:
src = src.split(",")[0] + "..."
return "![%s](%s%s)" % (alt, src, title_part)

View File

@@ -9,7 +9,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
_dependency_exc_info = None
olefile = None
try:
- import olefile
+ import olefile # type: ignore[no-redef]
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
@@ -56,6 +56,7 @@ class OutlookMsgConverter(DocumentConverter):
# Brute force, check if it's an Outlook file
try:
+ if olefile is not None:
msg = olefile.OleFileIO(file_stream)
toc = "\n".join([str(stream) for stream in msg.listdir()])
return (
@@ -89,7 +90,11 @@ class OutlookMsgConverter(DocumentConverter):
_dependency_exc_info[2]
)
+ assert (
+ olefile is not None
+ ) # If we made it this far, olefile should be available
msg = olefile.OleFileIO(file_stream)
# Extract email metadata
md_content = "# Email Message\n\n"
@@ -121,6 +126,7 @@ class OutlookMsgConverter(DocumentConverter):
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
"""Helper to safely extract and decode stream data from the MSG file."""
+ assert olefile is not None
assert isinstance(
msg, olefile.OleFileIO
) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
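The assertions above lean on the deferred-dependency pattern this converter already uses: the `ImportError` is captured at module import and re-raised only when a conversion is actually attempted. A compact sketch of the pattern (names match the diff; the surrounding converter class is elided):

```python
import sys

_dependency_exc_info = None
try:
    import olefile  # optional dependency
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

def _require_olefile() -> None:
    # Called from convert(); surfaces the original ImportError with
    # its original traceback only when the converter is actually used.
    if _dependency_exc_info is not None:
        raise _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
```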

View File

@@ -17,12 +17,16 @@ except ImportError:
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/",
"application/json",
"application/markdown",
]
+ # Mimetypes to ignore (commonly confused extensions)
+ IGNORE_MIME_TYPE_PREFIXES = [
+ "text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
+ "text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
+ ]
ACCEPTED_FILE_EXTENSIONS = [
".txt",
".text",
".md",
".markdown",
".json",
".jsonl",
]
@@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
+ for prefix in IGNORE_MIME_TYPE_PREFIXES:
+ if mimetype.startswith(prefix):
+ return False
+ # If we have a charset, we can safely assume it's text
+ # With Magika in the earlier stages, this handles most cases
+ if stream_info.charset is not None:
+ return True
# Otherwise, check the mimetype and extension
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):

View File

@@ -140,13 +140,20 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip()
+ # If keep_data_uris is True, use base64 encoding for images
+ if kwargs.get("keep_data_uris", False):
+ blob = shape.image.blob
+ content_type = shape.image.content_type or "image/png"
+ b64_string = base64.b64encode(blob).decode("utf-8")
+ md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
+ else:
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
# Tables
if self._is_table(shape):
- md_content += self._convert_table_to_markdown(shape.table)
+ md_content += self._convert_table_to_markdown(shape.table, **kwargs)
# Charts
if shape.has_chart:
@@ -193,7 +200,7 @@ class PptxConverter(DocumentConverter):
return True
return False
- def _convert_table_to_markdown(self, table):
+ def _convert_table_to_markdown(self, table, **kwargs):
# Write the table as HTML, then convert it to Markdown
html_table = "<html><body><table>"
first_row = True
@@ -208,7 +215,10 @@ class PptxConverter(DocumentConverter):
first_row = False
html_table += "</table></body></html>"
- return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
+ return (
+ self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+ + "\n"
+ )
def _convert_chart_to_markdown(self, chart):
try:

View File

@@ -28,6 +28,10 @@ CANDIDATE_FILE_EXTENSIONS = [
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
+ def __init__(self):
+ super().__init__()
+ self._kwargs = {}
def accepts(
self,
file_stream: BinaryIO,
@@ -66,7 +70,7 @@ class RssConverter(DocumentConverter):
file_stream.seek(cur_pos)
return False
- def _feed_type(self, doc: Any) -> str:
+ def _feed_type(self, doc: Any) -> str | None:
if doc.getElementsByTagName("rss"):
return "rss"
elif doc.getElementsByTagName("feed"):
@@ -82,6 +86,7 @@ class RssConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
+ self._kwargs = kwargs
doc = minidom.parse(file_stream)
feed_type = self._feed_type(doc)
@@ -130,10 +135,10 @@ class RssConverter(DocumentConverter):
Returns None if the feed type is not recognized or something goes wrong.
"""
root = doc.getElementsByTagName("rss")[0]
- channel = root.getElementsByTagName("channel")
- if not channel:
- return None
- channel = channel[0]
+ channel_list = root.getElementsByTagName("channel")
+ if not channel_list:
+ raise ValueError("No channel found in RSS feed")
+ channel = channel_list[0]
channel_title = self._get_data_by_tag_name(channel, "title")
channel_description = self._get_data_by_tag_name(channel, "description")
items = channel.getElementsByTagName("item")
@@ -141,8 +146,6 @@ class RssConverter(DocumentConverter):
md_text = f"# {channel_title}\n"
if channel_description:
md_text += f"{channel_description}\n"
- if not items:
- items = []
for item in items:
title = self._get_data_by_tag_name(item, "title")
description = self._get_data_by_tag_name(item, "description")
@@ -168,7 +171,7 @@ class RssConverter(DocumentConverter):
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
- return _CustomMarkdownify().convert_soup(soup)
+ return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
except BaseException as _:
return content
@@ -183,5 +186,6 @@ class RssConverter(DocumentConverter):
return None
fc = nodes[0].firstChild
if fc:
if hasattr(fc, "data"):
return fc.data
return None

View File

@@ -7,6 +7,12 @@ from .._exceptions import MissingDependencyException
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
+ # Suppress some warnings on library import
+ import warnings
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
+ warnings.filterwarnings("ignore", category=SyntaxWarning)
import speech_recognition as sr
import pydub
except ImportError:

View File

@@ -1,7 +1,7 @@
import io
import re
+ import bs4
from typing import Any, BinaryIO, Optional
- from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
@@ -57,7 +57,7 @@ class WikipediaConverter(DocumentConverter):
) -> DocumentConverterResult:
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
- soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+ soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Remove javascript and style blocks
for script in soup(["script", "style"]):
@@ -72,16 +72,15 @@ class WikipediaConverter(DocumentConverter):
if body_elm:
# What's the title
- if title_elm and len(title_elm) > 0:
- main_title = title_elm.string # type: ignore
- assert isinstance(main_title, str)
+ if title_elm and isinstance(title_elm, bs4.Tag):
+ main_title = title_elm.string
# Convert the page
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
body_elm
)
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
**kwargs
).convert_soup(body_elm)
else:
- webpage_text = _CustomMarkdownify().convert_soup(soup)
+ webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
return DocumentConverterResult(
markdown=webpage_text,

View File

@@ -86,7 +86,9 @@ class XlsxConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
- self._html_converter.convert_string(html_content).markdown.strip()
+ self._html_converter.convert_string(
+ html_content, **kwargs
+ ).markdown.strip()
+ "\n\n"
)
@@ -146,7 +148,9 @@ class XlsConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
- self._html_converter.convert_string(html_content).markdown.strip()
+ self._html_converter.convert_string(
+ html_content, **kwargs
+ ).markdown.strip()
+ "\n\n"
)

View File

@@ -3,16 +3,21 @@ import json
import time
import io
import re
+ import bs4
from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote
- from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
# Optional YouTube transcription support
try:
+ # Suppress some warnings on library import
+ import warnings
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", category=SyntaxWarning)
+ # Patch submitted upstream to fix the SyntaxWarning
from youtube_transcript_api import YouTubeTranscriptApi
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
@@ -72,21 +77,31 @@ class YouTubeConverter(DocumentConverter):
) -> DocumentConverterResult:
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
- soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+ soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Read the meta tags
metadata: Dict[str, str] = {"title": soup.title.string}
metadata: Dict[str, str] = {}
if soup.title and soup.title.string:
metadata["title"] = soup.title.string
for meta in soup(["meta"]):
+ if not isinstance(meta, bs4.Tag):
+ continue
for a in meta.attrs:
if a in ["itemprop", "property", "name"]:
content = meta.get("content", "")
if content: # Only add non-empty content
metadata[meta[a]] = content
key = str(meta.get(a, ""))
content = str(meta.get("content", ""))
if key and content: # Only add non-empty content
metadata[key] = content
break
# Try reading the description
try:
for script in soup(["script"]):
+ if not isinstance(script, bs4.Tag):
+ continue
+ if not script.string: # Skip empty scripts
+ continue
content = script.string
@@ -132,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
+ ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
@@ -143,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
)
# Retry the transcript fetching operation
transcript = self._retry_operation(
- lambda: YouTubeTranscriptApi.get_transcript(
+ lambda: ytt_api.fetch(
video_id, languages=youtube_transcript_languages
),
retries=3, # Retry 3 times
@@ -151,17 +167,14 @@ class YouTubeConverter(DocumentConverter):
)
if transcript:
transcript_text = " ".join(
[part["text"] for part in transcript]
[part.text for part in transcript]
) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception as e:
print(f"Error fetching transcript: {e}")
if transcript_text:
webpage_text += f"\n### Transcript\n{transcript_text}\n"
- title = title if title else soup.title.string
+ title = title if title else (soup.title.string if soup.title else "")
assert isinstance(title, str)
return DocumentConverterResult(
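The transcript change tracks the youtube-transcript-api ~=1.0.0 interface: `fetch()` is now an instance method and returns snippet objects with a `.text` attribute instead of dicts, exactly as the diff above migrates. A minimal sketch (the video ID is an arbitrary example):

```python
from youtube_transcript_api import YouTubeTranscriptApi

ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch("dQw4w9WgXcQ", languages=["en"])  # example video ID
transcript_text = " ".join(part.text for part in transcript)
print(transcript_text[:200])
```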

View File

@@ -25,8 +25,11 @@ GENERAL_TEST_VECTORS = [
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"data:image/png;base64...",
],
must_not_include=[
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
must_not_include=[],
),
FileTestVector(
filename="test.xlsx",
@@ -65,8 +68,9 @@ GENERAL_TEST_VECTORS = [
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
"![This phrase of the caption is Human-written.](Picture4.jpg)",
],
must_not_include=[],
must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
),
FileTestVector(
filename="test_outlook_msg.msg",
@@ -211,4 +215,64 @@ GENERAL_TEST_VECTORS = [
],
must_not_include=[],
),
FileTestVector(
filename="test.epub",
mimetype="application/epub+zip",
charset=None,
url=None,
must_include=[
"**Authors:** Test Author",
"A test EPUB document for MarkItDown testing",
"# Chapter 1: Test Content",
"This is a **test** paragraph with some formatting",
"* A bullet point",
"* Another point",
"# Chapter 2: More Content",
"*different* style",
"> This is a blockquote for testing",
],
must_not_include=[],
),
]
DATA_URI_TEST_VECTORS = [
FileTestVector(
filename="test.docx",
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
charset=None,
url=None,
must_include=[
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
must_not_include=[
"data:image/png;base64...",
],
),
FileTestVector(
filename="test.pptx",
mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
charset=None,
url=None,
must_include=[
"2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
"![This phrase of the caption is Human-written.]", # image caption
"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
],
must_not_include=[
"![This phrase of the caption is Human-written.](Picture4.jpg)",
],
),
]

View File

@@ -7,9 +7,17 @@ import locale
from typing import List
if __name__ == "__main__":
- from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
+ from _test_vectors import (
+ GENERAL_TEST_VECTORS,
+ DATA_URI_TEST_VECTORS,
+ FileTestVector,
+ )
else:
- from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector
+ from ._test_vectors import (
+ GENERAL_TEST_VECTORS,
+ DATA_URI_TEST_VECTORS,
+ FileTestVector,
+ )
from markitdown import (
MarkItDown,
@@ -114,7 +122,9 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
)
stdout = result.stdout.decode(locale.getpreferredencoding())
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
assert (
result.returncode == 0
), f"CLI exited with error: {result.stderr.decode('utf-8')}"
for test_string in test_vector.must_include:
assert test_string in stdout
for test_string in test_vector.must_not_include:
@@ -147,6 +157,39 @@ def test_convert_url(shared_tmp_dir, test_vector):
assert test_string not in stdout
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
"""Test CLI functionality when keep_data_uris is enabled"""
output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
result = subprocess.run(
[
"python",
"-m",
"markitdown",
"--keep-data-uris",
"-o",
output_file,
os.path.join(TEST_FILES_DIR, test_vector.filename),
],
capture_output=True,
text=True,
)
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
assert os.path.exists(output_file), f"Output file not created: {output_file}"
with open(output_file, "r") as f:
output_data = f.read()
for test_string in test_vector.must_include:
assert test_string in output_data
for test_string in test_vector.must_not_include:
assert test_string not in output_data
os.remove(output_file)
assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
if __name__ == "__main__":
import sys
import tempfile
@@ -154,6 +197,7 @@ if __name__ == "__main__":
"""Runs this file's tests from the command line."""
with tempfile.TemporaryDirectory() as tmp_dir:
# General tests
for test_function in [
test_output_to_stdout,
test_output_to_file,
@@ -167,4 +211,17 @@ if __name__ == "__main__":
)
test_function(tmp_dir, test_vector)
print("OK")
# Data URI tests
for test_function in [
test_output_to_file_with_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...",
end="",
)
test_function(tmp_dir, test_vector)
print("OK")
print("All tests passed!")

packages/markitdown/tests/test_files/test.docx vendored, Normal file → Executable file (binary file not shown)

(Another binary file changed; not shown.)

View File

@@ -6,9 +6,9 @@ import codecs
if __name__ == "__main__":
- from _test_vectors import GENERAL_TEST_VECTORS
+ from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
else:
- from ._test_vectors import GENERAL_TEST_VECTORS
+ from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
from markitdown import (
MarkItDown,
@@ -47,7 +47,6 @@ def test_guess_stream_info(test_vector):
# mimetype or extension, so we'll special-case them here.
if test_vector.filename in [
"test_outlook_msg.msg",
"test_mskanji.csv", # See: https://github.com/google/magika/issues/983
]:
return
@@ -96,15 +95,6 @@ def test_convert_stream_without_hints(test_vector):
"""Test the conversion of a stream with no stream info."""
markitdown = MarkItDown()
- # For some limited exceptions, we can't guarantee the exact
- # mimetype or extension, so we'll special-case them here.
- if test_vector.filename in [
- # This appears to be a subtle bug in magika.
- # See: https://github.com/google/magika/issues/983
- "test_mskanji.csv",
- ]:
- return
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(stream, url=test_vector.url)
for string in test_vector.must_include:
@@ -134,10 +124,52 @@ def test_convert_url(test_vector):
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_with_data_uris(test_vector):
"""Test API functionality when keep_data_uris is enabled"""
markitdown = MarkItDown()
# Test local file conversion
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, test_vector.filename),
keep_data_uris=True,
url=test_vector.url,
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_stream_with_data_uris(test_vector):
"""Test the conversion of a stream with no stream info."""
markitdown = MarkItDown()
stream_info = StreamInfo(
extension=os.path.splitext(test_vector.filename)[1],
mimetype=test_vector.mimetype,
charset=test_vector.charset,
)
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(
stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
if __name__ == "__main__":
import sys
"""Runs this file's tests from the command line."""
# General tests
for test_function in [
test_guess_stream_info,
test_convert_local,
@@ -151,4 +183,17 @@ if __name__ == "__main__":
)
test_function(test_vector)
print("OK")
# Data URI tests
for test_function in [
test_convert_with_data_uris,
test_convert_stream_with_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
)
test_function(test_vector)
print("OK")
print("All tests passed!")