1 Commit

Author: Adam Fourney
SHA1: de2c56ffbc
Message: Bumping version to 0.1.0a2
Date: 2025-03-12 11:42:00 -07:00
24 changed files with 91 additions and 487 deletions

View File

@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
At present, MarkItDown supports:
- PDF
- PowerPoint
- PowerPoint (reading in top-to-bottom, left-to-right order)
- Word
- Excel
- Images (EXIF metadata and OCR)
@@ -23,7 +23,6 @@ At present, MarkItDown supports:
- Text-based formats (CSV, JSON, XML)
- ZIP files (iterates over contents)
- YouTube URLs
- EPubs
- ... and more!
## Why Markdown?

View File

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4",
"requests",
"markdownify",
"magika~=0.6.1",
"magika>=0.6.0rc1",
"charset-normalizer",
]
@@ -42,7 +42,7 @@ all = [
"olefile",
"pydub",
"SpeechRecognition",
"youtube-transcript-api~=1.0.0",
"youtube-transcript-api",
"azure-ai-documentintelligence",
"azure-identity"
]
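Note the loosened pins here: `magika~=0.6.1` (a PEP 440 compatible-release pin) becomes `magika>=0.6.0rc1`, and the `youtube-transcript-api` pin is dropped entirely. A quick sketch of what the `~=` operator accepts, using the `packaging` library (the probe versions are illustrative):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet("~=0.6.1")          # equivalent to >=0.6.1, ==0.6.*
print(spec.contains(Version("0.6.2")))  # True: patch releases match
print(spec.contains(Version("0.7.0")))  # False: minor bumps do not
```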

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.0a6"
__version__ = "0.1.0a2"

View File

@@ -4,7 +4,6 @@
import argparse
import sys
import codecs
import locale
from textwrap import dedent
from importlib.metadata import entry_points
from .__about__ import __version__
@@ -105,12 +104,6 @@ def main():
help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
)
parser.add_argument(
"--keep-data-uris",
action="store_true",
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
@@ -146,7 +139,7 @@ def main():
else:
charset_hint = None
stream_info = None
stream_info: str | None = None
if (
extension_hint is not None
or mime_type_hint is not None
@@ -188,15 +181,9 @@ def main():
markitdown = MarkItDown(enable_plugins=args.use_plugins)
if args.filename is None:
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
)
result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
else:
result = markitdown.convert(
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
)
result = markitdown.convert(args.filename, stream_info=stream_info)
_handle_output(args, result)
@@ -205,14 +192,9 @@ def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.markdown)
f.write(result.text_content)
else:
# Handle stdout encoding errors more gracefully
print(
result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
sys.stdout.encoding
)
)
print(result.text_content)
def _exit_with_error(message: str):
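The removed `_handle_output` branch guarded against console encodings that can't represent every character in the Markdown. The pattern it used, as a standalone sketch (the helper name is hypothetical):

```python
import sys

def safe_print(text: str) -> None:
    # Characters the console encoding cannot represent become "?"
    # instead of raising UnicodeEncodeError.
    encoding = sys.stdout.encoding or "utf-8"
    print(text.encode(encoding, errors="replace").decode(encoding))
```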

View File

@@ -38,7 +38,6 @@ from .converters import (
AudioConverter,
OutlookMsgConverter,
ZipConverter,
EpubConverter,
DocumentIntelligenceConverter,
)
@@ -192,7 +191,6 @@ class MarkItDown:
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
@@ -612,16 +610,14 @@ class MarkItDown:
# Call magika to guess from the stream
cur_pos = file_stream.tell()
try:
result = self._magika.identify_stream(file_stream)
stream_bytes = file_stream.read()
result = self._magika.identify_bytes(stream_bytes)
if result.status == "ok" and result.prediction.output.label != "unknown":
# If it's text, also guess the charset
charset = None
if result.prediction.output.is_text:
# Read the first 4k to guess the charset
file_stream.seek(cur_pos)
stream_page = file_stream.read(4096)
charset_result = charset_normalizer.from_bytes(stream_page).best()
charset_result = charset_normalizer.from_bytes(stream_bytes).best()
if charset_result is not None:
charset = self._normalize_charset(charset_result.encoding)
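This hunk trades magika's stream API for a whole-buffer read, which also lets the charset guess reuse the same bytes instead of re-reading the first 4 KB. A condensed sketch of the resulting flow (the function name and return shape are assumptions):

```python
import charset_normalizer
from magika import Magika

def sniff_stream(stream) -> tuple[str, str | None]:
    pos = stream.tell()
    data = stream.read()
    stream.seek(pos)  # restore the caller's position
    result = Magika().identify_bytes(data)
    label = result.prediction.output.label
    charset = None
    if result.status == "ok" and result.prediction.output.is_text:
        # Guess the charset from the same buffer.
        best = charset_normalizer.from_bytes(data).best()
        if best is not None:
            charset = best.encoding
    return label, charset
```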

View File

@@ -18,7 +18,6 @@ from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter
from ._epub_converter import EpubConverter
__all__ = [
"PlainTextConverter",
@@ -38,5 +37,4 @@ __all__ = [
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",
"EpubConverter",
]

View File

@@ -1,7 +1,6 @@
import io
import re
import base64
import binascii
from urllib.parse import parse_qs, urlparse
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
@@ -61,8 +60,6 @@ class BingSerpConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
assert stream_info.url is not None
# Parse the query parameters
parsed_params = parse_qs(urlparse(stream_info.url).query)
query = parsed_params.get("q", [""])[0]
@@ -79,12 +76,9 @@ class BingSerpConverter(DocumentConverter):
slug.extract()
# Parse the algorithmic results
_markdownify = _CustomMarkdownify(**kwargs)
_markdownify = _CustomMarkdownify()
results = list()
for result in soup.find_all(class_="b_algo"):
if not hasattr(result, "find_all"):
continue
# Rewrite redirect urls
for a in result.find_all("a", href=True):
parsed_href = urlparse(a["href"])
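The query extraction above is plain standard-library URL parsing; for instance (the URL is illustrative):

```python
from urllib.parse import parse_qs, urlparse

url = "https://www.bing.com/search?q=markitdown"
query = parse_qs(urlparse(url).query).get("q", [""])[0]
print(query)  # -> markitdown
```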

View File

@@ -73,5 +73,5 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None)
return self._html_converter.convert_string(
mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
mammoth.convert_to_html(file_stream, style_map=style_map).value
)
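For context, the DOCX-to-HTML hop is delegated to mammoth, whose result is then run through the HTML converter. A minimal standalone sketch (the file name is hypothetical):

```python
import mammoth

with open("document.docx", "rb") as f:
    html = mammoth.convert_to_html(f).value  # HTML string, ready for Markdown conversion
```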

View File

@@ -1,147 +0,0 @@
import os
import zipfile
import xml.dom.minidom as minidom
from typing import BinaryIO, Any, Dict, List
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/epub",
"application/epub+zip",
"application/x-epub+zip",
]
ACCEPTED_FILE_EXTENSIONS = [".epub"]
MIME_TYPE_MAPPING = {
".html": "text/html",
".xhtml": "application/xhtml+xml",
}
class EpubConverter(HtmlConverter):
"""
Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
with zipfile.ZipFile(file_stream, "r") as z:
# Extract metadata (title, authors, language, publisher, date, description) from the EPUB file
# Locate content.opf
container_dom = minidom.parse(z.open("META-INF/container.xml"))
opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
"full-path"
)
# Parse content.opf
opf_dom = minidom.parse(z.open(opf_path))
metadata: Dict[str, Any] = {
"title": self._get_text_from_node(opf_dom, "dc:title"),
"authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
"language": self._get_text_from_node(opf_dom, "dc:language"),
"publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
"date": self._get_text_from_node(opf_dom, "dc:date"),
"description": self._get_text_from_node(opf_dom, "dc:description"),
"identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
}
# Extract manifest items (ID → href mapping)
manifest = {
item.getAttribute("id"): item.getAttribute("href")
for item in opf_dom.getElementsByTagName("item")
}
# Extract spine order (ID refs)
spine_items = opf_dom.getElementsByTagName("itemref")
spine_order = [item.getAttribute("idref") for item in spine_items]
# Convert spine order to actual file paths
base_path = "/".join(
opf_path.split("/")[:-1]
) # Get base directory of content.opf
spine = [
f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
for item_id in spine_order
if item_id in manifest
]
# Extract and convert the content
markdown_content: List[str] = []
for file in spine:
if file in z.namelist():
with z.open(file) as f:
filename = os.path.basename(file)
extension = os.path.splitext(filename)[1].lower()
mimetype = MIME_TYPE_MAPPING.get(extension)
converted_content = self._html_converter.convert(
f,
StreamInfo(
mimetype=mimetype,
extension=extension,
filename=filename,
),
)
markdown_content.append(converted_content.markdown.strip())
# Format and add the metadata
metadata_markdown = []
for key, value in metadata.items():
if isinstance(value, list):
value = ", ".join(value)
if value:
metadata_markdown.append(f"**{key.capitalize()}:** {value}")
markdown_content.insert(0, "\n".join(metadata_markdown))
return DocumentConverterResult(
markdown="\n\n".join(markdown_content), title=metadata["title"]
)
def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
if len(texts) > 0:
return texts[0]
else:
return None
def _get_all_texts_from_nodes(
self, dom: minidom.Document, tag_name: str
) -> List[str]:
"""Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
texts: List[str] = []
for node in dom.getElementsByTagName(tag_name):
if node.firstChild and hasattr(node.firstChild, "nodeValue"):
texts.append(node.firstChild.nodeValue.strip())
return texts
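The commit deletes this converter outright. For reference, the OPF-location step it performed is standard EPUB plumbing: every EPUB names its package document in `META-INF/container.xml`. A minimal sketch (the book file name is hypothetical):

```python
import zipfile
import xml.dom.minidom as minidom

with zipfile.ZipFile("book.epub") as z:
    container = minidom.parse(z.open("META-INF/container.xml"))
    opf_path = container.getElementsByTagName("rootfile")[0].getAttribute("full-path")
    print(opf_path)  # e.g. "OEBPS/content.opf"
```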

View File

@@ -56,9 +56,9 @@ class HtmlConverter(DocumentConverter):
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
webpage_text = _CustomMarkdownify().convert_soup(soup)
assert isinstance(webpage_text, str)

View File

@@ -17,7 +17,6 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX)
options["keep_data_uris"] = options.get("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
@@ -102,7 +101,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return alt
# Remove dataURIs
if src.startswith("data:") and not self.options["keep_data_uris"]:
if src.startswith("data:"):
src = src.split(",")[0] + "..."
return "![%s](%s%s)" % (alt, src, title_part)

View File

@@ -9,7 +9,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
_dependency_exc_info = None
olefile = None
try:
import olefile # type: ignore[no-redef]
import olefile
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
@@ -56,13 +56,12 @@ class OutlookMsgConverter(DocumentConverter):
# Brute force: check if it's an Outlook file
try:
if olefile is not None:
msg = olefile.OleFileIO(file_stream)
toc = "\n".join([str(stream) for stream in msg.listdir()])
return (
"__properties_version1.0" in toc
and "__recip_version1.0_#00000000" in toc
)
msg = olefile.OleFileIO(file_stream)
toc = "\n".join([str(stream) for stream in msg.listdir()])
return (
"__properties_version1.0" in toc
and "__recip_version1.0_#00000000" in toc
)
except Exception as e:
pass
finally:
@@ -90,11 +89,7 @@ class OutlookMsgConverter(DocumentConverter):
_dependency_exc_info[2]
)
assert (
olefile is not None
) # If we made it this far, olefile should be available
msg = olefile.OleFileIO(file_stream)
# Extract email metadata
md_content = "# Email Message\n\n"
@@ -126,7 +121,6 @@ class OutlookMsgConverter(DocumentConverter):
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
"""Helper to safely extract and decode stream data from the MSG file."""
assert olefile is not None
assert isinstance(
msg, olefile.OleFileIO
) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
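The brute-force acceptance test above boils down to inspecting the OLE stream listing for MAPI property streams. As a standalone sketch (the file name is hypothetical):

```python
import olefile

with open("message.msg", "rb") as f:
    ole = olefile.OleFileIO(f)
    toc = "\n".join(str(stream) for stream in ole.listdir())
    is_outlook_msg = "__properties_version1.0" in toc
```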

View File

@@ -17,16 +17,12 @@ except ImportError:
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/",
"application/json",
"application/markdown",
]
ACCEPTED_FILE_EXTENSIONS = [
".txt",
".text",
".md",
".markdown",
".json",
".jsonl",
# Mimetypes to ignore (commonly confused extensions)
IGNORE_MIME_TYPE_PREFIXES = [
"text/vnd.in3d.spot", # .spo, which is confused with xls, doc, etc.
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
]
@@ -42,14 +38,9 @@ class PlainTextConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
# If we have a charset, we can safely assume it's text
# With Magika in the earlier stages, this handles most cases
if stream_info.charset is not None:
return True
# Otherwise, check the mimetype and extension
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in IGNORE_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return False
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
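After this change the converter trusts a detected charset first, consults the ignore list, then falls back to the accepted prefixes. Condensed as a sketch (the function name is assumed, and the constants are abbreviated):

```python
def accepts_plain_text(mimetype: str, charset: str | None) -> bool:
    if charset is not None:  # a detected charset already implies text
        return True
    if any(mimetype.startswith(p) for p in ("text/vnd.in3d.spot", "text/vnd.graphviz")):
        return False         # commonly confused with xls, doc, etc.
    return any(mimetype.startswith(p) for p in ("text/", "application/json"))
```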

View File

@@ -140,20 +140,13 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip()
# If keep_data_uris is True, use base64 encoding for images
if kwargs.get("keep_data_uris", False):
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
else:
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
# Tables
if self._is_table(shape):
md_content += self._convert_table_to_markdown(shape.table, **kwargs)
md_content += self._convert_table_to_markdown(shape.table)
# Charts
if shape.has_chart:
@@ -200,7 +193,7 @@ class PptxConverter(DocumentConverter):
return True
return False
def _convert_table_to_markdown(self, table, **kwargs):
def _convert_table_to_markdown(self, table):
# Write the table as HTML, then convert it to Markdown
html_table = "<html><body><table>"
first_row = True
@@ -215,10 +208,7 @@ class PptxConverter(DocumentConverter):
first_row = False
html_table += "</table></body></html>"
return (
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+ "\n"
)
return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
def _convert_chart_to_markdown(self, chart):
try:
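The removed branch embedded the slide image inline as a data URI; the pattern it used (the blob bytes here are placeholders):

```python
import base64

blob = b"\x89PNG\r\n..."  # placeholder image bytes
content_type = "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md = f"![alt text](data:{content_type};base64,{b64_string})"
```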

View File

@@ -28,10 +28,6 @@ CANDIDATE_FILE_EXTENSIONS = [
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
def __init__(self):
super().__init__()
self._kwargs = {}
def accepts(
self,
file_stream: BinaryIO,
@@ -70,7 +66,7 @@ class RssConverter(DocumentConverter):
file_stream.seek(cur_pos)
return False
def _feed_type(self, doc: Any) -> str | None:
def _feed_type(self, doc: Any) -> str:
if doc.getElementsByTagName("rss"):
return "rss"
elif doc.getElementsByTagName("feed"):
@@ -86,7 +82,6 @@ class RssConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
self._kwargs = kwargs
doc = minidom.parse(file_stream)
feed_type = self._feed_type(doc)
@@ -135,10 +130,10 @@ class RssConverter(DocumentConverter):
Returns None if the feed type is not recognized or something goes wrong.
"""
root = doc.getElementsByTagName("rss")[0]
channel_list = root.getElementsByTagName("channel")
if not channel_list:
raise ValueError("No channel found in RSS feed")
channel = channel_list[0]
channel = root.getElementsByTagName("channel")
if not channel:
return None
channel = channel[0]
channel_title = self._get_data_by_tag_name(channel, "title")
channel_description = self._get_data_by_tag_name(channel, "description")
items = channel.getElementsByTagName("item")
@@ -146,6 +141,8 @@ class RssConverter(DocumentConverter):
md_text = f"# {channel_title}\n"
if channel_description:
md_text += f"{channel_description}\n"
if not items:
items = []
for item in items:
title = self._get_data_by_tag_name(item, "title")
description = self._get_data_by_tag_name(item, "description")
@@ -171,7 +168,7 @@ class RssConverter(DocumentConverter):
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
return _CustomMarkdownify().convert_soup(soup)
except BaseException as _:
return content
@@ -186,6 +183,5 @@ class RssConverter(DocumentConverter):
return None
fc = nodes[0].firstChild
if fc:
if hasattr(fc, "data"):
return fc.data
return fc.data
return None
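For context, the channel lookup above is plain `xml.dom.minidom`; on a toy feed:

```python
import xml.dom.minidom as minidom

xml_text = """<rss><channel>
  <title>Example Feed</title>
  <item><title>First post</title></item>
</channel></rss>"""
doc = minidom.parseString(xml_text)
channel = doc.getElementsByTagName("channel")[0]
title = channel.getElementsByTagName("title")[0].firstChild.data
print(title)  # -> Example Feed
```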

View File

@@ -7,14 +7,8 @@ from .._exceptions import MissingDependencyException
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
# Suppress some warnings on library import
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=SyntaxWarning)
import speech_recognition as sr
import pydub
import speech_recognition as sr
import pydub
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
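The deleted guard wrapped these imports to silence known-noisy warnings from the libraries; as a standalone pattern:

```python
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=SyntaxWarning)
    import speech_recognition as sr  # optional dependency
    import pydub                     # optional dependency
```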

View File

@@ -1,7 +1,7 @@
import io
import re
import bs4
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
@@ -57,7 +57,7 @@ class WikipediaConverter(DocumentConverter):
) -> DocumentConverterResult:
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Remove javascript and style blocks
for script in soup(["script", "style"]):
@@ -72,15 +72,16 @@ class WikipediaConverter(DocumentConverter):
if body_elm:
# What's the title
if title_elm and isinstance(title_elm, bs4.Tag):
main_title = title_elm.string
if title_elm and len(title_elm) > 0:
main_title = title_elm.string # type: ignore
assert isinstance(main_title, str)
# Convert the page
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
**kwargs
).convert_soup(body_elm)
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
body_elm
)
else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
webpage_text = _CustomMarkdownify().convert_soup(soup)
return DocumentConverterResult(
markdown=webpage_text,

View File

@@ -86,9 +86,7 @@ class XlsxConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
self._html_converter.convert_string(html_content).markdown.strip()
+ "\n\n"
)
@@ -148,9 +146,7 @@ class XlsConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
self._html_converter.convert_string(html_content).markdown.strip()
+ "\n\n"
)
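Each sheet takes the route DataFrame → HTML → Markdown; the first hop is plain pandas (the frame here is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"Name": ["a", "b"], "Value": [1, 2]})
html_content = df.to_html(index=False)  # handed to the HTML converter next
```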

View File

@@ -3,22 +3,17 @@ import json
import time
import io
import re
import bs4
from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
# Optional YouTube transcription support
try:
# Suppress some warnings on library import
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=SyntaxWarning)
# Patch submitted upstream to fix the SyntaxWarning
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api import YouTubeTranscriptApi
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
except ModuleNotFoundError:
@@ -77,31 +72,21 @@ class YouTubeConverter(DocumentConverter):
) -> DocumentConverterResult:
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Read the meta tags
metadata: Dict[str, str] = {}
if soup.title and soup.title.string:
metadata["title"] = soup.title.string
metadata: Dict[str, str] = {"title": soup.title.string}
for meta in soup(["meta"]):
if not isinstance(meta, bs4.Tag):
continue
for a in meta.attrs:
if a in ["itemprop", "property", "name"]:
key = str(meta.get(a, ""))
content = str(meta.get("content", ""))
if key and content: # Only add non-empty content
metadata[key] = content
content = meta.get("content", "")
if content: # Only add non-empty content
metadata[meta[a]] = content
break
# Try reading the description
try:
for script in soup(["script"]):
if not isinstance(script, bs4.Tag):
continue
if not script.string: # Skip empty scripts
continue
content = script.string
@@ -147,7 +132,6 @@ class YouTubeConverter(DocumentConverter):
webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
@@ -159,7 +143,7 @@ class YouTubeConverter(DocumentConverter):
)
# Retry the transcript fetching operation
transcript = self._retry_operation(
lambda: ytt_api.fetch(
lambda: YouTubeTranscriptApi.get_transcript(
video_id, languages=youtube_transcript_languages
),
retries=3, # Retry 3 times
@@ -167,14 +151,17 @@ class YouTubeConverter(DocumentConverter):
)
if transcript:
transcript_text = " ".join(
[part.text for part in transcript]
[part["text"] for part in transcript]
) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception as e:
print(f"Error fetching transcript: {e}")
if transcript_text:
webpage_text += f"\n### Transcript\n{transcript_text}\n"
title = title if title else (soup.title.string if soup.title else "")
title = title if title else soup.title.string
assert isinstance(title, str)
return DocumentConverterResult(
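This hunk reverts from the youtube-transcript-api 1.0 instance API (`ytt_api.fetch(...)`, parts with a `.text` attribute) to the pre-1.0 static API (`get_transcript`, parts as dicts). The reverted-to call, with an assumed video ID:

```python
from youtube_transcript_api import YouTubeTranscriptApi

video_id = "dQw4w9WgXcQ"  # example ID
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
text = " ".join(part["text"] for part in transcript)
```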

View File

@@ -25,11 +25,8 @@ GENERAL_TEST_VECTORS = [
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"data:image/png;base64...",
],
must_not_include=[
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
must_not_include=[],
),
FileTestVector(
filename="test.xlsx",
@@ -68,9 +65,8 @@ GENERAL_TEST_VECTORS = [
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
"![This phrase of the caption is Human-written.](Picture4.jpg)",
],
must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
must_not_include=[],
),
FileTestVector(
filename="test_outlook_msg.msg",
@@ -215,64 +211,4 @@ GENERAL_TEST_VECTORS = [
],
must_not_include=[],
),
FileTestVector(
filename="test.epub",
mimetype="application/epub+zip",
charset=None,
url=None,
must_include=[
"**Authors:** Test Author",
"A test EPUB document for MarkItDown testing",
"# Chapter 1: Test Content",
"This is a **test** paragraph with some formatting",
"* A bullet point",
"* Another point",
"# Chapter 2: More Content",
"*different* style",
"> This is a blockquote for testing",
],
must_not_include=[],
),
]
DATA_URI_TEST_VECTORS = [
FileTestVector(
filename="test.docx",
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
charset=None,
url=None,
must_include=[
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
must_not_include=[
"data:image/png;base64...",
],
),
FileTestVector(
filename="test.pptx",
mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
charset=None,
url=None,
must_include=[
"2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
"![This phrase of the caption is Human-written.]", # image caption
"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
],
must_not_include=[
"![This phrase of the caption is Human-written.](Picture4.jpg)",
],
),
]

View File

@@ -7,17 +7,9 @@ import locale
from typing import List
if __name__ == "__main__":
from _test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
FileTestVector,
)
from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
else:
from ._test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
FileTestVector,
)
from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector
from markitdown import (
MarkItDown,
@@ -122,9 +114,7 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
)
stdout = result.stdout.decode(locale.getpreferredencoding())
assert (
result.returncode == 0
), f"CLI exited with error: {result.stderr.decode('utf-8')}"
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
for test_string in test_vector.must_include:
assert test_string in stdout
for test_string in test_vector.must_not_include:
@@ -157,39 +147,6 @@ def test_convert_url(shared_tmp_dir, test_vector):
assert test_string not in stdout
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
"""Test CLI functionality when keep_data_uris is enabled"""
output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
result = subprocess.run(
[
"python",
"-m",
"markitdown",
"--keep-data-uris",
"-o",
output_file,
os.path.join(TEST_FILES_DIR, test_vector.filename),
],
capture_output=True,
text=True,
)
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
assert os.path.exists(output_file), f"Output file not created: {output_file}"
with open(output_file, "r") as f:
output_data = f.read()
for test_string in test_vector.must_include:
assert test_string in output_data
for test_string in test_vector.must_not_include:
assert test_string not in output_data
os.remove(output_file)
assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
if __name__ == "__main__":
import sys
import tempfile
@@ -197,7 +154,6 @@ if __name__ == "__main__":
"""Runs this file's tests from the command line."""
with tempfile.TemporaryDirectory() as tmp_dir:
# General tests
for test_function in [
test_output_to_stdout,
test_output_to_file,
@@ -211,17 +167,4 @@ if __name__ == "__main__":
)
test_function(tmp_dir, test_vector)
print("OK")
# Data URI tests
for test_function in [
test_output_to_file_with_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...",
end="",
)
test_function(tmp_dir, test_vector)
print("OK")
print("All tests passed!")

BIN
packages/markitdown/tests/test_files/test.docx vendored Executable file → Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -6,9 +6,9 @@ import codecs
if __name__ == "__main__":
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
from _test_vectors import GENERAL_TEST_VECTORS
else:
from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
from ._test_vectors import GENERAL_TEST_VECTORS
from markitdown import (
MarkItDown,
@@ -47,6 +47,7 @@ def test_guess_stream_info(test_vector):
# mimetype or extension, so we'll special-case them here.
if test_vector.filename in [
"test_outlook_msg.msg",
"test_mskanji.csv", # See: https://github.com/google/magika/issues/983
]:
return
@@ -95,6 +96,15 @@ def test_convert_stream_without_hints(test_vector):
"""Test the conversion of a stream with no stream info."""
markitdown = MarkItDown()
# For some limited exceptions, we can't guarantee the exact
# mimetype or extension, so we'll special-case them here.
if test_vector.filename in [
# This appears to be a subtle bug in magika.
# See: https://github.com/google/magika/issues/983
"test_mskanji.csv",
]:
return
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(stream, url=test_vector.url)
for string in test_vector.must_include:
@@ -124,52 +134,10 @@ def test_convert_url(test_vector):
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_with_data_uris(test_vector):
"""Test API functionality when keep_data_uris is enabled"""
markitdown = MarkItDown()
# Test local file conversion
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, test_vector.filename),
keep_data_uris=True,
url=test_vector.url,
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_stream_with_data_uris(test_vector):
"""Test the conversion of a stream with no stream info."""
markitdown = MarkItDown()
stream_info = StreamInfo(
extension=os.path.splitext(test_vector.filename)[1],
mimetype=test_vector.mimetype,
charset=test_vector.charset,
)
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(
stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
if __name__ == "__main__":
import sys
"""Runs this file's tests from the command line."""
# General tests
for test_function in [
test_guess_stream_info,
test_convert_local,
@@ -183,17 +151,4 @@ if __name__ == "__main__":
)
test_function(test_vector)
print("OK")
# Data URI tests
for test_function in [
test_convert_with_data_uris,
test_convert_stream_with_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
)
test_function(test_vector)
print("OK")
print("All tests passed!")