6 Commits

Author     SHA1        Message                                                  Date
afourney   a93e0567e6  EPub Support. Adapted #123 to not use epublib. (#1131)  2025-03-17 07:48:15 -07:00
                         * Adapted #123 to not use epublib.
                         * Updated README.md
afourney   c5f70b904f  Have magika read from the stream. (#1136)               2025-03-17 07:39:19 -07:00
afourney   53834fdd24  Investigate and silence warnings. (#1133)               2025-03-15 23:41:35 -07:00
afourney   5c565b7d79  Fix remaining mypy errors. (#1132)                      2025-03-15 23:12:48 -07:00
afourney   a78857bd43  Added epub test file. (#1130)                           2025-03-15 18:34:51 -07:00
afourney   09df7fe8df  Small fixes for autogen integration. (#1124)            2025-03-12 19:18:11 -07:00
17 changed files with 248 additions and 46 deletions

View File

@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markdown
 At present, MarkItDown supports:
 - PDF
-- PowerPoint (reading in top-to-bottom, left-to-right order)
+- PowerPoint
 - Word
 - Excel
 - Images (EXIF metadata and OCR)
@@ -23,6 +23,7 @@ At present, MarkItDown supports:
 - Text-based formats (CSV, JSON, XML)
 - ZIP files (iterates over contents)
 - Youtube URLs
+- EPubs
 - ... and more!

 ## Why Markdown?

View File

@@ -27,7 +27,7 @@ dependencies = [
   "beautifulsoup4",
   "requests",
   "markdownify",
-  "magika>=0.6.1rc2",
+  "magika>=0.6.1rc3",
   "charset-normalizer",
 ]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.1.0a2" __version__ = "0.1.0a4"

View File

@@ -139,7 +139,7 @@ def main():
     else:
         charset_hint = None

-    stream_info: str | None = None
+    stream_info = None
     if (
         extension_hint is not None
         or mime_type_hint is not None

View File

@@ -38,6 +38,7 @@ from .converters import (
     AudioConverter,
     OutlookMsgConverter,
     ZipConverter,
+    EpubConverter,
     DocumentIntelligenceConverter,
 )
@@ -191,6 +192,7 @@ class MarkItDown:
         self.register_converter(IpynbConverter())
         self.register_converter(PdfConverter())
         self.register_converter(OutlookMsgConverter())
+        self.register_converter(EpubConverter())

         # Register Document Intelligence converter at the top of the stack if endpoint is provided
         docintel_endpoint = kwargs.get("docintel_endpoint")
@@ -610,14 +612,16 @@ class MarkItDown:
         # Call magika to guess from the stream
         cur_pos = file_stream.tell()
         try:
-            stream_bytes = file_stream.read()
-            result = self._magika.identify_bytes(stream_bytes)
+            result = self._magika.identify_stream(file_stream)

             if result.status == "ok" and result.prediction.output.label != "unknown":
                 # If it's text, also guess the charset
                 charset = None
                 if result.prediction.output.is_text:
-                    charset_result = charset_normalizer.from_bytes(stream_bytes).best()
+                    # Read the first 4k to guess the charset
+                    file_stream.seek(cur_pos)
+                    stream_page = file_stream.read(4096)
+                    charset_result = charset_normalizer.from_bytes(stream_page).best()
                     if charset_result is not None:
                         charset = self._normalize_charset(charset_result.encoding)
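For reference, a minimal sketch of the new detection flow in isolation (hedged: it assumes magika>=0.6.1rc3 provides identify_stream(), per the pin above, and uses a made-up in-memory stream):

# Sketch: identify a stream with magika, then sample only the first 4 KiB
# for charset detection instead of reading the whole stream into memory.
import io
import charset_normalizer
from magika import Magika

stream = io.BytesIO("¡Hola, mundo!".encode("utf-8"))
cur_pos = stream.tell()
result = Magika().identify_stream(stream)
if result.status == "ok" and result.prediction.output.label != "unknown":
    charset = None
    if result.prediction.output.is_text:
        stream.seek(cur_pos)      # rewind; magika consumed the stream
        page = stream.read(4096)  # a 4 KiB sample is enough for a guess
        best = charset_normalizer.from_bytes(page).best()
        if best is not None:
            charset = best.encoding
    print(result.prediction.output.label, charset)
stream.seek(cur_pos)              # restore the caller's position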

View File

@@ -18,6 +18,7 @@ from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._epub_converter import EpubConverter

 __all__ = [
     "PlainTextConverter",
@@ -37,4 +38,5 @@ __all__ = [
     "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
+    "EpubConverter",
 ]

View File

@@ -1,6 +1,7 @@
 import io
 import re
 import base64
+import binascii
 from urllib.parse import parse_qs, urlparse
 from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
@@ -60,6 +61,8 @@ class BingSerpConverter(DocumentConverter):
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        assert stream_info.url is not None
+
         # Parse the query parameters
         parsed_params = parse_qs(urlparse(stream_info.url).query)
         query = parsed_params.get("q", [""])[0]
@@ -79,6 +82,9 @@ class BingSerpConverter(DocumentConverter):
         _markdownify = _CustomMarkdownify()
         results = list()
         for result in soup.find_all(class_="b_algo"):
+            if not hasattr(result, "find_all"):
+                continue
+
             # Rewrite redirect urls
             for a in result.find_all("a", href=True):
                 parsed_href = urlparse(a["href"])

View File

@@ -0,0 +1,147 @@
import os
import zipfile
import xml.dom.minidom as minidom
from typing import BinaryIO, Any, Dict, List
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/epub",
"application/epub+zip",
"application/x-epub+zip",
]
ACCEPTED_FILE_EXTENSIONS = [".epub"]
MIME_TYPE_MAPPING = {
".html": "text/html",
".xhtml": "application/xhtml+xml",
}
class EpubConverter(HtmlConverter):
"""
Converts EPUB files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
"""
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
with zipfile.ZipFile(file_stream, "r") as z:
# Extracts metadata (title, authors, language, publisher, date, description, cover) from an EPUB file."""
# Locate content.opf
container_dom = minidom.parse(z.open("META-INF/container.xml"))
opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
"full-path"
)
# Parse content.opf
opf_dom = minidom.parse(z.open(opf_path))
metadata: Dict[str, Any] = {
"title": self._get_text_from_node(opf_dom, "dc:title"),
"authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
"language": self._get_text_from_node(opf_dom, "dc:language"),
"publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
"date": self._get_text_from_node(opf_dom, "dc:date"),
"description": self._get_text_from_node(opf_dom, "dc:description"),
"identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
}
# Extract manifest items (ID → href mapping)
manifest = {
item.getAttribute("id"): item.getAttribute("href")
for item in opf_dom.getElementsByTagName("item")
}
# Extract spine order (ID refs)
spine_items = opf_dom.getElementsByTagName("itemref")
spine_order = [item.getAttribute("idref") for item in spine_items]
# Convert spine order to actual file paths
base_path = "/".join(
opf_path.split("/")[:-1]
) # Get base directory of content.opf
spine = [
f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
for item_id in spine_order
if item_id in manifest
]
# Extract and convert the content
markdown_content: List[str] = []
for file in spine:
if file in z.namelist():
with z.open(file) as f:
filename = os.path.basename(file)
extension = os.path.splitext(filename)[1].lower()
mimetype = MIME_TYPE_MAPPING.get(extension)
converted_content = self._html_converter.convert(
f,
StreamInfo(
mimetype=mimetype,
extension=extension,
filename=filename,
),
)
markdown_content.append(converted_content.markdown.strip())
# Format and add the metadata
metadata_markdown = []
for key, value in metadata.items():
if isinstance(value, list):
value = ", ".join(value)
if value:
metadata_markdown.append(f"**{key.capitalize()}:** {value}")
markdown_content.insert(0, "\n".join(metadata_markdown))
return DocumentConverterResult(
markdown="\n\n".join(markdown_content), title=metadata["title"]
)
def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
if len(texts) > 0:
return texts[0]
else:
return None
def _get_all_texts_from_nodes(
self, dom: minidom.Document, tag_name: str
) -> List[str]:
"""Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
texts: List[str] = []
for node in dom.getElementsByTagName(tag_name):
if node.firstChild and hasattr(node.firstChild, "nodeValue"):
texts.append(node.firstChild.nodeValue.strip())
return texts
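A quick usage sketch for the new converter (hedged: "book.epub" is a placeholder path, and this assumes a build with EpubConverter registered as shown above):

# Hypothetical usage: convert an EPUB to Markdown end to end.
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("book.epub")  # placeholder path to any EPUB file
print(result.title)               # the dc:title pulled from content.opf
print(result.markdown[:200])      # metadata block, then spine content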

View File

@@ -9,7 +9,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 _dependency_exc_info = None
 olefile = None
 try:
-    import olefile
+    import olefile  # type: ignore[no-redef]
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
@@ -56,12 +56,13 @@
         # Brute force, check if it's an Outlook file
         try:
-            msg = olefile.OleFileIO(file_stream)
-            toc = "\n".join([str(stream) for stream in msg.listdir()])
-            return (
-                "__properties_version1.0" in toc
-                and "__recip_version1.0_#00000000" in toc
-            )
+            if olefile is not None:
+                msg = olefile.OleFileIO(file_stream)
+                toc = "\n".join([str(stream) for stream in msg.listdir()])
+                return (
+                    "__properties_version1.0" in toc
+                    and "__recip_version1.0_#00000000" in toc
+                )
         except Exception as e:
             pass
         finally:
@@ -89,7 +90,11 @@
                 _dependency_exc_info[2]
             )

+        assert (
+            olefile is not None
+        )  # If we made it this far, olefile should be available
+
         msg = olefile.OleFileIO(file_stream)

         # Extract email metadata
         md_content = "# Email Message\n\n"
@@ -121,6 +126,7 @@
     def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
         """Helper to safely extract and decode stream data from the MSG file."""
+        assert olefile is not None
         assert isinstance(
             msg, olefile.OleFileIO
         )  # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
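The pattern being hardened here is worth noting on its own: import the optional dependency at module scope, remember the failure, then re-raise it lazily with the original traceback. A standalone sketch (hedged: the _require_olefile helper is made up for illustration):

# Sketch of the optional-dependency pattern used above.
import sys

_dependency_exc_info = None
olefile = None
try:
    import olefile  # type: ignore[no-redef]
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

def _require_olefile() -> None:  # hypothetical helper
    if _dependency_exc_info is not None:
        # Re-raise the original ImportError, keeping its traceback
        raise _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
    assert olefile is not None  # narrows Optional[module] for mypy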

View File

@@ -66,7 +66,7 @@ class RssConverter(DocumentConverter):
             file_stream.seek(cur_pos)
         return False

-    def _feed_type(self, doc: Any) -> str:
+    def _feed_type(self, doc: Any) -> str | None:
         if doc.getElementsByTagName("rss"):
             return "rss"
         elif doc.getElementsByTagName("feed"):
@@ -130,10 +130,10 @@ class RssConverter(DocumentConverter):
         Returns None if the feed type is not recognized or something goes wrong.
         """
         root = doc.getElementsByTagName("rss")[0]
-        channel = root.getElementsByTagName("channel")
-        if not channel:
-            return None
-        channel = channel[0]
+        channel_list = root.getElementsByTagName("channel")
+        if not channel_list:
+            raise ValueError("No channel found in RSS feed")
+        channel = channel_list[0]
         channel_title = self._get_data_by_tag_name(channel, "title")
         channel_description = self._get_data_by_tag_name(channel, "description")
         items = channel.getElementsByTagName("item")
@@ -141,8 +141,6 @@ class RssConverter(DocumentConverter):
         md_text = f"# {channel_title}\n"
         if channel_description:
             md_text += f"{channel_description}\n"
-        if not items:
-            items = []
         for item in items:
             title = self._get_data_by_tag_name(item, "title")
             description = self._get_data_by_tag_name(item, "description")
@@ -183,5 +181,6 @@ class RssConverter(DocumentConverter):
             return None
         fc = nodes[0].firstChild
         if fc:
-            return fc.data
+            if hasattr(fc, "data"):
+                return fc.data
         return None

View File

@@ -7,7 +7,19 @@ from .._exceptions import MissingDependencyException
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
+    # Suppress some deprecation warnings from the speech_recognition library
+    import warnings
+
+    warnings.filterwarnings(
+        "ignore", category=DeprecationWarning, module="speech_recognition"
+    )
+    warnings.filterwarnings(
+        "ignore",
+        category=SyntaxWarning,
+        module="pydub",  # TODO: Migrate away from pydub
+    )
+
     import speech_recognition as sr
     import pydub
 except ImportError:
     # Preserve the error and stack trace for later
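For comparison, a scoped alternative sketch (not what this commit does): suppress the warnings only for the duration of the imports rather than installing process-wide filters.

# Sketch: scoped suppression with a context manager; filters are restored on exit.
import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore", DeprecationWarning)
    warnings.simplefilter("ignore", SyntaxWarning)
    import speech_recognition as sr  # noqa: F401
    import pydub  # noqa: F401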

View File

@@ -1,7 +1,7 @@
 import io
 import re
+import bs4
 from typing import Any, BinaryIO, Optional
-from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
@@ -57,7 +57,7 @@ class WikipediaConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
-        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

         # Remove javascript and style blocks
         for script in soup(["script", "style"]):
@@ -72,9 +72,8 @@ class WikipediaConverter(DocumentConverter):
         if body_elm:
             # What's the title
-            if title_elm and len(title_elm) > 0:
-                main_title = title_elm.string  # type: ignore
-                assert isinstance(main_title, str)
+            if title_elm and isinstance(title_elm, bs4.Tag):
+                main_title = title_elm.string

             # Convert the page
             webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(

View File

@@ -3,9 +3,10 @@ import json
 import time
 import io
 import re
+import bs4
+import warnings
 from typing import Any, BinaryIO, Optional, Dict, List, Union
 from urllib.parse import parse_qs, urlparse, unquote
-from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
@@ -13,6 +14,11 @@ from ._markdownify import _CustomMarkdownify
 # Optional YouTube transcription support
 try:
+    warnings.filterwarnings(
+        "ignore",
+        category=SyntaxWarning,
+        module="youtube_transcript_api",  # Patch submitted to youtube-transcript-api
+    )
     from youtube_transcript_api import YouTubeTranscriptApi

     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
@@ -72,21 +78,31 @@ class YouTubeConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
-        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

         # Read the meta tags
-        metadata: Dict[str, str] = {"title": soup.title.string}
+        metadata: Dict[str, str] = {}
+        if soup.title and soup.title.string:
+            metadata["title"] = soup.title.string
+
         for meta in soup(["meta"]):
+            if not isinstance(meta, bs4.Tag):
+                continue
             for a in meta.attrs:
                 if a in ["itemprop", "property", "name"]:
-                    content = meta.get("content", "")
-                    if content:  # Only add non-empty content
-                        metadata[meta[a]] = content
+                    key = str(meta.get(a, ""))
+                    content = str(meta.get("content", ""))
+                    if key and content:  # Only add non-empty content
+                        metadata[key] = content
                     break

         # Try reading the description
         try:
             for script in soup(["script"]):
+                if not isinstance(script, bs4.Tag):
+                    continue
                 if not script.string:  # Skip empty scripts
                     continue
                 content = script.string
@@ -161,7 +177,7 @@ class YouTubeConverter(DocumentConverter):
         if transcript_text:
             webpage_text += f"\n### Transcript\n{transcript_text}\n"

-        title = title if title else soup.title.string
+        title = title if title else (soup.title.string if soup.title else "")
         assert isinstance(title, str)

         return DocumentConverterResult(
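A standalone sketch of the defensive meta-tag extraction above (hedged: the HTML snippet is made up). find_all results are typed as PageElement, so the isinstance check both guards at runtime and narrows the type for mypy:

# Sketch: extract meta tags safely, narrowing to bs4.Tag before touching .attrs.
import bs4

html = '<html><head><meta property="og:title" content="Demo"></head></html>'
soup = bs4.BeautifulSoup(html, "html.parser")

metadata: dict[str, str] = {}
for meta in soup(["meta"]):
    if not isinstance(meta, bs4.Tag):
        continue  # non-Tag nodes have no .attrs
    for a in meta.attrs:
        if a in ["itemprop", "property", "name"]:
            key = str(meta.get(a, ""))
            content = str(meta.get("content", ""))
            if key and content:
                metadata[key] = content
            break
print(metadata)  # {'og:title': 'Demo'}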

View File

@@ -211,4 +211,22 @@ GENERAL_TEST_VECTORS = [
         ],
         must_not_include=[],
     ),
+    FileTestVector(
+        filename="test.epub",
+        mimetype="application/epub+zip",
+        charset=None,
+        url=None,
+        must_include=[
+            "**Authors:** Test Author",
+            "A test EPUB document for MarkItDown testing",
+            "# Chapter 1: Test Content",
+            "This is a **test** paragraph with some formatting",
+            "* A bullet point",
+            "* Another point",
+            "# Chapter 2: More Content",
+            "*different* style",
+            "> This is a blockquote for testing",
+        ],
+        must_not_include=[],
+    ),
 ]

View File

@@ -114,7 +114,9 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
     )

     stdout = result.stdout.decode(locale.getpreferredencoding())
-    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    assert (
+        result.returncode == 0
+    ), f"CLI exited with error: {result.stderr.decode('utf-8')}"

     for test_string in test_vector.must_include:
         assert test_string in stdout
     for test_string in test_vector.must_not_include:

Binary file not shown.

View File

@@ -47,7 +47,6 @@ def test_guess_stream_info(test_vector):
     # mimetype or extension, so we'll special-case them here.
     if test_vector.filename in [
         "test_outlook_msg.msg",
-        "test_mskanji.csv",  # See: https://github.com/google/magika/issues/983
     ]:
         return
@@ -96,15 +95,6 @@ def test_convert_stream_without_hints(test_vector):
     """Test the conversion of a stream with no stream info."""
     markitdown = MarkItDown()

-    # For some limited exceptions, we can't guarantee the exact
-    # mimetype or extension, so we'll special-case them here.
-    if test_vector.filename in [
-        # This appears to be a subtle bug in magika.
-        # See: https://github.com/google/magika/issues/983
-        "test_mskanji.csv",
-    ]:
-        return
-
     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
         result = markitdown.convert(stream, url=test_vector.url)
     for string in test_vector.must_include: