* Adapted #123 to not use epublib.
* Updated README.md
README.md
@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
 At present, MarkItDown supports:
 
 - PDF
-- PowerPoint (reading in top-to-bottom, left-to-right order)
+- PowerPoint
 - Word
 - Excel
 - Images (EXIF metadata and OCR)
@@ -23,6 +23,7 @@ At present, MarkItDown supports:
 - Text-based formats (CSV, JSON, XML)
 - ZIP files (iterates over contents)
 - Youtube URLs
+- EPubs
 - ... and more!
 
 ## Why Markdown?
packages/markitdown/src/markitdown/_markitdown.py
@@ -38,6 +38,7 @@ from .converters import (
     AudioConverter,
     OutlookMsgConverter,
     ZipConverter,
+    EpubConverter,
     DocumentIntelligenceConverter,
 )
 
@@ -191,6 +192,7 @@ class MarkItDown:
         self.register_converter(IpynbConverter())
         self.register_converter(PdfConverter())
         self.register_converter(OutlookMsgConverter())
+        self.register_converter(EpubConverter())
 
         # Register Document Intelligence converter at the top of the stack if endpoint is provided
         docintel_endpoint = kwargs.get("docintel_endpoint")
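With the converter registered as above, EPUBs flow through the normal MarkItDown entry point. A minimal usage sketch (assuming the package from this repo is installed; `book.epub` is a placeholder path, and `result.markdown` is the field the new converter populates):

```python
from markitdown import MarkItDown

md = MarkItDown()
result = md.convert("book.epub")  # placeholder path; routed to EpubConverter by accepts()
print(result.markdown)            # Markdown text produced by the converter
```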
packages/markitdown/src/markitdown/converters/__init__.py
@@ -18,6 +18,7 @@ from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._epub_converter import EpubConverter
 
 __all__ = [
     "PlainTextConverter",
@@ -37,4 +38,5 @@ __all__ = [
     "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
+    "EpubConverter",
 ]
packages/markitdown/src/markitdown/converters/_epub_converter.py (new file)
@@ -0,0 +1,147 @@
import os
import zipfile
import xml.dom.minidom as minidom

from typing import BinaryIO, Any, Dict, List

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo

ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/epub",
    "application/epub+zip",
    "application/x-epub+zip",
]

ACCEPTED_FILE_EXTENSIONS = [".epub"]

MIME_TYPE_MAPPING = {
    ".html": "text/html",
    ".xhtml": "application/xhtml+xml",
}


class EpubConverter(HtmlConverter):
    """
    Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """

    def __init__(self):
        super().__init__()
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        with zipfile.ZipFile(file_stream, "r") as z:
            # Extract metadata (title, authors, language, publisher, date, description, identifier) from the EPUB

            # Locate content.opf
            container_dom = minidom.parse(z.open("META-INF/container.xml"))
            opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
                "full-path"
            )

            # Parse content.opf
            opf_dom = minidom.parse(z.open(opf_path))
            metadata: Dict[str, Any] = {
                "title": self._get_text_from_node(opf_dom, "dc:title"),
                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
                "language": self._get_text_from_node(opf_dom, "dc:language"),
                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
                "date": self._get_text_from_node(opf_dom, "dc:date"),
                "description": self._get_text_from_node(opf_dom, "dc:description"),
                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
            }

            # Extract manifest items (ID → href mapping)
            manifest = {
                item.getAttribute("id"): item.getAttribute("href")
                for item in opf_dom.getElementsByTagName("item")
            }

            # Extract spine order (ID refs)
            spine_items = opf_dom.getElementsByTagName("itemref")
            spine_order = [item.getAttribute("idref") for item in spine_items]

            # Convert spine order to actual file paths
            base_path = "/".join(
                opf_path.split("/")[:-1]
            )  # Get base directory of content.opf
            spine = [
                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
                for item_id in spine_order
                if item_id in manifest
            ]

            # Extract and convert the content
            markdown_content: List[str] = []
            for file in spine:
                if file in z.namelist():
                    with z.open(file) as f:
                        filename = os.path.basename(file)
                        extension = os.path.splitext(filename)[1].lower()
                        mimetype = MIME_TYPE_MAPPING.get(extension)
                        converted_content = self._html_converter.convert(
                            f,
                            StreamInfo(
                                mimetype=mimetype,
                                extension=extension,
                                filename=filename,
                            ),
                        )
                        markdown_content.append(converted_content.markdown.strip())

            # Format and add the metadata
            metadata_markdown = []
            for key, value in metadata.items():
                if isinstance(value, list):
                    value = ", ".join(value)
                if value:
                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")

            markdown_content.insert(0, "\n".join(metadata_markdown))

            return DocumentConverterResult(
                markdown="\n\n".join(markdown_content), title=metadata["title"]
            )

    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
        """Convenience function to extract a single occurrence of a tag (e.g., title)."""
        texts = self._get_all_texts_from_nodes(dom, tag_name)
        if len(texts) > 0:
            return texts[0]
        else:
            return None

    def _get_all_texts_from_nodes(
        self, dom: minidom.Document, tag_name: str
    ) -> List[str]:
        """Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
        texts: List[str] = []
        for node in dom.getElementsByTagName(tag_name):
            if node.firstChild and hasattr(node.firstChild, "nodeValue"):
                texts.append(node.firstChild.nodeValue.strip())
        return texts
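The converter leans on the standard EPUB container layout: `META-INF/container.xml` names the OPF package file, whose manifest and spine determine which XHTML documents get converted and in what order, and whose Dublin Core elements supply the metadata header. A minimal sketch of driving `EpubConverter` directly, bypassing MarkItDown's format detection (assumes the package from this repo is installed; `book.epub` is a placeholder path):

```python
from markitdown.converters import EpubConverter
from markitdown._stream_info import StreamInfo

with open("book.epub", "rb") as fh:  # placeholder path
    info = StreamInfo(
        mimetype="application/epub+zip",
        extension=".epub",
        filename="book.epub",
    )
    converter = EpubConverter()
    if converter.accepts(fh, info):
        result = converter.convert(fh, info)
        print(result.markdown)  # metadata block followed by the spine content
```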
@@ -211,4 +211,22 @@ GENERAL_TEST_VECTORS = [
         ],
         must_not_include=[],
     ),
+    FileTestVector(
+        filename="test.epub",
+        mimetype="application/epub+zip",
+        charset=None,
+        url=None,
+        must_include=[
+            "**Authors:** Test Author",
+            "A test EPUB document for MarkItDown testing",
+            "# Chapter 1: Test Content",
+            "This is a **test** paragraph with some formatting",
+            "* A bullet point",
+            "* Another point",
+            "# Chapter 2: More Content",
+            "*different* style",
+            "> This is a blockquote for testing",
+        ],
+        must_not_include=[],
+    ),
 ]
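The new `test.epub` vector is exercised by the shared vector-driven tests. As a rough sketch of what that check amounts to (the helper name and sample-file path below are assumptions, not the repo's actual harness):

```python
from markitdown import MarkItDown

def check_epub_vector(path: str) -> None:
    # Convert the sample EPUB and assert a few of the must_include fragments above.
    result = MarkItDown().convert(path)
    for fragment in (
        "**Authors:** Test Author",
        "# Chapter 1: Test Content",
        "> This is a blockquote for testing",
    ):
        assert fragment in result.markdown

# check_epub_vector("packages/markitdown/tests/test_files/test.epub")  # path is an assumption
```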