From a93e0567e6f114fea6642e7603d9274fac9920d9 Mon Sep 17 00:00:00 2001 From: afourney Date: Mon, 17 Mar 2025 07:48:15 -0700 Subject: [PATCH] EPub Support. Adapted #123 to not use epublib. (#1131) * Adapted #123 to not use epublib. * Updated README.md --- README.md | 3 +- .../markitdown/src/markitdown/_markitdown.py | 2 + .../src/markitdown/converters/__init__.py | 2 + .../markitdown/converters/_epub_converter.py | 147 ++++++++++++++++++ packages/markitdown/tests/_test_vectors.py | 18 +++ 5 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 packages/markitdown/src/markitdown/converters/_epub_converter.py diff --git a/README.md b/README.md index 40f4b82..4401a0d 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd At present, MarkItDown supports: - PDF -- PowerPoint (reading in top-to-bottom, left-to-right order) +- PowerPoint - Word - Excel - Images (EXIF metadata and OCR) @@ -23,6 +23,7 @@ At present, MarkItDown supports: - Text-based formats (CSV, JSON, XML) - ZIP files (iterates over contents) - Youtube URLs +- EPubs - ... and more! ## Why Markdown? 
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index 78319eb..a8f7c9e 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -38,6 +38,7 @@ from .converters import ( AudioConverter, OutlookMsgConverter, ZipConverter, + EpubConverter, DocumentIntelligenceConverter, ) @@ -191,6 +192,7 @@ class MarkItDown: self.register_converter(IpynbConverter()) self.register_converter(PdfConverter()) self.register_converter(OutlookMsgConverter()) + self.register_converter(EpubConverter()) # Register Document Intelligence converter at the top of the stack if endpoint is provided docintel_endpoint = kwargs.get("docintel_endpoint") diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index f43efe3..09e3cb1 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -18,6 +18,7 @@ from ._audio_converter import AudioConverter from ._outlook_msg_converter import OutlookMsgConverter from ._zip_converter import ZipConverter from ._doc_intel_converter import DocumentIntelligenceConverter +from ._epub_converter import EpubConverter __all__ = [ "PlainTextConverter", @@ -37,4 +38,5 @@ __all__ = [ "OutlookMsgConverter", "ZipConverter", "DocumentIntelligenceConverter", + "EpubConverter", ] diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py new file mode 100644 index 0000000..17d6d29 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py @@ -0,0 +1,147 @@ +import os +import zipfile +import xml.dom.minidom as minidom + +from typing import BinaryIO, Any, Dict, List + +from ._html_converter import HtmlConverter +from .._base_converter import DocumentConverter, DocumentConverterResult +from 
.._stream_info import StreamInfo
+
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/epub",
+    "application/epub+zip",
+    "application/x-epub+zip",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".epub"]
+
+MIME_TYPE_MAPPING = {
+    ".html": "text/html",
+    ".xhtml": "application/xhtml+xml",
+}
+
+
+class EpubConverter(HtmlConverter):
+    """
+    Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._html_converter = HtmlConverter()
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """Accept streams with an .epub extension or an EPUB mimetype prefix."""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """
+        Convert an EPUB archive to Markdown: a metadata header followed by each
+        spine document, in reading order, rendered through the HTML converter.
+        """
+        import posixpath
+        from urllib.parse import unquote
+
+        with zipfile.ZipFile(file_stream, "r") as z:
+            # NOTE: minidom is not hardened against malicious XML (entity expansion).
+            # Locate content.opf
+            with z.open("META-INF/container.xml") as container_file:
+                container_dom = minidom.parse(container_file)
+            rootfile = container_dom.getElementsByTagName("rootfile")[0]
+            opf_path = rootfile.getAttribute("full-path")
+
+            # Parse content.opf
+            with z.open(opf_path) as opf_file:
+                opf_dom = minidom.parse(opf_file)
+            metadata: Dict[str, Any] = {
+                "title": self._get_text_from_node(opf_dom, "dc:title"),
+                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
+                "language": self._get_text_from_node(opf_dom, "dc:language"),
+                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
+                "date": self._get_text_from_node(opf_dom, "dc:date"),
+                "description": self._get_text_from_node(opf_dom, "dc:description"),
+                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
+            }
+
+            # Extract manifest items (ID → href mapping)
+            manifest = {
+                item.getAttribute("id"): item.getAttribute("href")
+                for item in opf_dom.getElementsByTagName("item")
+            }
+
+            # Extract spine order (ID refs)
+            spine_items = opf_dom.getElementsByTagName("itemref")
+            spine_order = [item.getAttribute("idref") for item in spine_items]
+
+            # Hrefs are URL references relative to content.opf, so besides the
+            # literal path also try the percent-decoded, "../"-collapsed form.
+            base_path = posixpath.dirname(opf_path)
+            names = set(z.namelist())  # Built once for O(1) membership tests
+            hrefs = [manifest[ref] for ref in spine_order if ref in manifest]
+
+            # Extract and convert the content
+            markdown_content: List[str] = []
+            for href in hrefs:
+                file = f"{base_path}/{href}" if base_path else href
+                if file not in names:
+                    file = posixpath.normpath(posixpath.join(base_path, unquote(href)))
+                if file not in names:
+                    continue  # Spine entry without a matching archive member
+                with z.open(file) as f:
+                    filename = os.path.basename(file)
+                    extension = os.path.splitext(filename)[1].lower()
+                    mimetype = MIME_TYPE_MAPPING.get(extension)
+                    converted_content = self._html_converter.convert(
+                        f,
+                        StreamInfo(
+                            mimetype=mimetype,
+                            extension=extension,
+                            filename=filename,
+                        ),
+                    )
+                    markdown_content.append(converted_content.markdown.strip())
+
+            # Format and add the metadata
+            metadata_markdown = []
+            for key, value in metadata.items():
+                if isinstance(value, list):
+                    value = ", ".join(value)
+                if value:
+                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")
+
+            markdown_content.insert(0, "\n".join(metadata_markdown))
+
+            return DocumentConverterResult(
+                markdown="\n\n".join(markdown_content), title=metadata["title"]
+            )
+
+    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
+        """Return the text of the first `tag_name` element, or None if there is none."""
+        texts = self._get_all_texts_from_nodes(dom, tag_name)
+        return texts[0] if texts else None
+
+    def _get_all_texts_from_nodes(
+        self, dom: minidom.Document, tag_name: str
+    ) -> List[str]:
+        """Return the stripped first-child text of each `tag_name` element (e.g., authors)."""
+        texts: List[str] = []
+        for node in dom.getElementsByTagName(tag_name):
+            # nodeValue is None when the first child is an element rather than text.
+            if node.firstChild and node.firstChild.nodeValue is not None:
+                texts.append(node.firstChild.nodeValue.strip())
+        return texts
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 5d2b2fc..8610108 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -211,4 +211,22 @@ GENERAL_TEST_VECTORS = [
         ],
         must_not_include=[],
     ),
+    FileTestVector(
+        filename="test.epub",
+        mimetype="application/epub+zip",
+        charset=None,
+        url=None,
+        must_include=[
+            "**Authors:** Test Author",
+            "A test EPUB document for MarkItDown testing",
+            "# Chapter 1: Test Content",
+            "This is a **test** paragraph with some formatting",
+            "* A bullet point",
+            "* Another point",
+            "# Chapter 2: More Content",
+            "*different* style",
+            "> This is a blockquote for testing",
+        ],
+        must_not_include=[],
+    ),
 ]