EPub Support. Adapted #123 to not use epublib. (#1131)

* Adapted #123 to not use epublib. * Updated README.md
2025-03-17 07:48:15 -07:00
parent c5f70b904f
commit ce3206fffe
5 changed files with 171 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
 At present, MarkItDown supports:

 - PDF
- PowerPoint (reading in top-to-bottom, left-to-right order)
+- PowerPoint
 - Word
 - Excel
 - Images (EXIF metadata and OCR)
@@ -23,6 +23,7 @@ At present, MarkItDown supports:
 - Text-based formats (CSV, JSON, XML)
 - ZIP files (iterates over contents)
 - Youtube URLs
+- EPubs
 - ... and more!

 ## Why Markdown?
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -38,6 +38,7 @@ from .converters import (
    AudioConverter,
    OutlookMsgConverter,
    ZipConverter,
+    EpubConverter,
    DocumentIntelligenceConverter,
 )

@@ -191,6 +192,7 @@ class MarkItDown:
            self.register_converter(IpynbConverter())
            self.register_converter(PdfConverter())
            self.register_converter(OutlookMsgConverter())
+            self.register_converter(EpubConverter())

            # Register Document Intelligence converter at the top of the stack if endpoint is provided
            docintel_endpoint = kwargs.get("docintel_endpoint")
--- a/packages/markitdown/src/markitdown/converters/__init__.py
+++ b/packages/markitdown/src/markitdown/converters/__init__.py
@@ -18,6 +18,7 @@ from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._epub_converter import EpubConverter

 __all__ = [
    "PlainTextConverter",
@@ -37,4 +38,5 @@ __all__ = [
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
+    "EpubConverter",
 ]
--- a/packages/markitdown/src/markitdown/converters/_epub_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -0,0 +1,147 @@
+import os
+import zipfile
+import xml.dom.minidom as minidom
+
+from typing import BinaryIO, Any, Dict, List
+
+from ._html_converter import HtmlConverter
+from .._base_converter import DocumentConverter, DocumentConverterResult
+from .._stream_info import StreamInfo
+
+# MIME type prefixes that identify an EPUB stream. EpubConverter.accepts matches
+# them with str.startswith, so "application/epub" already covers
+# "application/epub+zip"; the longer entries are kept for readability.
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "application/epub",
+    "application/epub+zip",
+    "application/x-epub+zip",
+]
+
+# File extensions (lowercase, with the leading dot) accepted regardless of MIME type.
+ACCEPTED_FILE_EXTENSIONS = [".epub"]
+
+# MIME type handed to the HTML converter for each spine document, keyed by the
+# document's lowercase file extension. Extensions not listed here yield None.
+MIME_TYPE_MAPPING = {
+    ".html": "text/html",
+    ".xhtml": "application/xhtml+xml",
+}
+
+
+class EpubConverter(HtmlConverter):
+    """
+    Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
+
+    The archive's META-INF/container.xml names the OPF package document; the
+    OPF's manifest and spine give the content documents in reading order. Each
+    spine document is converted with an HtmlConverter and the Dublin Core
+    metadata found in the OPF is placed at the top of the output.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Dedicated HtmlConverter instance used to convert each spine document.
+        self._html_converter = HtmlConverter()
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> bool:
+        """Return True when the stream's extension or MIME type identifies an EPUB."""
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        # str.startswith accepts a tuple, so a single call tests every prefix.
+        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,  # Options to pass to the converter
+    ) -> DocumentConverterResult:
+        """
+        Convert an EPUB archive to Markdown.
+
+        The output starts with one "**Key:** value" line per non-empty metadata
+        field, followed by the converted spine documents in spine order,
+        separated by blank lines. Spine entries that are missing from the
+        manifest or from the archive are skipped. The result's title is the
+        first dc:title value, or None when the package declares none.
+
+        Raises:
+            KeyError: META-INF/container.xml or the OPF file is not in the archive.
+            IndexError: container.xml contains no <rootfile> element.
+        """
+        # NOTE(review): xml.dom.minidom is documented as not hardened against
+        # maliciously constructed XML; consider a hardened parser for untrusted EPUBs.
+        with zipfile.ZipFile(file_stream, "r") as z:
+            # Locate content.opf. minidom.parse consumes the whole stream, so
+            # the archive member can be closed as soon as parsing returns.
+            with z.open("META-INF/container.xml") as container_file:
+                container_dom = minidom.parse(container_file)
+            opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
+                "full-path"
+            )
+
+            # Parse content.opf
+            with z.open(opf_path) as opf_file:
+                opf_dom = minidom.parse(opf_file)
+
+            # Extract metadata (title, authors, language, publisher, date,
+            # description, identifier) from the package document.
+            metadata: Dict[str, Any] = {
+                "title": self._get_text_from_node(opf_dom, "dc:title"),
+                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
+                "language": self._get_text_from_node(opf_dom, "dc:language"),
+                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
+                "date": self._get_text_from_node(opf_dom, "dc:date"),
+                "description": self._get_text_from_node(opf_dom, "dc:description"),
+                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
+            }
+
+            # Extract manifest items (ID → href mapping)
+            manifest = {
+                item.getAttribute("id"): item.getAttribute("href")
+                for item in opf_dom.getElementsByTagName("item")
+            }
+
+            # Extract spine order (ID refs)
+            spine_items = opf_dom.getElementsByTagName("itemref")
+            spine_order = [item.getAttribute("idref") for item in spine_items]
+
+            # Convert spine order to actual file paths; manifest hrefs are
+            # relative to the directory that holds content.opf.
+            base_path = "/".join(opf_path.split("/")[:-1])
+            spine = [
+                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
+                for item_id in spine_order
+                if item_id in manifest
+            ]
+
+            # Build the member set once so each membership test below is O(1)
+            # instead of re-listing and scanning the archive per spine item.
+            archive_members = set(z.namelist())
+
+            # Extract and convert the content
+            markdown_content: List[str] = []
+            for file in spine:
+                if file not in archive_members:
+                    continue  # Spine entry refers to a file the archive lacks
+
+                filename = os.path.basename(file)
+                extension = os.path.splitext(filename)[1].lower()
+                mimetype = MIME_TYPE_MAPPING.get(extension)
+                with z.open(file) as f:
+                    converted_content = self._html_converter.convert(
+                        f,
+                        StreamInfo(
+                            mimetype=mimetype,
+                            extension=extension,
+                            filename=filename,
+                        ),
+                    )
+                    markdown_content.append(converted_content.markdown.strip())
+
+            # Format and add the metadata
+            metadata_markdown = []
+            for key, value in metadata.items():
+                if isinstance(value, list):
+                    value = ", ".join(value)
+                if value:
+                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")
+
+            markdown_content.insert(0, "\n".join(metadata_markdown))
+
+            return DocumentConverterResult(
+                markdown="\n\n".join(markdown_content), title=metadata["title"]
+            )
+
+    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
+        """Return the text of the first `tag_name` element (e.g., title), or None if there is none."""
+        texts = self._get_all_texts_from_nodes(dom, tag_name)
+        return texts[0] if texts else None
+
+    def _get_all_texts_from_nodes(
+        self, dom: minidom.Document, tag_name: str
+    ) -> List[str]:
+        """
+        Return the stripped text of every `tag_name` element (e.g., multiple authors).
+
+        Only each element's first child is read. Elements that are empty, or
+        whose first child carries no value (such as a nested element), are skipped.
+        """
+        texts: List[str] = []
+        for node in dom.getElementsByTagName(tag_name):
+            first_child = node.firstChild
+            # nodeValue is None for element children; calling .strip() on it
+            # would raise AttributeError, so such nodes are skipped.
+            if first_child is not None and first_child.nodeValue is not None:
+                texts.append(first_child.nodeValue.strip())
+        return texts
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -211,4 +211,22 @@ GENERAL_TEST_VECTORS = [
        ],
        must_not_include=[],
    ),
+    FileTestVector(
+        filename="test.epub",
+        mimetype="application/epub+zip",
+        charset=None,
+        url=None,
+        must_include=[
+            "**Authors:** Test Author",
+            "A test EPUB document for MarkItDown testing",
+            "# Chapter 1: Test Content",
+            "This is a **test** paragraph with some formatting",
+            "* A bullet point",
+            "* Another point",
+            "# Chapter 2: More Content",
+            "*different* style",
+            "> This is a blockquote for testing",
+        ],
+        must_not_include=[],
+    ),
 ]