8 Commits

Author        SHA1        Message                                                  Date
afourney      a93e0567e6  EPub Support. Adapted #123 to not use epublib. (#1131)   2025-03-17 07:48:15 -07:00
                          * Adapted #123 to not use epublib.
                          * Updated README.md
afourney      c5f70b904f  Have magika read from the stream. (#1136)                2025-03-17 07:39:19 -07:00
afourney      53834fdd24  Investigate and silence warnings. (#1133)                2025-03-15 23:41:35 -07:00
afourney      5c565b7d79  Fix remaining mypy errors. (#1132)                       2025-03-15 23:12:48 -07:00
afourney      a78857bd43  Added epub test file. (#1130)                            2025-03-15 18:34:51 -07:00
afourney      09df7fe8df  Small fixes for autogen integration. (#1124)             2025-03-12 19:18:11 -07:00
Adam Fourney  6a9f09b153  Updated Magika dependency.                               2025-03-12 16:15:33 -07:00
afourney      0b815fb916  Bumping version to 0.1.0a2 (#1123)                       2025-03-12 11:44:19 -07:00
17 changed files with 248 additions and 46 deletions

View File

@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markdown
 At present, MarkItDown supports:
 - PDF
-- PowerPoint (reading in top-to-bottom, left-to-right order)
+- PowerPoint
 - Word
 - Excel
 - Images (EXIF metadata and OCR)
@@ -23,6 +23,7 @@ At present, MarkItDown supports:
 - Text-based formats (CSV, JSON, XML)
 - ZIP files (iterates over contents)
 - Youtube URLs
+- EPubs
 - ... and more!

 ## Why Markdown?

View File

@@ -27,7 +27,7 @@ dependencies = [
   "beautifulsoup4",
   "requests",
   "markdownify",
-  "magika>=0.6.0rc1",
+  "magika>=0.6.1rc3",
   "charset-normalizer",
 ]

View File

@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.0a1"
+__version__ = "0.1.0a4"

View File

@@ -139,7 +139,7 @@ def main():
     else:
         charset_hint = None

-    stream_info: str | None = None
+    stream_info = None
     if (
         extension_hint is not None
         or mime_type_hint is not None

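The dropped annotation was one source of the mypy errors: judging from the surrounding code, the variable is later assigned a StreamInfo built from the CLI hints, so declaring it "str | None" makes that assignment type-incompatible. A minimal reproduction of the complaint (values hypothetical):

    from markitdown._stream_info import StreamInfo

    stream_info: str | None = None
    # mypy: Incompatible types in assignment (expression has type "StreamInfo",
    # variable has type "str | None")
    stream_info = StreamInfo(extension=".pdf")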
View File

@@ -38,6 +38,7 @@ from .converters import (
     AudioConverter,
     OutlookMsgConverter,
     ZipConverter,
+    EpubConverter,
     DocumentIntelligenceConverter,
 )

@@ -191,6 +192,7 @@ class MarkItDown:
         self.register_converter(IpynbConverter())
         self.register_converter(PdfConverter())
         self.register_converter(OutlookMsgConverter())
+        self.register_converter(EpubConverter())

         # Register Document Intelligence converter at the top of the stack if endpoint is provided
         docintel_endpoint = kwargs.get("docintel_endpoint")

@@ -610,14 +612,16 @@ class MarkItDown:
        # Call magika to guess from the stream
        cur_pos = file_stream.tell()
        try:
-           stream_bytes = file_stream.read()
-           result = self._magika.identify_bytes(stream_bytes)
+           result = self._magika.identify_stream(file_stream)

            if result.status == "ok" and result.prediction.output.label != "unknown":
                # If it's text, also guess the charset
                charset = None
                if result.prediction.output.is_text:
-                   charset_result = charset_normalizer.from_bytes(stream_bytes).best()
+                   # Read the first 4k to guess the charset
+                   file_stream.seek(cur_pos)
+                   stream_page = file_stream.read(4096)
+                   charset_result = charset_normalizer.from_bytes(stream_page).best()
                    if charset_result is not None:
                        charset = self._normalize_charset(charset_result.encoding)

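Taken together, the two changes above stop reading the entire stream into memory: magika now consumes the stream directly, and only the first 4 KiB is re-read for charset detection. A minimal sketch of the resulting flow, standalone rather than inside the MarkItDown class, with error handling elided (function name is illustrative):

    import charset_normalizer
    from magika import Magika

    def guess_type_and_charset(file_stream):
        """Guess (type_label, charset) without consuming the caller's stream."""
        magika = Magika()
        cur_pos = file_stream.tell()
        try:
            result = magika.identify_stream(file_stream)
            charset = None
            if result.status == "ok" and result.prediction.output.is_text:
                # Re-read only the first 4k; enough for charset detection
                file_stream.seek(cur_pos)
                best = charset_normalizer.from_bytes(file_stream.read(4096)).best()
                if best is not None:
                    charset = best.encoding
            return result.prediction.output.label, charset
        finally:
            file_stream.seek(cur_pos)  # always restore the caller's position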
View File

@@ -18,6 +18,7 @@ from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
+from ._epub_converter import EpubConverter

 __all__ = [
     "PlainTextConverter",

@@ -37,4 +38,5 @@ __all__ = [
     "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
+    "EpubConverter",
 ]

View File

@@ -1,6 +1,7 @@
 import io
 import re
 import base64
+import binascii
 from urllib.parse import parse_qs, urlparse
 from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup

@@ -60,6 +61,8 @@ class BingSerpConverter(DocumentConverter):
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        assert stream_info.url is not None
+
         # Parse the query parameters
         parsed_params = parse_qs(urlparse(stream_info.url).query)
         query = parsed_params.get("q", [""])[0]

@@ -79,6 +82,9 @@ class BingSerpConverter(DocumentConverter):
         _markdownify = _CustomMarkdownify()
         results = list()
         for result in soup.find_all(class_="b_algo"):
+            if not hasattr(result, "find_all"):
+                continue
+
             # Rewrite redirect urls
             for a in result.find_all("a", href=True):
                 parsed_href = urlparse(a["href"])

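The hasattr guard above works around the broad return type of BeautifulSoup's find_all(): its results are typed as PageElement, and only Tag objects (not NavigableString) support nested find_all(), which is what mypy flags. An isinstance check expresses the same idea; a small self-contained illustration:

    import bs4

    soup = bs4.BeautifulSoup(
        '<div class="b_algo"><a href="https://example.com/">hit</a></div>',
        "html.parser",
    )
    for result in soup.find_all(class_="b_algo"):
        if not isinstance(result, bs4.Tag):  # equivalent to the hasattr guard
            continue
        for a in result.find_all("a", href=True):
            print(a["href"])  # -> https://example.com/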
View File

@@ -0,0 +1,147 @@
import os
import zipfile
import xml.dom.minidom as minidom
from typing import BinaryIO, Any, Dict, List
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/epub",
"application/epub+zip",
"application/x-epub+zip",
]
ACCEPTED_FILE_EXTENSIONS = [".epub"]
MIME_TYPE_MAPPING = {
".html": "text/html",
".xhtml": "application/xhtml+xml",
}
class EpubConverter(HtmlConverter):
"""
Converts EPUB files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
"""
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
with zipfile.ZipFile(file_stream, "r") as z:
# Extracts metadata (title, authors, language, publisher, date, description, cover) from an EPUB file."""
# Locate content.opf
container_dom = minidom.parse(z.open("META-INF/container.xml"))
opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
"full-path"
)
# Parse content.opf
opf_dom = minidom.parse(z.open(opf_path))
metadata: Dict[str, Any] = {
"title": self._get_text_from_node(opf_dom, "dc:title"),
"authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
"language": self._get_text_from_node(opf_dom, "dc:language"),
"publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
"date": self._get_text_from_node(opf_dom, "dc:date"),
"description": self._get_text_from_node(opf_dom, "dc:description"),
"identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
}
# Extract manifest items (ID → href mapping)
manifest = {
item.getAttribute("id"): item.getAttribute("href")
for item in opf_dom.getElementsByTagName("item")
}
# Extract spine order (ID refs)
spine_items = opf_dom.getElementsByTagName("itemref")
spine_order = [item.getAttribute("idref") for item in spine_items]
# Convert spine order to actual file paths
base_path = "/".join(
opf_path.split("/")[:-1]
) # Get base directory of content.opf
spine = [
f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
for item_id in spine_order
if item_id in manifest
]
# Extract and convert the content
markdown_content: List[str] = []
for file in spine:
if file in z.namelist():
with z.open(file) as f:
filename = os.path.basename(file)
extension = os.path.splitext(filename)[1].lower()
mimetype = MIME_TYPE_MAPPING.get(extension)
converted_content = self._html_converter.convert(
f,
StreamInfo(
mimetype=mimetype,
extension=extension,
filename=filename,
),
)
markdown_content.append(converted_content.markdown.strip())
# Format and add the metadata
metadata_markdown = []
for key, value in metadata.items():
if isinstance(value, list):
value = ", ".join(value)
if value:
metadata_markdown.append(f"**{key.capitalize()}:** {value}")
markdown_content.insert(0, "\n".join(metadata_markdown))
return DocumentConverterResult(
markdown="\n\n".join(markdown_content), title=metadata["title"]
)
def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
if len(texts) > 0:
return texts[0]
else:
return None
def _get_all_texts_from_nodes(
self, dom: minidom.Document, tag_name: str
) -> List[str]:
"""Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
texts: List[str] = []
for node in dom.getElementsByTagName(tag_name):
if node.firstChild and hasattr(node.firstChild, "nodeValue"):
texts.append(node.firstChild.nodeValue.strip())
return texts

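Since the new converter is registered with MarkItDown (see the _markitdown.py hunk above), EPUB files flow through the usual dispatch and need nothing special at the call site. A quick usage sketch (file name hypothetical):

    from markitdown import MarkItDown

    md = MarkItDown()
    result = md.convert("book.epub")  # routed to EpubConverter by extension/mimetype
    print(result.title)     # dc:title pulled from content.opf
    print(result.markdown)  # metadata block, then the spine chapters in order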
View File

@@ -9,7 +9,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 _dependency_exc_info = None
 olefile = None
 try:
-    import olefile
+    import olefile  # type: ignore[no-redef]
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()

@@ -56,12 +56,13 @@ class OutlookMsgConverter(DocumentConverter):
         # Brute force, check if it's an Outlook file
         try:
-            msg = olefile.OleFileIO(file_stream)
-            toc = "\n".join([str(stream) for stream in msg.listdir()])
-            return (
-                "__properties_version1.0" in toc
-                and "__recip_version1.0_#00000000" in toc
-            )
+            if olefile is not None:
+                msg = olefile.OleFileIO(file_stream)
+                toc = "\n".join([str(stream) for stream in msg.listdir()])
+                return (
+                    "__properties_version1.0" in toc
+                    and "__recip_version1.0_#00000000" in toc
+                )
         except Exception as e:
             pass
         finally:

@@ -89,7 +90,11 @@ class OutlookMsgConverter(DocumentConverter):
                 _dependency_exc_info[2]
             )

+        assert (
+            olefile is not None
+        )  # If we made it this far, olefile should be available
+
         msg = olefile.OleFileIO(file_stream)

         # Extract email metadata
         md_content = "# Email Message\n\n"

@@ -121,6 +126,7 @@ class OutlookMsgConverter(DocumentConverter):
     def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
         """Helper to safely extract and decode stream data from the MSG file."""
+        assert olefile is not None
         assert isinstance(
             msg, olefile.OleFileIO
         )  # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)

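The added asserts round out the module's deferred-import pattern: the ImportError is captured once at import time and re-raised only when the converter is actually used, while the asserts tell mypy that olefile cannot be None past that point. The pattern, distilled (function name illustrative):

    import sys

    _dependency_exc_info = None
    olefile = None
    try:
        import olefile  # type: ignore[no-redef]
    except ImportError:
        # Remember the failure; surface it only if the converter is used
        _dependency_exc_info = sys.exc_info()

    def open_msg(stream):
        if _dependency_exc_info is not None:
            # Re-raise the original ImportError with its original traceback
            raise _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
        assert olefile is not None  # reaching here means the import succeeded
        return olefile.OleFileIO(stream)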
View File

@@ -66,7 +66,7 @@ class RssConverter(DocumentConverter):
         file_stream.seek(cur_pos)
         return False

-    def _feed_type(self, doc: Any) -> str:
+    def _feed_type(self, doc: Any) -> str | None:
         if doc.getElementsByTagName("rss"):
             return "rss"
         elif doc.getElementsByTagName("feed"):

@@ -130,10 +130,10 @@ class RssConverter(DocumentConverter):
         Returns None if the feed type is not recognized or something goes wrong.
         """
         root = doc.getElementsByTagName("rss")[0]
-        channel = root.getElementsByTagName("channel")
-        if not channel:
-            return None
-        channel = channel[0]
+        channel_list = root.getElementsByTagName("channel")
+        if not channel_list:
+            raise ValueError("No channel found in RSS feed")
+        channel = channel_list[0]
         channel_title = self._get_data_by_tag_name(channel, "title")
         channel_description = self._get_data_by_tag_name(channel, "description")
         items = channel.getElementsByTagName("item")

@@ -141,8 +141,6 @@ class RssConverter(DocumentConverter):
         md_text = f"# {channel_title}\n"
         if channel_description:
             md_text += f"{channel_description}\n"
-        if not items:
-            items = []
         for item in items:
             title = self._get_data_by_tag_name(item, "title")
             description = self._get_data_by_tag_name(item, "description")

@@ -183,5 +181,6 @@ class RssConverter(DocumentConverter):
             return None
         fc = nodes[0].firstChild
         if fc:
-            return fc.data
+            if hasattr(fc, "data"):
+                return fc.data
         return None

View File

@@ -7,7 +7,19 @@ from .._exceptions import MissingDependencyException
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
+    # Suppress some deprecation warnings from the speech_recognition library
+    import warnings
+
+    warnings.filterwarnings(
+        "ignore", category=DeprecationWarning, module="speech_recognition"
+    )
+    warnings.filterwarnings(
+        "ignore",
+        category=SyntaxWarning,
+        module="pydub",  # TODO: Migrate away from pydub
+    )
+
     import speech_recognition as sr
     import pydub
 except ImportError:
     # Preserve the error and stack trace for later

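Placement matters here: the module argument of warnings.filterwarnings is a regex matched against the name of the module that triggers the warning, and import-time warnings fire while the module loads, so the filters must be installed before the imports they silence. A standalone illustration of the same idiom:

    import warnings

    # Install the filter BEFORE the noisy import; a SyntaxWarning raised while
    # pydub is being loaded would otherwise slip through.
    warnings.filterwarnings("ignore", category=SyntaxWarning, module="pydub")

    import pydub  # noqa: E402  # imports quietly now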
View File

@@ -1,7 +1,7 @@
 import io
 import re
+import bs4
 from typing import Any, BinaryIO, Optional
-from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo

@@ -57,7 +57,7 @@ class WikipediaConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
-        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

         # Remove javascript and style blocks
         for script in soup(["script", "style"]):

@@ -72,9 +72,8 @@ class WikipediaConverter(DocumentConverter):
         if body_elm:
             # What's the title
-            if title_elm and len(title_elm) > 0:
-                main_title = title_elm.string  # type: ignore
-                assert isinstance(main_title, str)
+            if title_elm and isinstance(title_elm, bs4.Tag):
+                main_title = title_elm.string

             # Convert the page
             webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(

View File

@@ -3,9 +3,10 @@ import json
 import time
 import io
 import re
+import bs4
+import warnings
 from typing import Any, BinaryIO, Optional, Dict, List, Union
 from urllib.parse import parse_qs, urlparse, unquote
-from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo

@@ -13,6 +14,11 @@ from ._markdownify import _CustomMarkdownify

 # Optional YouTube transcription support
 try:
+    warnings.filterwarnings(
+        "ignore",
+        category=SyntaxWarning,
+        module="youtube_transcript_api",  # Patch submitted to youtube-transcript-api
+    )
     from youtube_transcript_api import YouTubeTranscriptApi

     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True

@@ -72,21 +78,31 @@ class YouTubeConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
-        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

         # Read the meta tags
-        metadata: Dict[str, str] = {"title": soup.title.string}
+        metadata: Dict[str, str] = {}
+        if soup.title and soup.title.string:
+            metadata["title"] = soup.title.string
+
         for meta in soup(["meta"]):
+            if not isinstance(meta, bs4.Tag):
+                continue
             for a in meta.attrs:
                 if a in ["itemprop", "property", "name"]:
-                    content = meta.get("content", "")
-                    if content:  # Only add non-empty content
-                        metadata[meta[a]] = content
+                    key = str(meta.get(a, ""))
+                    content = str(meta.get("content", ""))
+                    if key and content:  # Only add non-empty content
+                        metadata[key] = content
                     break

         # Try reading the description
         try:
             for script in soup(["script"]):
+                if not isinstance(script, bs4.Tag):
+                    continue
+                if not script.string:  # Skip empty scripts
+                    continue
                 content = script.string

@@ -161,7 +177,7 @@ class YouTubeConverter(DocumentConverter):
             if transcript_text:
                 webpage_text += f"\n### Transcript\n{transcript_text}\n"

-        title = title if title else soup.title.string
+        title = title if title else (soup.title.string if soup.title else "")
         assert isinstance(title, str)

         return DocumentConverterResult(

View File

@@ -211,4 +211,22 @@ GENERAL_TEST_VECTORS = [
         ],
         must_not_include=[],
     ),
+    FileTestVector(
+        filename="test.epub",
+        mimetype="application/epub+zip",
+        charset=None,
+        url=None,
+        must_include=[
+            "**Authors:** Test Author",
+            "A test EPUB document for MarkItDown testing",
+            "# Chapter 1: Test Content",
+            "This is a **test** paragraph with some formatting",
+            "* A bullet point",
+            "* Another point",
+            "# Chapter 2: More Content",
+            "*different* style",
+            "> This is a blockquote for testing",
+        ],
+        must_not_include=[],
+    ),
 ]

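Each FileTestVector bundles an input file, optional stream hints, and the substrings its Markdown output must (or must not) contain; the test modules below just convert and check membership. A condensed sketch of that consuming loop (helper name and directory variable assumed):

    import os

    def check_vector(markitdown, test_files_dir, vector):
        with open(os.path.join(test_files_dir, vector.filename), "rb") as stream:
            result = markitdown.convert(stream, url=vector.url)
        for text in vector.must_include:
            assert text in result.markdown
        for text in vector.must_not_include:
            assert text not in result.markdown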
View File

@@ -114,7 +114,9 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
     )

     stdout = result.stdout.decode(locale.getpreferredencoding())
-    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    assert (
+        result.returncode == 0
+    ), f"CLI exited with error: {result.stderr.decode('utf-8')}"

     for test_string in test_vector.must_include:
         assert test_string in stdout
     for test_string in test_vector.must_not_include:

Binary file not shown.

View File

@@ -47,7 +47,6 @@ def test_guess_stream_info(test_vector):
     # mimetype or extension, so we'll special-case them here.
     if test_vector.filename in [
         "test_outlook_msg.msg",
-        "test_mskanji.csv",  # See: https://github.com/google/magika/issues/983
     ]:
         return

@@ -96,15 +95,6 @@ def test_convert_stream_without_hints(test_vector):
     """Test the conversion of a stream with no stream info."""
     markitdown = MarkItDown()

-    # For some limited exceptions, we can't guarantee the exact
-    # mimetype or extension, so we'll special-case them here.
-    if test_vector.filename in [
-        # This appears to be a subtle bug in magika.
-        # See: https://github.com/google/magika/issues/983
-        "test_mskanji.csv",
-    ]:
-        return
-
     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
         result = markitdown.convert(stream, url=test_vector.url)
         for string in test_vector.must_include: