1 Commit

Author: Adam Fourney
SHA1: de2c56ffbc
Message: Bumping version to 0.1.0a2
Date: 2025-03-12 11:42:00 -07:00
24 changed files with 91 additions and 487 deletions

View File

@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
At present, MarkItDown supports:
- PDF
- PowerPoint
- PowerPoint (reading in top-to-bottom, left-to-right order)
- Word
- Excel
- Images (EXIF metadata and OCR)
@@ -23,7 +23,6 @@ At present, MarkItDown supports:
- Text-based formats (CSV, JSON, XML)
- ZIP files (iterates over contents)
- YouTube URLs
- EPubs
- ... and more!
## Why Markdown?

View File

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4",
"requests",
"markdownify",
"magika~=0.6.1",
"magika>=0.6.0rc1",
"charset-normalizer",
]
@@ -42,7 +42,7 @@ all = [
"olefile",
"pydub",
"SpeechRecognition",
"youtube-transcript-api~=1.0.0",
"youtube-transcript-api",
"azure-ai-documentintelligence",
"azure-identity"
]
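Note the loosened pins here: `magika~=0.6.1` (a PEP 440 compatible-release pin) becomes `magika>=0.6.0rc1`, and the `youtube-transcript-api` pin is dropped entirely. A quick sketch of what the `~=` operator accepts, using the `packaging` library (the probe versions are illustrative):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet("~=0.6.1")          # equivalent to >=0.6.1, ==0.6.*
print(spec.contains(Version("0.6.2")))  # True: patch releases match
print(spec.contains(Version("0.7.0")))  # False: minor bumps do not
```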

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.0a6"
__version__ = "0.1.0a2"

View File

@@ -4,7 +4,6 @@
import argparse
import sys
import codecs
import locale
from textwrap import dedent
from importlib.metadata import entry_points
from .__about__ import __version__
@@ -105,12 +104,6 @@ def main():
help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
)
parser.add_argument(
"--keep-data-uris",
action="store_true",
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
@@ -146,7 +139,7 @@ def main():
else:
charset_hint = None
stream_info = None
stream_info: str | None = None
if (
extension_hint is not None
or mime_type_hint is not None
@@ -188,15 +181,9 @@ def main():
markitdown = MarkItDown(enable_plugins=args.use_plugins)
if args.filename is None:
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
)
result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
else:
result = markitdown.convert(
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
)
result = markitdown.convert(args.filename, stream_info=stream_info)
_handle_output(args, result)
@@ -205,14 +192,9 @@ def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.markdown)
f.write(result.text_content)
else:
# Handle stdout encoding errors more gracefully
print(
result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
sys.stdout.encoding
)
)
print(result.text_content)
def _exit_with_error(message: str):
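The removed `_handle_output` branch guarded against console encodings that can't represent every character in the Markdown. The pattern it used, as a standalone sketch (the helper name is hypothetical):

```python
import sys

def safe_print(text: str) -> None:
    # Characters the console encoding cannot represent become "?"
    # instead of raising UnicodeEncodeError.
    encoding = sys.stdout.encoding or "utf-8"
    print(text.encode(encoding, errors="replace").decode(encoding))
```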

View File

@@ -38,7 +38,6 @@ from .converters import (
AudioConverter,
OutlookMsgConverter,
ZipConverter,
EpubConverter,
DocumentIntelligenceConverter,
)
@@ -192,7 +191,6 @@ class MarkItDown:
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
self.register_converter(EpubConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
@@ -612,16 +610,14 @@ class MarkItDown:
# Call magika to guess from the stream
cur_pos = file_stream.tell()
try:
result = self._magika.identify_stream(file_stream)
stream_bytes = file_stream.read()
result = self._magika.identify_bytes(stream_bytes)
if result.status == "ok" and result.prediction.output.label != "unknown":
# If it's text, also guess the charset
charset = None
if result.prediction.output.is_text:
# Read the first 4k to guess the charset
file_stream.seek(cur_pos)
stream_page = file_stream.read(4096)
charset_result = charset_normalizer.from_bytes(stream_page).best()
charset_result = charset_normalizer.from_bytes(stream_bytes).best()
if charset_result is not None:
charset = self._normalize_charset(charset_result.encoding)
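This hunk trades magika's stream API for a whole-buffer read, which also lets the charset guess reuse the same bytes instead of re-reading the first 4 KB. A condensed sketch of the resulting flow (the function name and return shape are assumptions):

```python
import charset_normalizer
from magika import Magika

def sniff_stream(stream) -> tuple[str, str | None]:
    pos = stream.tell()
    data = stream.read()
    stream.seek(pos)  # restore the caller's position
    result = Magika().identify_bytes(data)
    label = result.prediction.output.label
    charset = None
    if result.status == "ok" and result.prediction.output.is_text:
        # Guess the charset from the same buffer.
        best = charset_normalizer.from_bytes(data).best()
        if best is not None:
            charset = best.encoding
    return label, charset
```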

View File

@@ -18,7 +18,6 @@ from ._audio_converter import AudioConverter
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter
from ._epub_converter import EpubConverter
__all__ = [
"PlainTextConverter",
@@ -38,5 +37,4 @@ __all__ = [
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",
"EpubConverter",
]

View File

@@ -1,7 +1,6 @@
import io
import re
import base64
import binascii
from urllib.parse import parse_qs, urlparse
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
@@ -61,8 +60,6 @@ class BingSerpConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
assert stream_info.url is not None
# Parse the query parameters
parsed_params = parse_qs(urlparse(stream_info.url).query)
query = parsed_params.get("q", [""])[0]
@@ -79,12 +76,9 @@ class BingSerpConverter(DocumentConverter):
slug.extract()
# Parse the algorithmic results
_markdownify = _CustomMarkdownify(**kwargs)
_markdownify = _CustomMarkdownify()
results = list()
for result in soup.find_all(class_="b_algo"):
if not hasattr(result, "find_all"):
continue
# Rewrite redirect urls
for a in result.find_all("a", href=True):
parsed_href = urlparse(a["href"])
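The query extraction above is plain standard-library URL parsing; for instance (the URL is illustrative):

```python
from urllib.parse import parse_qs, urlparse

url = "https://www.bing.com/search?q=markitdown"
query = parse_qs(urlparse(url).query).get("q", [""])[0]
print(query)  # -> markitdown
```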

View File

@@ -73,5 +73,5 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None)
return self._html_converter.convert_string(
mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
mammoth.convert_to_html(file_stream, style_map=style_map).value
)
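For context, the DOCX-to-HTML hop is delegated to mammoth, whose result is then run through the HTML converter. A minimal standalone sketch (the file name is hypothetical):

```python
import mammoth

with open("document.docx", "rb") as f:
    html = mammoth.convert_to_html(f).value  # HTML string, ready for Markdown conversion
```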

View File

@@ -1,147 +0,0 @@
import os
import zipfile
import xml.dom.minidom as minidom
from typing import BinaryIO, Any, Dict, List
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
ACCEPTED_MIME_TYPE_PREFIXES = [
"application/epub",
"application/epub+zip",
"application/x-epub+zip",
]
ACCEPTED_FILE_EXTENSIONS = [".epub"]
MIME_TYPE_MAPPING = {
".html": "text/html",
".xhtml": "application/xhtml+xml",
}
class EpubConverter(HtmlConverter):
"""
Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""
def __init__(self):
super().__init__()
self._html_converter = HtmlConverter()
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
with zipfile.ZipFile(file_stream, "r") as z:
# Extract metadata (title, authors, language, publisher, date, description) from the EPUB file
# Locate content.opf
container_dom = minidom.parse(z.open("META-INF/container.xml"))
opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
"full-path"
)
# Parse content.opf
opf_dom = minidom.parse(z.open(opf_path))
metadata: Dict[str, Any] = {
"title": self._get_text_from_node(opf_dom, "dc:title"),
"authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
"language": self._get_text_from_node(opf_dom, "dc:language"),
"publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
"date": self._get_text_from_node(opf_dom, "dc:date"),
"description": self._get_text_from_node(opf_dom, "dc:description"),
"identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
}
# Extract manifest items (ID → href mapping)
manifest = {
item.getAttribute("id"): item.getAttribute("href")
for item in opf_dom.getElementsByTagName("item")
}
# Extract spine order (ID refs)
spine_items = opf_dom.getElementsByTagName("itemref")
spine_order = [item.getAttribute("idref") for item in spine_items]
# Convert spine order to actual file paths
base_path = "/".join(
opf_path.split("/")[:-1]
) # Get base directory of content.opf
spine = [
f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
for item_id in spine_order
if item_id in manifest
]
# Extract and convert the content
markdown_content: List[str] = []
for file in spine:
if file in z.namelist():
with z.open(file) as f:
filename = os.path.basename(file)
extension = os.path.splitext(filename)[1].lower()
mimetype = MIME_TYPE_MAPPING.get(extension)
converted_content = self._html_converter.convert(
f,
StreamInfo(
mimetype=mimetype,
extension=extension,
filename=filename,
),
)
markdown_content.append(converted_content.markdown.strip())
# Format and add the metadata
metadata_markdown = []
for key, value in metadata.items():
if isinstance(value, list):
value = ", ".join(value)
if value:
metadata_markdown.append(f"**{key.capitalize()}:** {value}")
markdown_content.insert(0, "\n".join(metadata_markdown))
return DocumentConverterResult(
markdown="\n\n".join(markdown_content), title=metadata["title"]
)
def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
if len(texts) > 0:
return texts[0]
else:
return None
def _get_all_texts_from_nodes(
self, dom: minidom.Document, tag_name: str
) -> List[str]:
"""Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
texts: List[str] = []
for node in dom.getElementsByTagName(tag_name):
if node.firstChild and hasattr(node.firstChild, "nodeValue"):
texts.append(node.firstChild.nodeValue.strip())
return texts
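The commit deletes this converter outright. For reference, the OPF-location step it performed is standard EPUB plumbing: every EPUB names its package document in `META-INF/container.xml`. A minimal sketch (the book file name is hypothetical):

```python
import zipfile
import xml.dom.minidom as minidom

with zipfile.ZipFile("book.epub") as z:
    container = minidom.parse(z.open("META-INF/container.xml"))
    opf_path = container.getElementsByTagName("rootfile")[0].getAttribute("full-path")
    print(opf_path)  # e.g. "OEBPS/content.opf"
```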

View File

@@ -56,9 +56,9 @@ class HtmlConverter(DocumentConverter):
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
webpage_text = _CustomMarkdownify().convert_soup(soup)
assert isinstance(webpage_text, str)

View File

@@ -17,7 +17,6 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX)
options["keep_data_uris"] = options.get("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
@@ -102,7 +101,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return alt
# Remove dataURIs
if src.startswith("data:") and not self.options["keep_data_uris"]:
if src.startswith("data:"):
src = src.split(",")[0] + "..."
return "![%s](%s%s)" % (alt, src, title_part)

View File

@@ -9,7 +9,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
_dependency_exc_info = None
olefile = None
try:
import olefile # type: ignore[no-redef]
import olefile
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
@@ -56,13 +56,12 @@ class OutlookMsgConverter(DocumentConverter):
# Brute force: check if it's an Outlook file
try:
if olefile is not None:
msg = olefile.OleFileIO(file_stream)
toc = "\n".join([str(stream) for stream in msg.listdir()])
return (
"__properties_version1.0" in toc
and "__recip_version1.0_#00000000" in toc
)
msg = olefile.OleFileIO(file_stream)
toc = "\n".join([str(stream) for stream in msg.listdir()])
return (
"__properties_version1.0" in toc
and "__recip_version1.0_#00000000" in toc
)
except Exception as e:
pass
finally:
@@ -90,11 +89,7 @@ class OutlookMsgConverter(DocumentConverter):
_dependency_exc_info[2]
)
assert (
olefile is not None
) # If we made it this far, olefile should be available
msg = olefile.OleFileIO(file_stream)
# Extract email metadata
md_content = "# Email Message\n\n"
@@ -126,7 +121,6 @@ class OutlookMsgConverter(DocumentConverter):
def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
"""Helper to safely extract and decode stream data from the MSG file."""
assert olefile is not None
assert isinstance(
msg, olefile.OleFileIO
) # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
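The brute-force acceptance test above boils down to inspecting the OLE stream listing for MAPI property streams. As a standalone sketch (the file name is hypothetical):

```python
import olefile

with open("message.msg", "rb") as f:
    ole = olefile.OleFileIO(f)
    toc = "\n".join(str(stream) for stream in ole.listdir())
    is_outlook_msg = "__properties_version1.0" in toc
```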

View File

@@ -17,16 +17,12 @@ except ImportError:
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/",
"application/json",
"application/markdown",
]
ACCEPTED_FILE_EXTENSIONS = [
".txt",
".text",
".md",
".markdown",
".json",
".jsonl",
# Mimetypes to ignore (commonly confused extensions)
IGNORE_MIME_TYPE_PREFIXES = [
"text/vnd.in3d.spot", # .spo, which is confused with xls, doc, etc.
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
]
@@ -42,14 +38,9 @@ class PlainTextConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
# If we have a charset, we can safely assume it's text
# With Magika in the earlier stages, this handles most cases
if stream_info.charset is not None:
return True
# Otherwise, check the mimetype and extension
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in IGNORE_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return False
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
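After this change the converter trusts a detected charset first, consults the ignore list, then falls back to the accepted prefixes. Condensed as a sketch (the function name is assumed, and the constants are abbreviated):

```python
def accepts_plain_text(mimetype: str, charset: str | None) -> bool:
    if charset is not None:  # a detected charset already implies text
        return True
    if any(mimetype.startswith(p) for p in ("text/vnd.in3d.spot", "text/vnd.graphviz")):
        return False         # commonly confused with xls, doc, etc.
    return any(mimetype.startswith(p) for p in ("text/", "application/json"))
```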

View File

@@ -140,20 +140,13 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip()
# If keep_data_uris is True, use base64 encoding for images
if kwargs.get("keep_data_uris", False):
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
else:
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
# Tables
if self._is_table(shape):
md_content += self._convert_table_to_markdown(shape.table, **kwargs)
md_content += self._convert_table_to_markdown(shape.table)
# Charts
if shape.has_chart:
@@ -200,7 +193,7 @@ class PptxConverter(DocumentConverter):
return True
return False
def _convert_table_to_markdown(self, table, **kwargs):
def _convert_table_to_markdown(self, table):
# Write the table as HTML, then convert it to Markdown
html_table = "<html><body><table>"
first_row = True
@@ -215,10 +208,7 @@ class PptxConverter(DocumentConverter):
first_row = False
html_table += "</table></body></html>"
return (
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+ "\n"
)
return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
def _convert_chart_to_markdown(self, chart):
try:
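The removed branch embedded the slide image inline as a data URI; the pattern it used (the blob bytes here are placeholders):

```python
import base64

blob = b"\x89PNG\r\n..."  # placeholder image bytes
content_type = "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md = f"![alt text](data:{content_type};base64,{b64_string})"
```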

View File

@@ -28,10 +28,6 @@ CANDIDATE_FILE_EXTENSIONS = [
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
def __init__(self):
super().__init__()
self._kwargs = {}
def accepts(
self,
file_stream: BinaryIO,
@@ -70,7 +66,7 @@ class RssConverter(DocumentConverter):
file_stream.seek(cur_pos)
return False
def _feed_type(self, doc: Any) -> str | None:
def _feed_type(self, doc: Any) -> str:
if doc.getElementsByTagName("rss"):
return "rss"
elif doc.getElementsByTagName("feed"):
@@ -86,7 +82,6 @@ class RssConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
self._kwargs = kwargs
doc = minidom.parse(file_stream)
feed_type = self._feed_type(doc)
@@ -135,10 +130,10 @@ class RssConverter(DocumentConverter):
Returns None if the feed type is not recognized or something goes wrong.
"""
root = doc.getElementsByTagName("rss")[0]
channel_list = root.getElementsByTagName("channel")
if not channel_list:
raise ValueError("No channel found in RSS feed")
channel = channel_list[0]
channel = root.getElementsByTagName("channel")
if not channel:
return None
channel = channel[0]
channel_title = self._get_data_by_tag_name(channel, "title")
channel_description = self._get_data_by_tag_name(channel, "description")
items = channel.getElementsByTagName("item")
@@ -146,6 +141,8 @@ class RssConverter(DocumentConverter):
md_text = f"# {channel_title}\n"
if channel_description:
md_text += f"{channel_description}\n"
if not items:
items = []
for item in items:
title = self._get_data_by_tag_name(item, "title")
description = self._get_data_by_tag_name(item, "description")
@@ -171,7 +168,7 @@ class RssConverter(DocumentConverter):
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
return _CustomMarkdownify().convert_soup(soup)
except BaseException as _:
return content
@@ -186,6 +183,5 @@ class RssConverter(DocumentConverter):
return None
fc = nodes[0].firstChild
if fc:
if hasattr(fc, "data"):
return fc.data
return fc.data
return None
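For context, the channel lookup above is plain `xml.dom.minidom`; on a toy feed:

```python
import xml.dom.minidom as minidom

xml_text = """<rss><channel>
  <title>Example Feed</title>
  <item><title>First post</title></item>
</channel></rss>"""
doc = minidom.parseString(xml_text)
channel = doc.getElementsByTagName("channel")[0]
title = channel.getElementsByTagName("title")[0].firstChild.data
print(title)  # -> Example Feed
```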

View File

@@ -7,14 +7,8 @@ from .._exceptions import MissingDependencyException
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
# Suppress some warnings on library import
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=SyntaxWarning)
import speech_recognition as sr
import pydub
import speech_recognition as sr
import pydub
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
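The deleted guard wrapped these imports to silence known-noisy warnings from the libraries; as a standalone pattern:

```python
import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=SyntaxWarning)
    import speech_recognition as sr  # optional dependency
    import pydub                     # optional dependency
```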

View File

@@ -1,7 +1,7 @@
import io
import re
import bs4
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
@@ -57,7 +57,7 @@ class WikipediaConverter(DocumentConverter):
) -> DocumentConverterResult:
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Remove javascript and style blocks
for script in soup(["script", "style"]):
@@ -72,15 +72,16 @@ class WikipediaConverter(DocumentConverter):
if body_elm:
# What's the title
if title_elm and isinstance(title_elm, bs4.Tag):
main_title = title_elm.string
if title_elm and len(title_elm) > 0:
main_title = title_elm.string # type: ignore
assert isinstance(main_title, str)
# Convert the page
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
**kwargs
).convert_soup(body_elm)
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
body_elm
)
else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
webpage_text = _CustomMarkdownify().convert_soup(soup)
return DocumentConverterResult(
markdown=webpage_text,

View File

@@ -86,9 +86,7 @@ class XlsxConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
self._html_converter.convert_string(html_content).markdown.strip()
+ "\n\n"
)
@@ -148,9 +146,7 @@ class XlsConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
self._html_converter.convert_string(html_content).markdown.strip()
+ "\n\n"
)
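Each sheet takes the route DataFrame → HTML → Markdown; the first hop is plain pandas (the frame here is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"Name": ["a", "b"], "Value": [1, 2]})
html_content = df.to_html(index=False)  # handed to the HTML converter next
```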

View File

@@ -3,22 +3,17 @@ import json
import time
import io
import re
import bs4
from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote
from bs4 import BeautifulSoup
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
# Optional YouTube transcription support
try:
# Suppress some warnings on library import
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=SyntaxWarning)
# Patch submitted upstream to fix the SyntaxWarning
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api import YouTubeTranscriptApi
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
except ModuleNotFoundError:
@@ -77,31 +72,21 @@ class YouTubeConverter(DocumentConverter):
) -> DocumentConverterResult:
# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
# Read the meta tags
metadata: Dict[str, str] = {}
if soup.title and soup.title.string:
metadata["title"] = soup.title.string
metadata: Dict[str, str] = {"title": soup.title.string}
for meta in soup(["meta"]):
if not isinstance(meta, bs4.Tag):
continue
for a in meta.attrs:
if a in ["itemprop", "property", "name"]:
key = str(meta.get(a, ""))
content = str(meta.get("content", ""))
if key and content: # Only add non-empty content
metadata[key] = content
content = meta.get("content", "")
if content: # Only add non-empty content
metadata[meta[a]] = content
break
# Try reading the description
try:
for script in soup(["script"]):
if not isinstance(script, bs4.Tag):
continue
if not script.string: # Skip empty scripts
continue
content = script.string
@@ -147,7 +132,6 @@ class YouTubeConverter(DocumentConverter):
webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
@@ -159,7 +143,7 @@ class YouTubeConverter(DocumentConverter):
)
# Retry the transcript fetching operation
transcript = self._retry_operation(
lambda: ytt_api.fetch(
lambda: YouTubeTranscriptApi.get_transcript(
video_id, languages=youtube_transcript_languages
),
retries=3, # Retry 3 times
@@ -167,14 +151,17 @@ class YouTubeConverter(DocumentConverter):
)
if transcript:
transcript_text = " ".join(
[part.text for part in transcript]
[part["text"] for part in transcript]
) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception as e:
print(f"Error fetching transcript: {e}")
if transcript_text:
webpage_text += f"\n### Transcript\n{transcript_text}\n"
title = title if title else (soup.title.string if soup.title else "")
title = title if title else soup.title.string
assert isinstance(title, str)
return DocumentConverterResult(
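This hunk reverts from the youtube-transcript-api 1.0 instance API (`ytt_api.fetch(...)`, parts with a `.text` attribute) to the pre-1.0 static API (`get_transcript`, parts as dicts). The reverted-to call, with an assumed video ID:

```python
from youtube_transcript_api import YouTubeTranscriptApi

video_id = "dQw4w9WgXcQ"  # example ID
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
text = " ".join(part["text"] for part in transcript)
```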

View File

@@ -25,11 +25,8 @@ GENERAL_TEST_VECTORS = [
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"data:image/png;base64...",
],
must_not_include=[
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
must_not_include=[],
),
FileTestVector(
filename="test.xlsx",
@@ -68,9 +65,8 @@ GENERAL_TEST_VECTORS = [
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
"![This phrase of the caption is Human-written.](Picture4.jpg)",
],
must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
must_not_include=[],
),
FileTestVector(
filename="test_outlook_msg.msg",
@@ -215,64 +211,4 @@ GENERAL_TEST_VECTORS = [
],
must_not_include=[],
),
FileTestVector(
filename="test.epub",
mimetype="application/epub+zip",
charset=None,
url=None,
must_include=[
"**Authors:** Test Author",
"A test EPUB document for MarkItDown testing",
"# Chapter 1: Test Content",
"This is a **test** paragraph with some formatting",
"* A bullet point",
"* Another point",
"# Chapter 2: More Content",
"*different* style",
"> This is a blockquote for testing",
],
must_not_include=[],
),
]
DATA_URI_TEST_VECTORS = [
FileTestVector(
filename="test.docx",
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
charset=None,
url=None,
must_include=[
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
must_not_include=[
"data:image/png;base64...",
],
),
FileTestVector(
filename="test.pptx",
mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
charset=None,
url=None,
must_include=[
"2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
"![This phrase of the caption is Human-written.]", # image caption
"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
],
must_not_include=[
"![This phrase of the caption is Human-written.](Picture4.jpg)",
],
),
]

View File

@@ -7,17 +7,9 @@ import locale
from typing import List
if __name__ == "__main__":
from _test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
FileTestVector,
)
from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
else:
from ._test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
FileTestVector,
)
from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector
from markitdown import (
MarkItDown,
@@ -122,9 +114,7 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
)
stdout = result.stdout.decode(locale.getpreferredencoding())
assert (
result.returncode == 0
), f"CLI exited with error: {result.stderr.decode('utf-8')}"
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
for test_string in test_vector.must_include:
assert test_string in stdout
for test_string in test_vector.must_not_include:
@@ -157,39 +147,6 @@ def test_convert_url(shared_tmp_dir, test_vector):
assert test_string not in stdout
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
"""Test CLI functionality when keep_data_uris is enabled"""
output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
result = subprocess.run(
[
"python",
"-m",
"markitdown",
"--keep-data-uris",
"-o",
output_file,
os.path.join(TEST_FILES_DIR, test_vector.filename),
],
capture_output=True,
text=True,
)
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
assert os.path.exists(output_file), f"Output file not created: {output_file}"
with open(output_file, "r") as f:
output_data = f.read()
for test_string in test_vector.must_include:
assert test_string in output_data
for test_string in test_vector.must_not_include:
assert test_string not in output_data
os.remove(output_file)
assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
if __name__ == "__main__":
import sys
import tempfile
@@ -197,7 +154,6 @@ if __name__ == "__main__":
"""Runs this file's tests from the command line."""
with tempfile.TemporaryDirectory() as tmp_dir:
# General tests
for test_function in [
test_output_to_stdout,
test_output_to_file,
@@ -211,17 +167,4 @@ if __name__ == "__main__":
)
test_function(tmp_dir, test_vector)
print("OK")
# Data URI tests
for test_function in [
test_output_to_file_with_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...",
end="",
)
test_function(tmp_dir, test_vector)
print("OK")
print("All tests passed!")

BIN
packages/markitdown/tests/test_files/test.docx vendored Executable file → Normal file

Binary file not shown.

Binary file not shown.

View File

@@ -6,9 +6,9 @@ import codecs
if __name__ == "__main__":
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
from _test_vectors import GENERAL_TEST_VECTORS
else:
from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
from ._test_vectors import GENERAL_TEST_VECTORS
from markitdown import (
MarkItDown,
@@ -47,6 +47,7 @@ def test_guess_stream_info(test_vector):
# mimetype or extension, so we'll special-case them here.
if test_vector.filename in [
"test_outlook_msg.msg",
"test_mskanji.csv", # See: https://github.com/google/magika/issues/983
]:
return
@@ -95,6 +96,15 @@ def test_convert_stream_without_hints(test_vector):
"""Test the conversion of a stream with no stream info."""
markitdown = MarkItDown()
# For some limited exceptions, we can't guarantee the exact
# mimetype or extension, so we'll special-case them here.
if test_vector.filename in [
# This appears to be a subtle bug in magika.
# See: https://github.com/google/magika/issues/983
"test_mskanji.csv",
]:
return
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(stream, url=test_vector.url)
for string in test_vector.must_include:
@@ -124,52 +134,10 @@ def test_convert_url(test_vector):
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_with_data_uris(test_vector):
"""Test API functionality when keep_data_uris is enabled"""
markitdown = MarkItDown()
# Test local file conversion
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, test_vector.filename),
keep_data_uris=True,
url=test_vector.url,
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_stream_with_data_uris(test_vector):
"""Test the conversion of a stream with no stream info."""
markitdown = MarkItDown()
stream_info = StreamInfo(
extension=os.path.splitext(test_vector.filename)[1],
mimetype=test_vector.mimetype,
charset=test_vector.charset,
)
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(
stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
if __name__ == "__main__":
import sys
"""Runs this file's tests from the command line."""
# General tests
for test_function in [
test_guess_stream_info,
test_convert_local,
@@ -183,17 +151,4 @@ if __name__ == "__main__":
)
test_function(test_vector)
print("OK")
# Data URI tests
for test_function in [
test_convert_with_data_uris,
test_convert_stream_with_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
)
test_function(test_vector)
print("OK")
print("All tests passed!")