Compare commits
1 commit: de2c56ffbc
@@ -6,8 +6,7 @@

 > [!IMPORTANT]
 > Breaking changes between 0.0.1 and 0.1.0:
-> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior.
-> * convert_stream() now requires a binary file-like object (e.g., a file opened in binary mode, or an io.BytesIO object). This is a breaking change from the previous version, where it previously also accepted text file-like objects, like io.StringIO.
+> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]~=0.1.0a1'` to have backward-compatible behavior.
 > * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.

 MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.) While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
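The `convert_stream()` bullet above requires a binary file-like object. A minimal sketch of that contract (the file name is illustrative):

```python
import io
from markitdown import MarkItDown

md = MarkItDown(enable_plugins=False)

# A file opened in binary mode satisfies the requirement:
with open("report.docx", "rb") as fh:
    print(md.convert_stream(fh).markdown)

# So does an in-memory io.BytesIO object:
print(md.convert_stream(io.BytesIO(b"# Hello\n")).markdown)

# io.StringIO (a text stream) is no longer accepted as of 0.1.0.
```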
@@ -15,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
 At present, MarkItDown supports:

 - PDF
-- PowerPoint
+- PowerPoint (reading in top-to-bottom, left-to-right order)
 - Word
 - Excel
 - Images (EXIF metadata and OCR)
@@ -24,7 +23,6 @@ At present, MarkItDown supports:
 - Text-based formats (CSV, JSON, XML)
 - ZIP files (iterates over contents)
 - Youtube URLs
-- EPubs
 - ... and more!

 ## Why Markdown?
@@ -38,7 +36,7 @@ are also highly token-efficient.

 ## Installation

-To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
+To install MarkItDown, use pip: `pip install 'markitdown[all]~=0.1.0a1'`. Alternatively, you can install it from the source:

 ```bash
 git clone git@github.com:microsoft/markitdown.git
@@ -27,7 +27,7 @@ dependencies = [
   "beautifulsoup4",
   "requests",
   "markdownify",
-  "magika~=0.6.1",
+  "magika>=0.6.0rc1",
   "charset-normalizer",
 ]
@@ -42,7 +42,7 @@ all = [
   "olefile",
   "pydub",
   "SpeechRecognition",
-  "youtube-transcript-api~=1.0.0",
+  "youtube-transcript-api",
   "azure-ai-documentintelligence",
   "azure-identity"
 ]
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.1"
+__version__ = "0.1.0a2"
@@ -4,7 +4,6 @@
 import argparse
 import sys
 import codecs
-import locale
 from textwrap import dedent
 from importlib.metadata import entry_points
 from .__about__ import __version__
@@ -105,12 +104,6 @@ def main():
         help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
     )

-    parser.add_argument(
-        "--keep-data-uris",
-        action="store_true",
-        help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
-    )
-
     parser.add_argument("filename", nargs="?")
     args = parser.parse_args()
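The removed `--keep-data-uris` flag mirrors a `keep_data_uris` keyword accepted by the Python API on the 0.1.1 side, as the conversion hunks below show. A short sketch (the file name is illustrative):

```python
from markitdown import MarkItDown

md = MarkItDown()

# Default behavior: data URIs (e.g., base64-encoded images) are truncated.
result = md.convert("slides.pptx")

# 0.1.1 side only: keep full data URIs in the output.
result = md.convert("slides.pptx", keep_data_uris=True)
print(result.markdown)
```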
@@ -146,7 +139,7 @@ def main():
     else:
         charset_hint = None

-    stream_info = None
+    stream_info: str | None = None
     if (
         extension_hint is not None
         or mime_type_hint is not None
@@ -188,15 +181,9 @@ def main():
     markitdown = MarkItDown(enable_plugins=args.use_plugins)

     if args.filename is None:
-        result = markitdown.convert_stream(
-            sys.stdin.buffer,
-            stream_info=stream_info,
-            keep_data_uris=args.keep_data_uris,
-        )
+        result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
     else:
-        result = markitdown.convert(
-            args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
-        )
+        result = markitdown.convert(args.filename, stream_info=stream_info)

     _handle_output(args, result)
@@ -205,14 +192,9 @@ def _handle_output(args, result: DocumentConverterResult):
     """Handle output to stdout or file"""
     if args.output:
         with open(args.output, "w", encoding="utf-8") as f:
-            f.write(result.markdown)
+            f.write(result.text_content)
     else:
-        # Handle stdout encoding errors more gracefully
-        print(
-            result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
-                sys.stdout.encoding
-            )
-        )
+        print(result.text_content)


 def _exit_with_error(message: str):
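The deleted `print` wrapper implements graceful stdout handling: encode with `errors="replace"`, then decode, so consoles with limited encodings print `?` instead of raising `UnicodeEncodeError`. A small illustration, with cp1252 standing in for `sys.stdout.encoding`:

```python
# Round-trip through the console encoding; characters it cannot
# represent become "?" instead of crashing (encoding is illustrative).
markdown = "Snowman: \u2603"
safe = markdown.encode("cp1252", errors="replace").decode("cp1252")
print(safe)  # -> Snowman: ?
```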
@@ -20,7 +20,6 @@ import charset_normalizer
 import codecs

 from ._stream_info import StreamInfo
-from ._uri_utils import parse_data_uri, file_uri_to_path

 from .converters import (
     PlainTextConverter,
@@ -39,7 +38,6 @@ from .converters import (
     AudioConverter,
     OutlookMsgConverter,
     ZipConverter,
-    EpubConverter,
     DocumentIntelligenceConverter,
 )

@@ -193,7 +191,6 @@ class MarkItDown:
         self.register_converter(IpynbConverter())
         self.register_converter(PdfConverter())
         self.register_converter(OutlookMsgConverter())
-        self.register_converter(EpubConverter())

         # Register Document Intelligence converter at the top of the stack if endpoint is provided
         docintel_endpoint = kwargs.get("docintel_endpoint")
@@ -243,10 +240,9 @@ class MarkItDown:
         # Local path or url
         if isinstance(source, str):
             if (
-                source.startswith("http:")
-                or source.startswith("https:")
-                or source.startswith("file:")
-                or source.startswith("data:")
+                source.startswith("http://")
+                or source.startswith("https://")
+                or source.startswith("file://")
             ):
                 # Rename the url argument to mock_url
                 # (Deprecated -- use stream_info)
@@ -255,7 +251,7 @@ class MarkItDown:
                 _kwargs["mock_url"] = _kwargs["url"]
                 del _kwargs["url"]

-                return self.convert_uri(source, stream_info=stream_info, **_kwargs)
+                return self.convert_url(source, stream_info=stream_info, **_kwargs)
             else:
                 return self.convert_local(source, stream_info=stream_info, **kwargs)
         # Path object
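Taken together with the next hunk, `convert()` dispatches plain strings by URI scheme, and the 0.1.1 side additionally routes `file:` and `data:` URIs through `convert_uri()`. A sketch of calls each side accepts (URLs are illustrative; network and file access are assumed):

```python
from markitdown import MarkItDown

md = MarkItDown()

# Dispatched to the HTTP path on both sides of this comparison:
md.convert("https://example.com/page.html")

# Accepted by the 0.1.1 side's convert_uri() dispatch:
md.convert("file:///tmp/report.docx")
md.convert("data:text/plain;charset=utf-8,Hello%2C%20World%21")
```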
@@ -365,80 +361,22 @@ class MarkItDown:
         url: str,
         *,
         stream_info: Optional[StreamInfo] = None,
-        file_extension: Optional[str] = None,
-        mock_url: Optional[str] = None,
-        **kwargs: Any,
-    ) -> DocumentConverterResult:
-        """Alias for convert_uri()"""
-        # convert_url will likely be deprecated in the future in favor of convert_uri
-        return self.convert_uri(
-            url,
-            stream_info=stream_info,
-            file_extension=file_extension,
-            mock_url=mock_url,
-            **kwargs,
-        )
-
-    def convert_uri(
-        self,
-        uri: str,
-        *,
-        stream_info: Optional[StreamInfo] = None,
         file_extension: Optional[str] = None,  # Deprecated -- use stream_info
         mock_url: Optional[
             str
         ] = None,  # Mock the request as if it came from a different URL
         **kwargs: Any,
-    ) -> DocumentConverterResult:
-        uri = uri.strip()
-
-        # File URIs
-        if uri.startswith("file:"):
-            netloc, path = file_uri_to_path(uri)
-            if netloc and netloc != "localhost":
-                raise ValueError(
-                    f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
-                )
-            return self.convert_local(
-                path,
-                stream_info=stream_info,
-                file_extension=file_extension,
-                url=mock_url,
-                **kwargs,
-            )
-        # Data URIs
-        elif uri.startswith("data:"):
-            mimetype, attributes, data = parse_data_uri(uri)
-
-            base_guess = StreamInfo(
-                mimetype=mimetype,
-                charset=attributes.get("charset"),
-            )
-            if stream_info is not None:
-                base_guess = base_guess.copy_and_update(stream_info)
-
-            return self.convert_stream(
-                io.BytesIO(data),
-                stream_info=base_guess,
-                file_extension=file_extension,
-                url=mock_url,
-                **kwargs,
-            )
-        # HTTP/HTTPS URIs
-        elif uri.startswith("http:") or uri.startswith("https:"):
-            response = self._requests_session.get(uri, stream=True)
-            response.raise_for_status()
-            return self.convert_response(
-                response,
-                stream_info=stream_info,
-                file_extension=file_extension,
-                url=mock_url,
-                **kwargs,
-            )
-        else:
-            raise ValueError(
-                f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
-            )
+    ) -> DocumentConverterResult:  # TODO: fix kwargs type
+        # Send a HTTP request to the URL
+        response = self._requests_session.get(url, stream=True)
+        response.raise_for_status()
+        return self.convert_response(
+            response,
+            stream_info=stream_info,
+            file_extension=file_extension,
+            url=mock_url,
+            **kwargs,
+        )

     def convert_response(
         self,
@@ -672,16 +610,14 @@ class MarkItDown:
         # Call magika to guess from the stream
         cur_pos = file_stream.tell()
         try:
-            result = self._magika.identify_stream(file_stream)
+            stream_bytes = file_stream.read()
+            result = self._magika.identify_bytes(stream_bytes)
             if result.status == "ok" and result.prediction.output.label != "unknown":
                 # If it's text, also guess the charset
                 charset = None
                 if result.prediction.output.is_text:
-                    # Read the first 4k to guess the charset
-                    file_stream.seek(cur_pos)
-                    stream_page = file_stream.read(4096)
-                    charset_result = charset_normalizer.from_bytes(stream_page).best()
+                    charset_result = charset_normalizer.from_bytes(stream_bytes).best()
                     if charset_result is not None:
                         charset = self._normalize_charset(charset_result.encoding)
@@ -1,52 +0,0 @@
-import base64
-import os
-from typing import Tuple, Dict
-from urllib.request import url2pathname
-from urllib.parse import urlparse, unquote_to_bytes
-
-
-def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
-    """Convert a file URI to a local file path"""
-    parsed = urlparse(file_uri)
-    if parsed.scheme != "file":
-        raise ValueError(f"Not a file URL: {file_uri}")
-
-    netloc = parsed.netloc if parsed.netloc else None
-    path = os.path.abspath(url2pathname(parsed.path))
-    return netloc, path
-
-
-def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
-    if not uri.startswith("data:"):
-        raise ValueError("Not a data URI")
-
-    header, _, data = uri.partition(",")
-    if not _:
-        raise ValueError("Malformed data URI, missing ',' separator")
-
-    meta = header[5:]  # Strip 'data:'
-    parts = meta.split(";")
-
-    is_base64 = False
-    # Ends with base64?
-    if parts[-1] == "base64":
-        parts.pop()
-        is_base64 = True
-
-    mime_type = None  # Normally this would default to text/plain but we won't assume
-    if len(parts) and len(parts[0]) > 0:
-        # First part is the mime type
-        mime_type = parts.pop(0)
-
-    attributes: Dict[str, str] = {}
-    for part in parts:
-        # Handle key=value pairs in the middle
-        if "=" in part:
-            key, value = part.split("=", 1)
-            attributes[key] = value
-        elif len(part) > 0:
-            attributes[part] = ""
-
-    content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
-
-    return mime_type, attributes, content
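The removed `_uri_utils` helpers are small, and their behavior is pinned down by unit tests that this comparison also deletes (see the test hunks near the end). A few representative cases as a sketch, usable on the 0.1.1 side:

```python
from markitdown._uri_utils import parse_data_uri, file_uri_to_path

# Base64-encoded payloads are decoded to bytes:
mime_type, attributes, data = parse_data_uri(
    "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ=="
)
assert mime_type == "text/plain"
assert attributes["charset"] == "utf-8"
assert data == b"Hello, World!"

# Percent-encoded payloads work too, and the mime type may be omitted:
mime_type, attributes, data = parse_data_uri("data:,Hello%2C%20World%21")
assert mime_type is None and data == b"Hello, World!"

# File URIs are split into (netloc, local path):
netloc, path = file_uri_to_path("file://localhost/path/to/file.txt")
assert netloc == "localhost" and path == "/path/to/file.txt"
```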
@@ -18,7 +18,6 @@ from ._audio_converter import AudioConverter
 from ._outlook_msg_converter import OutlookMsgConverter
 from ._zip_converter import ZipConverter
 from ._doc_intel_converter import DocumentIntelligenceConverter
-from ._epub_converter import EpubConverter

 __all__ = [
     "PlainTextConverter",
@@ -38,5 +37,4 @@ __all__ = [
     "OutlookMsgConverter",
     "ZipConverter",
     "DocumentIntelligenceConverter",
-    "EpubConverter",
 ]
@@ -1,7 +1,6 @@
 import io
 import re
 import base64
-import binascii
 from urllib.parse import parse_qs, urlparse
 from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
@@ -61,8 +60,6 @@ class BingSerpConverter(DocumentConverter):
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
-        assert stream_info.url is not None
-
         # Parse the query parameters
         parsed_params = parse_qs(urlparse(stream_info.url).query)
         query = parsed_params.get("q", [""])[0]
@@ -79,12 +76,9 @@ class BingSerpConverter(DocumentConverter):
             slug.extract()

         # Parse the algorithmic results
-        _markdownify = _CustomMarkdownify(**kwargs)
+        _markdownify = _CustomMarkdownify()
         results = list()
         for result in soup.find_all(class_="b_algo"):
-            if not hasattr(result, "find_all"):
-                continue
-
             # Rewrite redirect urls
             for a in result.find_all("a", href=True):
                 parsed_href = urlparse(a["href"])
@@ -73,5 +73,5 @@ class DocxConverter(HtmlConverter):

         style_map = kwargs.get("style_map", None)
         return self._html_converter.convert_string(
-            mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
+            mammoth.convert_to_html(file_stream, style_map=style_map).value
         )
@@ -1,147 +0,0 @@
-import os
-import zipfile
-import xml.dom.minidom as minidom
-
-from typing import BinaryIO, Any, Dict, List
-
-from ._html_converter import HtmlConverter
-from .._base_converter import DocumentConverter, DocumentConverterResult
-from .._stream_info import StreamInfo
-
-ACCEPTED_MIME_TYPE_PREFIXES = [
-    "application/epub",
-    "application/epub+zip",
-    "application/x-epub+zip",
-]
-
-ACCEPTED_FILE_EXTENSIONS = [".epub"]
-
-MIME_TYPE_MAPPING = {
-    ".html": "text/html",
-    ".xhtml": "application/xhtml+xml",
-}
-
-
-class EpubConverter(HtmlConverter):
-    """
-    Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self._html_converter = HtmlConverter()
-
-    def accepts(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> bool:
-        mimetype = (stream_info.mimetype or "").lower()
-        extension = (stream_info.extension or "").lower()
-
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
-        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
-            if mimetype.startswith(prefix):
-                return True
-
-        return False
-
-    def convert(
-        self,
-        file_stream: BinaryIO,
-        stream_info: StreamInfo,
-        **kwargs: Any,  # Options to pass to the converter
-    ) -> DocumentConverterResult:
-        with zipfile.ZipFile(file_stream, "r") as z:
-            # Extracts metadata (title, authors, language, publisher, date, description, cover) from an EPUB file.
-
-            # Locate content.opf
-            container_dom = minidom.parse(z.open("META-INF/container.xml"))
-            opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
-                "full-path"
-            )
-
-            # Parse content.opf
-            opf_dom = minidom.parse(z.open(opf_path))
-            metadata: Dict[str, Any] = {
-                "title": self._get_text_from_node(opf_dom, "dc:title"),
-                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
-                "language": self._get_text_from_node(opf_dom, "dc:language"),
-                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
-                "date": self._get_text_from_node(opf_dom, "dc:date"),
-                "description": self._get_text_from_node(opf_dom, "dc:description"),
-                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
-            }
-
-            # Extract manifest items (ID → href mapping)
-            manifest = {
-                item.getAttribute("id"): item.getAttribute("href")
-                for item in opf_dom.getElementsByTagName("item")
-            }
-
-            # Extract spine order (ID refs)
-            spine_items = opf_dom.getElementsByTagName("itemref")
-            spine_order = [item.getAttribute("idref") for item in spine_items]
-
-            # Convert spine order to actual file paths
-            base_path = "/".join(
-                opf_path.split("/")[:-1]
-            )  # Get base directory of content.opf
-            spine = [
-                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
-                for item_id in spine_order
-                if item_id in manifest
-            ]
-
-            # Extract and convert the content
-            markdown_content: List[str] = []
-            for file in spine:
-                if file in z.namelist():
-                    with z.open(file) as f:
-                        filename = os.path.basename(file)
-                        extension = os.path.splitext(filename)[1].lower()
-                        mimetype = MIME_TYPE_MAPPING.get(extension)
-                        converted_content = self._html_converter.convert(
-                            f,
-                            StreamInfo(
-                                mimetype=mimetype,
-                                extension=extension,
-                                filename=filename,
-                            ),
-                        )
-                        markdown_content.append(converted_content.markdown.strip())
-
-            # Format and add the metadata
-            metadata_markdown = []
-            for key, value in metadata.items():
-                if isinstance(value, list):
-                    value = ", ".join(value)
-                if value:
-                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")
-
-            markdown_content.insert(0, "\n".join(metadata_markdown))
-
-            return DocumentConverterResult(
-                markdown="\n\n".join(markdown_content), title=metadata["title"]
-            )
-
-    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
-        """Convenience function to extract a single occurrence of a tag (e.g., title)."""
-        texts = self._get_all_texts_from_nodes(dom, tag_name)
-        if len(texts) > 0:
-            return texts[0]
-        else:
-            return None
-
-    def _get_all_texts_from_nodes(
-        self, dom: minidom.Document, tag_name: str
-    ) -> List[str]:
-        """Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
-        texts: List[str] = []
-        for node in dom.getElementsByTagName(tag_name):
-            if node.firstChild and hasattr(node.firstChild, "nodeValue"):
-                texts.append(node.firstChild.nodeValue.strip())
-        return texts
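The removed converter follows the standard EPUB layout: `META-INF/container.xml` names the OPF package file, whose manifest maps ids to hrefs and whose spine gives reading order. A minimal sketch of that same walk, assuming a hypothetical `book.epub`:

```python
import zipfile
import xml.dom.minidom as minidom

# List an EPUB's reading order via the container.xml -> OPF -> spine
# chain used by the converter above ("book.epub" is hypothetical).
with zipfile.ZipFile("book.epub") as z:
    container = minidom.parse(z.open("META-INF/container.xml"))
    opf_path = container.getElementsByTagName("rootfile")[0].getAttribute("full-path")
    opf = minidom.parse(z.open(opf_path))
    manifest = {
        item.getAttribute("id"): item.getAttribute("href")
        for item in opf.getElementsByTagName("item")
    }
    for itemref in opf.getElementsByTagName("itemref"):
        print(manifest.get(itemref.getAttribute("idref")))
```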
@@ -56,9 +56,9 @@ class HtmlConverter(DocumentConverter):
         body_elm = soup.find("body")
         webpage_text = ""
         if body_elm:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
         else:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+            webpage_text = _CustomMarkdownify().convert_soup(soup)

         assert isinstance(webpage_text, str)
@@ -17,7 +17,6 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):

     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
-        options["keep_data_uris"] = options.get("keep_data_uris", False)
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
@@ -102,7 +101,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
             return alt

         # Remove dataURIs
-        if src.startswith("data:") and not self.options["keep_data_uris"]:
+        if src.startswith("data:"):
             src = src.split(",")[0] + "..."

         return "![%s](%s%s)" % (alt, src, title_part)
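With the code above, an inline image such as `![chart](data:image/png;base64,iVBOR...)` is always shortened to its header on the 0.1.0a2 side, while the 0.1.1 side makes this conditional on `keep_data_uris`. The truncation itself is just:

```python
# Keep everything before the comma and mark the elision, as the
# converter above does; the src value is illustrative.
src = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg"
truncated = src.split(",")[0] + "..."
assert truncated == "data:image/png;base64..."
```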
@@ -9,7 +9,7 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 _dependency_exc_info = None
 olefile = None
 try:
-    import olefile  # type: ignore[no-redef]
+    import olefile
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
@@ -56,13 +56,12 @@ class OutlookMsgConverter(DocumentConverter):

         # Brute force, check if it's an Outlook file
         try:
-            if olefile is not None:
-                msg = olefile.OleFileIO(file_stream)
-                toc = "\n".join([str(stream) for stream in msg.listdir()])
-                return (
-                    "__properties_version1.0" in toc
-                    and "__recip_version1.0_#00000000" in toc
-                )
+            msg = olefile.OleFileIO(file_stream)
+            toc = "\n".join([str(stream) for stream in msg.listdir()])
+            return (
+                "__properties_version1.0" in toc
+                and "__recip_version1.0_#00000000" in toc
+            )
         except Exception as e:
             pass
         finally:
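Both sides detect Outlook `.msg` files by opening the stream as an OLE compound document and checking for characteristic stream names. A sketch of that probe, assuming olefile is installed and a hypothetical `message.msg`:

```python
import olefile

# Look for the property stream the converter above treats as an
# MSG signature ("message.msg" is a hypothetical file).
with open("message.msg", "rb") as fh:
    if olefile.isOleFile(fh):
        ole = olefile.OleFileIO(fh)
        streams = ["/".join(entry) for entry in ole.listdir()]
        print(any("__properties_version1.0" in s for s in streams))
```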
@@ -90,11 +89,7 @@ class OutlookMsgConverter(DocumentConverter):
                 _dependency_exc_info[2]
             )

-        assert (
-            olefile is not None
-        )  # If we made it this far, olefile should be available
         msg = olefile.OleFileIO(file_stream)

         # Extract email metadata
         md_content = "# Email Message\n\n"
@@ -126,7 +121,6 @@ class OutlookMsgConverter(DocumentConverter):

     def _get_stream_data(self, msg: Any, stream_path: str) -> Union[str, None]:
         """Helper to safely extract and decode stream data from the MSG file."""
-        assert olefile is not None
         assert isinstance(
             msg, olefile.OleFileIO
         )  # Ensure msg is of the correct type (type hinting is not possible with the optional olefile package)
@@ -17,16 +17,12 @@ except ImportError:
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "text/",
     "application/json",
-    "application/markdown",
 ]

-ACCEPTED_FILE_EXTENSIONS = [
-    ".txt",
-    ".text",
-    ".md",
-    ".markdown",
-    ".json",
-    ".jsonl",
+# Mimetypes to ignore (commonly confused extensions)
+IGNORE_MIME_TYPE_PREFIXES = [
+    "text/vnd.in3d.spot",  # .spo which is confused with xls, doc, etc.
+    "text/vnd.graphviz",  # .dot which is confused with xls, doc, etc.
 ]

@@ -42,14 +38,9 @@ class PlainTextConverter(DocumentConverter):
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()

-        # If we have a charset, we can safely assume it's text
-        # With Magika in the earlier stages, this handles most cases
-        if stream_info.charset is not None:
-            return True
-
-        # Otherwise, check the mimetype and extension
-        if extension in ACCEPTED_FILE_EXTENSIONS:
-            return True
-
+        for prefix in IGNORE_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return False

         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
             if mimetype.startswith(prefix):
@@ -140,20 +140,13 @@ class PptxConverter(DocumentConverter):
                 alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
                 alt_text = re.sub(r"\s+", " ", alt_text).strip()

-                # If keep_data_uris is True, use base64 encoding for images
-                if kwargs.get("keep_data_uris", False):
-                    blob = shape.image.blob
-                    content_type = shape.image.content_type or "image/png"
-                    b64_string = base64.b64encode(blob).decode("utf-8")
-                    md_content += f"\n![{alt_text if alt_text else shape.name}](data:{content_type};base64,{b64_string})\n"
-                else:
-                    # A placeholder name
-                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
-                    md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
+                # A placeholder name
+                filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"

             # Tables
             if self._is_table(shape):
-                md_content += self._convert_table_to_markdown(shape.table, **kwargs)
+                md_content += self._convert_table_to_markdown(shape.table)

             # Charts
             if shape.has_chart:
@@ -200,7 +193,7 @@ class PptxConverter(DocumentConverter):
             return True
         return False

-    def _convert_table_to_markdown(self, table, **kwargs):
+    def _convert_table_to_markdown(self, table):
         # Write the table as HTML, then convert it to Markdown
         html_table = "<html><body><table>"
         first_row = True
@@ -215,10 +208,7 @@ class PptxConverter(DocumentConverter):
             first_row = False
         html_table += "</table></body></html>"

-        return (
-            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
-            + "\n"
-        )
+        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"

     def _convert_chart_to_markdown(self, chart):
         try:
@@ -28,10 +28,6 @@ CANDIDATE_FILE_EXTENSIONS = [
 class RssConverter(DocumentConverter):
     """Convert RSS / Atom type to markdown"""

-    def __init__(self):
-        super().__init__()
-        self._kwargs = {}
-
     def accepts(
         self,
         file_stream: BinaryIO,
@@ -70,7 +66,7 @@ class RssConverter(DocumentConverter):
             file_stream.seek(cur_pos)
         return False

-    def _feed_type(self, doc: Any) -> str | None:
+    def _feed_type(self, doc: Any) -> str:
         if doc.getElementsByTagName("rss"):
             return "rss"
         elif doc.getElementsByTagName("feed"):
@@ -86,7 +82,6 @@ class RssConverter(DocumentConverter):
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
-        self._kwargs = kwargs
         doc = minidom.parse(file_stream)
         feed_type = self._feed_type(doc)
@@ -135,10 +130,10 @@ class RssConverter(DocumentConverter):
         Returns None if the feed type is not recognized or something goes wrong.
         """
         root = doc.getElementsByTagName("rss")[0]
-        channel_list = root.getElementsByTagName("channel")
-        if not channel_list:
-            raise ValueError("No channel found in RSS feed")
-        channel = channel_list[0]
+        channel = root.getElementsByTagName("channel")
+        if not channel:
+            return None
+        channel = channel[0]
         channel_title = self._get_data_by_tag_name(channel, "title")
         channel_description = self._get_data_by_tag_name(channel, "description")
         items = channel.getElementsByTagName("item")
@@ -146,6 +141,8 @@ class RssConverter(DocumentConverter):
         md_text = f"# {channel_title}\n"
         if channel_description:
             md_text += f"{channel_description}\n"
+        if not items:
+            items = []
         for item in items:
             title = self._get_data_by_tag_name(item, "title")
             description = self._get_data_by_tag_name(item, "description")
@@ -171,7 +168,7 @@ class RssConverter(DocumentConverter):
         try:
             # using bs4 because many RSS feeds have HTML-styled content
             soup = BeautifulSoup(content, "html.parser")
-            return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
+            return _CustomMarkdownify().convert_soup(soup)
         except BaseException as _:
             return content
@@ -186,6 +183,5 @@ class RssConverter(DocumentConverter):
             return None
         fc = nodes[0].firstChild
         if fc:
-            if hasattr(fc, "data"):
-                return fc.data
+            return fc.data
         return None
@@ -7,14 +7,8 @@ from .._exceptions import MissingDependencyException
 # Save reporting of any exceptions for later
 _dependency_exc_info = None
 try:
-    # Suppress some warnings on library import
-    import warnings
-
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", category=DeprecationWarning)
-        warnings.filterwarnings("ignore", category=SyntaxWarning)
-        import speech_recognition as sr
-        import pydub
+    import speech_recognition as sr
+    import pydub
 except ImportError:
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
@@ -1,7 +1,7 @@
 import io
 import re
-import bs4
 from typing import Any, BinaryIO, Optional
+from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
@@ -57,7 +57,7 @@ class WikipediaConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
-        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

         # Remove javascript and style blocks
         for script in soup(["script", "style"]):
@@ -72,15 +72,16 @@ class WikipediaConverter(DocumentConverter):

         if body_elm:
             # What's the title
-            if title_elm and isinstance(title_elm, bs4.Tag):
-                main_title = title_elm.string
+            if title_elm and len(title_elm) > 0:
+                main_title = title_elm.string  # type: ignore
+                assert isinstance(main_title, str)

             # Convert the page
-            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
-                **kwargs
-            ).convert_soup(body_elm)
+            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
+                body_elm
+            )
         else:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+            webpage_text = _CustomMarkdownify().convert_soup(soup)

         return DocumentConverterResult(
             markdown=webpage_text,
@@ -86,9 +86,7 @@ class XlsxConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
+                self._html_converter.convert_string(html_content).markdown.strip()
                 + "\n\n"
             )
@@ -148,9 +146,7 @@ class XlsConverter(DocumentConverter):
             md_content += f"## {s}\n"
             html_content = sheets[s].to_html(index=False)
             md_content += (
-                self._html_converter.convert_string(
-                    html_content, **kwargs
-                ).markdown.strip()
+                self._html_converter.convert_string(html_content).markdown.strip()
                 + "\n\n"
             )
@@ -3,22 +3,17 @@ import json
 import time
 import io
 import re
-import bs4
 from typing import Any, BinaryIO, Optional, Dict, List, Union
 from urllib.parse import parse_qs, urlparse, unquote
+from bs4 import BeautifulSoup

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
+from ._markdownify import _CustomMarkdownify

 # Optional YouTube transcription support
 try:
-    # Suppress some warnings on library import
-    import warnings
-
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", category=SyntaxWarning)
-        # Patch submitted upstream to fix the SyntaxWarning
-        from youtube_transcript_api import YouTubeTranscriptApi
-
+    from youtube_transcript_api import YouTubeTranscriptApi
     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
 except ModuleNotFoundError:
@@ -77,31 +72,21 @@ class YouTubeConverter(DocumentConverter):
     ) -> DocumentConverterResult:
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
-        soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
+        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

         # Read the meta tags
-        metadata: Dict[str, str] = {}
-
-        if soup.title and soup.title.string:
-            metadata["title"] = soup.title.string
-
+        metadata: Dict[str, str] = {"title": soup.title.string}
         for meta in soup(["meta"]):
-            if not isinstance(meta, bs4.Tag):
-                continue
-
             for a in meta.attrs:
                 if a in ["itemprop", "property", "name"]:
-                    key = str(meta.get(a, ""))
-                    content = str(meta.get("content", ""))
-                    if key and content:  # Only add non-empty content
-                        metadata[key] = content
+                    content = meta.get("content", "")
+                    if content:  # Only add non-empty content
+                        metadata[meta[a]] = content
                     break

         # Try reading the description
         try:
             for script in soup(["script"]):
-                if not isinstance(script, bs4.Tag):
-                    continue
                 if not script.string:  # Skip empty scripts
                     continue
                 content = script.string
@@ -147,7 +132,6 @@ class YouTubeConverter(DocumentConverter):
             webpage_text += f"\n### Description\n{description}\n"

         if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
-            ytt_api = YouTubeTranscriptApi()
             transcript_text = ""
             parsed_url = urlparse(stream_info.url)  # type: ignore
             params = parse_qs(parsed_url.query)  # type: ignore
@@ -159,7 +143,7 @@ class YouTubeConverter(DocumentConverter):
                 )
                 # Retry the transcript fetching operation
                 transcript = self._retry_operation(
-                    lambda: ytt_api.fetch(
+                    lambda: YouTubeTranscriptApi.get_transcript(
                         video_id, languages=youtube_transcript_languages
                     ),
                     retries=3,  # Retry 3 times
@@ -167,14 +151,17 @@ class YouTubeConverter(DocumentConverter):
                 )
                 if transcript:
                     transcript_text = " ".join(
-                        [part.text for part in transcript]
+                        [part["text"] for part in transcript]
                     )  # type: ignore
+                    # Alternative formatting:
+                    # formatter = TextFormatter()
+                    # formatter.format_transcript(transcript)
             except Exception as e:
                 print(f"Error fetching transcript: {e}")
             if transcript_text:
                 webpage_text += f"\n### Transcript\n{transcript_text}\n"

-        title = title if title else (soup.title.string if soup.title else "")
+        title = title if title else soup.title.string
         assert isinstance(title, str)

         return DocumentConverterResult(
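These hunks swap youtube-transcript-api's newer instance API (the `-` side: `fetch()`, returning snippets with a `.text` attribute) for the older static `get_transcript()` (the `+` side: a list of dicts). A sketch of the newer style; the video id is illustrative and network access is required:

```python
from youtube_transcript_api import YouTubeTranscriptApi

# Instance-based 1.x API, as used on the "-" side of this comparison.
ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch("dQw4w9WgXcQ", languages=["en"])
text = " ".join(part.text for part in transcript)
print(text[:200])
```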
@@ -25,11 +25,8 @@ GENERAL_TEST_VECTORS = [
             "# Abstract",
             "# Introduction",
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
-            "data:image/png;base64...",
-        ],
-        must_not_include=[
-            "data:image/png;base64,iVBORw0KGgoAAAANSU",
         ],
+        must_not_include=[],
     ),
     FileTestVector(
         filename="test.xlsx",
@@ -68,9 +65,8 @@ GENERAL_TEST_VECTORS = [
             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
             "2003",  # chart value
-            "![This phrase of the caption is Human-written.]",  # image caption
         ],
-        must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
+        must_not_include=[],
     ),
     FileTestVector(
         filename="test_outlook_msg.msg",
@@ -215,64 +211,4 @@ GENERAL_TEST_VECTORS = [
         ],
         must_not_include=[],
     ),
-    FileTestVector(
-        filename="test.epub",
-        mimetype="application/epub+zip",
-        charset=None,
-        url=None,
-        must_include=[
-            "**Authors:** Test Author",
-            "A test EPUB document for MarkItDown testing",
-            "# Chapter 1: Test Content",
-            "This is a **test** paragraph with some formatting",
-            "* A bullet point",
-            "* Another point",
-            "# Chapter 2: More Content",
-            "*different* style",
-            "> This is a blockquote for testing",
-        ],
-        must_not_include=[],
-    ),
-]
-
-
-DATA_URI_TEST_VECTORS = [
-    FileTestVector(
-        filename="test.docx",
-        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        charset=None,
-        url=None,
-        must_include=[
-            "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
-            "49e168b7-d2ae-407f-a055-2167576f39a1",
-            "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
-            "# Abstract",
-            "# Introduction",
-            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
-            "data:image/png;base64,iVBORw0KGgoAAAANSU",
-        ],
-        must_not_include=[
-            "data:image/png;base64...",
-        ],
-    ),
-    FileTestVector(
-        filename="test.pptx",
-        mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        charset=None,
-        url=None,
-        must_include=[
-            "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
-            "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
-            "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
-            "1b92870d-e3b5-4e65-8153-919f4ff45592",
-            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
-            "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
-            "2003",  # chart value
-            "![This phrase of the caption is Human-written.]",  # image caption
-            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
-        ],
-        must_not_include=[
-            "![This phrase of the caption is Human-written.](Picture4.jpg)",
-        ],
-    ),
-]
 ]
@@ -7,17 +7,9 @@ import locale
 from typing import List

 if __name__ == "__main__":
-    from _test_vectors import (
-        GENERAL_TEST_VECTORS,
-        DATA_URI_TEST_VECTORS,
-        FileTestVector,
-    )
+    from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
 else:
-    from ._test_vectors import (
-        GENERAL_TEST_VECTORS,
-        DATA_URI_TEST_VECTORS,
-        FileTestVector,
-    )
+    from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector

 from markitdown import (
     MarkItDown,
@@ -122,9 +114,7 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
     )

     stdout = result.stdout.decode(locale.getpreferredencoding())
-    assert (
-        result.returncode == 0
-    ), f"CLI exited with error: {result.stderr.decode('utf-8')}"
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
     for test_string in test_vector.must_include:
         assert test_string in stdout
     for test_string in test_vector.must_not_include:
@@ -157,39 +147,6 @@ def test_convert_url(shared_tmp_dir, test_vector):
         assert test_string not in stdout


-@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
-def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
-    """Test CLI functionality when keep_data_uris is enabled"""
-
-    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
-    result = subprocess.run(
-        [
-            "python",
-            "-m",
-            "markitdown",
-            "--keep-data-uris",
-            "-o",
-            output_file,
-            os.path.join(TEST_FILES_DIR, test_vector.filename),
-        ],
-        capture_output=True,
-        text=True,
-    )
-
-    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
-    assert os.path.exists(output_file), f"Output file not created: {output_file}"
-
-    with open(output_file, "r") as f:
-        output_data = f.read()
-        for test_string in test_vector.must_include:
-            assert test_string in output_data
-        for test_string in test_vector.must_not_include:
-            assert test_string not in output_data
-
-    os.remove(output_file)
-    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
-
-
 if __name__ == "__main__":
     import sys
     import tempfile
@@ -197,7 +154,6 @@ if __name__ == "__main__":
     """Runs this file's tests from the command line."""

     with tempfile.TemporaryDirectory() as tmp_dir:
-        # General tests
         for test_function in [
             test_output_to_stdout,
             test_output_to_file,
@@ -211,17 +167,4 @@ if __name__ == "__main__":
                 )
                 test_function(tmp_dir, test_vector)
                 print("OK")

-        # Data URI tests
-        for test_function in [
-            test_output_to_file_with_data_uris,
-        ]:
-            for test_vector in DATA_URI_TEST_VECTORS:
-                print(
-                    f"Running {test_function.__name__} on {test_vector.filename}...",
-                    end="",
-                )
-                test_function(tmp_dir, test_vector)
-                print("OK")
-
         print("All tests passed!")
BIN packages/markitdown/tests/test_files/test.docx (vendored; Executable file → Normal file). Binary file not shown.
BIN packages/markitdown/tests/test_files/test.epub (vendored). Binary file not shown.
@@ -5,8 +5,6 @@ import shutil
 import openai
 import pytest

-from markitdown._uri_utils import parse_data_uri, file_uri_to_path
-
 from markitdown import (
     MarkItDown,
     UnsupportedFormatException,
@@ -178,79 +176,6 @@ def test_stream_info_operations() -> None:
|
|||||||
assert updated_stream_info.url == "url.1"
|
assert updated_stream_info.url == "url.1"
|
||||||
|
|
||||||
|
|
||||||
-def test_data_uris() -> None:
-    # Test basic parsing of data URIs
-    data_uri = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ=="
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type == "text/plain"
-    assert len(attributes) == 0
-    assert data == b"Hello, World!"
-
-    data_uri = "data:base64,SGVsbG8sIFdvcmxkIQ=="
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type is None
-    assert len(attributes) == 0
-    assert data == b"Hello, World!"
-
-    data_uri = "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ=="
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type == "text/plain"
-    assert len(attributes) == 1
-    assert attributes["charset"] == "utf-8"
-    assert data == b"Hello, World!"
-
-    data_uri = "data:,Hello%2C%20World%21"
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type is None
-    assert len(attributes) == 0
-    assert data == b"Hello, World!"
-
-    data_uri = "data:text/plain,Hello%2C%20World%21"
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type == "text/plain"
-    assert len(attributes) == 0
-    assert data == b"Hello, World!"
-
-    data_uri = "data:text/plain;charset=utf-8,Hello%2C%20World%21"
-    mime_type, attributes, data = parse_data_uri(data_uri)
-    assert mime_type == "text/plain"
-    assert len(attributes) == 1
-    assert attributes["charset"] == "utf-8"
-    assert data == b"Hello, World!"
-
-
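The removed `test_data_uris` assertions pin down the expected `parse_data_uri` contract: an optional mimetype, a dict of header attributes (e.g. `charset`), and the decoded payload as bytes. A minimal sketch that satisfies those assertions, assuming nothing about markitdown's actual `_uri_utils` implementation:

```python
# A sketch consistent with the removed assertions above; markitdown's
# real _uri_utils.parse_data_uri may differ in details.
import base64
from urllib.parse import unquote_to_bytes

def parse_data_uri(uri: str):
    # Split "data:<header>,<payload>" into its two halves.
    header, _, payload = uri[len("data:"):].partition(",")
    fields = header.split(";") if header else []

    # A trailing "base64" field marks a base64-encoded payload.
    is_base64 = bool(fields) and fields[-1] == "base64"
    if is_base64:
        fields = fields[:-1]

    mime_type = None
    attributes = {}
    for i, field in enumerate(fields):
        if "=" in field:
            key, _, value = field.partition("=")
            attributes[key] = value
        elif i == 0:
            mime_type = field  # First bare field is the mimetype, if present.

    data = base64.b64decode(payload) if is_base64 else unquote_to_bytes(payload)
    return mime_type, attributes, data
```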
-def test_file_uris() -> None:
-    # Test file URI with an empty host
-    file_uri = "file:///path/to/file.txt"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc is None
-    assert path == "/path/to/file.txt"
-
-    # Test file URI with no host
-    file_uri = "file:/path/to/file.txt"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc is None
-    assert path == "/path/to/file.txt"
-
-    # Test file URI with localhost
-    file_uri = "file://localhost/path/to/file.txt"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc == "localhost"
-    assert path == "/path/to/file.txt"
-
-    # Test file URI with query parameters
-    file_uri = "file:///path/to/file.txt?param=value"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc is None
-    assert path == "/path/to/file.txt"
-
-    # Test file URI with fragment
-    file_uri = "file:///path/to/file.txt#fragment"
-    netloc, path = file_uri_to_path(file_uri)
-    assert netloc is None
-    assert path == "/path/to/file.txt"
-
-
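Likewise, the removed `test_file_uris` assertions amount to standard URI splitting: an empty or absent authority maps to `None`, and query and fragment parts never leak into the path. A sketch consistent with those assertions, again not necessarily markitdown's real `_uri_utils` code:

```python
# A sketch of file:// handling matching the removed assertions above;
# the real implementation may handle extra cases (e.g., Windows drive letters).
from urllib.parse import unquote, urlparse

def file_uri_to_path(file_uri: str):
    parsed = urlparse(file_uri)  # ?query and #fragment land in separate fields
    netloc = parsed.netloc or None  # "" (empty or missing host) becomes None
    return netloc, unquote(parsed.path)
```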
 def test_docx_comments() -> None:
     markitdown = MarkItDown()

@@ -389,8 +314,6 @@ if __name__ == "__main__":
     """Runs this file's tests from the command line."""
     for test in [
         test_stream_info_operations,
-        test_data_uris,
-        test_file_uris,
         test_docx_comments,
         test_input_as_strings,
         test_markitdown_remote,
@@ -3,14 +3,12 @@ import os
 import time
 import pytest
 import codecs
-import base64

-from pathlib import Path

 if __name__ == "__main__":
-    from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
+    from _test_vectors import GENERAL_TEST_VECTORS
 else:
-    from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
+    from ._test_vectors import GENERAL_TEST_VECTORS

 from markitdown import (
     MarkItDown,

@@ -49,6 +47,7 @@ def test_guess_stream_info(test_vector):
     # mimetype or extension, so we'll special-case them here.
     if test_vector.filename in [
         "test_outlook_msg.msg",
+        "test_mskanji.csv",  # See: https://github.com/google/magika/issues/983
     ]:
         return

@@ -97,6 +96,15 @@ def test_convert_stream_without_hints(test_vector):
     """Test the conversion of a stream with no stream info."""
     markitdown = MarkItDown()

+    # For some limited exceptions, we can't guarantee the exact
+    # mimetype or extension, so we'll special-case them here.
+    if test_vector.filename in [
+        # This appears to be a subtle bug in magika.
+        # See: https://github.com/google/magika/issues/983
+        "test_mskanji.csv",
+    ]:
+        return
+
     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
         result = markitdown.convert(stream, url=test_vector.url)
         for string in test_vector.must_include:
@@ -110,8 +118,8 @@ def test_convert_stream_without_hints(test_vector):
     reason="do not run tests that query external urls",
 )
 @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
-def test_convert_http_uri(test_vector):
-    """Test the conversion of an HTTP:// or HTTPS:// URI."""
+def test_convert_url(test_vector):
+    """Test the conversion of a stream with no stream info."""
     markitdown = MarkItDown()

     time.sleep(1)  # Ensure we don't hit rate limits
@@ -126,96 +134,16 @@ def test_convert_http_uri(test_vector):
     assert string not in result.markdown


-@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
-def test_convert_file_uri(test_vector):
-    """Test the conversion of a file:// URI."""
-    markitdown = MarkItDown()
-
-    result = markitdown.convert(
-        Path(os.path.join(TEST_FILES_DIR, test_vector.filename)).as_uri(),
-        url=test_vector.url,
-    )
-    for string in test_vector.must_include:
-        assert string in result.markdown
-    for string in test_vector.must_not_include:
-        assert string not in result.markdown
-
-
-@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
-def test_convert_data_uri(test_vector):
-    """Test the conversion of a data URI."""
-    markitdown = MarkItDown()
-
-    data = ""
-    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
-        data = base64.b64encode(stream.read()).decode("utf-8")
-    mimetype = test_vector.mimetype
-    data_uri = f"data:{mimetype};base64,{data}"
-
-    result = markitdown.convert(
-        data_uri,
-        url=test_vector.url,
-    )
-    for string in test_vector.must_include:
-        assert string in result.markdown
-    for string in test_vector.must_not_include:
-        assert string not in result.markdown
-
-
-@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
-def test_convert_keep_data_uris(test_vector):
-    """Test API functionality when keep_data_uris is enabled"""
-    markitdown = MarkItDown()
-
-    # Test local file conversion
-    result = markitdown.convert(
-        os.path.join(TEST_FILES_DIR, test_vector.filename),
-        keep_data_uris=True,
-        url=test_vector.url,
-    )
-
-    for string in test_vector.must_include:
-        assert string in result.markdown
-    for string in test_vector.must_not_include:
-        assert string not in result.markdown
-
-
-@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
-def test_convert_stream_keep_data_uris(test_vector):
-    """Test the conversion of a stream with no stream info."""
-    markitdown = MarkItDown()
-
-    stream_info = StreamInfo(
-        extension=os.path.splitext(test_vector.filename)[1],
-        mimetype=test_vector.mimetype,
-        charset=test_vector.charset,
-    )
-
-    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
-        result = markitdown.convert(
-            stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
-        )
-
-    for string in test_vector.must_include:
-        assert string in result.markdown
-    for string in test_vector.must_not_include:
-        assert string not in result.markdown
-
-
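Taken together, the tests removed above covered two API behaviors that this commit drops: passing a `data:` URI directly to `convert()`, and preserving embedded `data:` URIs via `keep_data_uris=True`. A condensed, hypothetical sketch of both call paths, assuming a markitdown version that still supports them (`example.html` is a placeholder input):

```python
# Hypothetical usage mirroring the removed API tests; assumes a markitdown
# version where convert() still accepts data: URIs and keep_data_uris.
import base64
from markitdown import MarkItDown

md = MarkItDown()

# Convert directly from a data: URI built out of a local file...
with open("example.html", "rb") as f:
    payload = base64.b64encode(f.read()).decode("utf-8")
result = md.convert(f"data:text/html;base64,{payload}")

# ...or convert the file itself, keeping embedded data: URIs verbatim.
result = md.convert("example.html", keep_data_uris=True)
print(result.markdown[:200])
```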
 if __name__ == "__main__":
     import sys

     """Runs this file's tests from the command line."""

-    # General tests
     for test_function in [
         test_guess_stream_info,
         test_convert_local,
         test_convert_stream_with_hints,
         test_convert_stream_without_hints,
-        test_convert_http_uri,
-        test_convert_file_uri,
-        test_convert_data_uri,
+        test_convert_url,
     ]:
         for test_vector in GENERAL_TEST_VECTORS:
             print(

@@ -223,17 +151,4 @@ if __name__ == "__main__":
             )
             test_function(test_vector)
             print("OK")
-
-    # Data URI tests
-    for test_function in [
-        test_convert_keep_data_uris,
-        test_convert_stream_keep_data_uris,
-    ]:
-        for test_vector in DATA_URI_TEST_VECTORS:
-            print(
-                f"Running {test_function.__name__} on {test_vector.filename}...", end=""
-            )
-            test_function(test_vector)
-            print("OK")
-
     print("All tests passed!")