8 Commits

Author SHA1 Message Date
afourney
c1f9a323ee Bump version. (#1154) 2025-03-24 23:26:30 -07:00
afourney
e928b43afb convert_url renamed to convert_uri, and now handles data and file URIs (#1153) 2025-03-24 21:43:04 -07:00
afourney
2ffe6ea591 Bump version. (#1150) 2025-03-22 11:21:32 -07:00
afourney
efc55b260d Bump version and resolve a console encoding error. (#1149) 2025-03-21 09:27:25 -07:00
Yuzhong Zhang
52432bd228 Add support for preserving base64-encoded images (#1140)
* Optionally preserve base64 image strings in markdown (_CustomMarkdownify) and pptx
* Add keep_data_uris parameter support to the other converters
* Fix linter issues
* Use **kwargs to pass the keep_data_uris parameter
* Add module and CLI test vectors
* Fix formatting and adjust tests (a usage sketch follows the commit list below)
2025-03-20 18:50:23 -07:00
afourney
c0a511ecff Updated docx file to include an image. (#1146) 2025-03-20 12:25:56 -07:00
afourney
cd6aa41361 Adjust warning filters and update dependencies (#1143)
* Adjusts warning filters to be more contextual
* Updates dependencies for magika and youtube-transcript-api
* Updates the version to 0.1.0a5 in __about__.py
2025-03-19 22:09:14 -07:00
afourney
716f74dcb9 Consider anything with a charset as plain text-convertible. (#1142) 2025-03-19 20:46:35 -07:00
22 changed files with 507 additions and 83 deletions
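
The headline change in this batch is #1140's `keep_data_uris` option. A minimal sketch of the API surface, assuming a local PPTX file (the path is a placeholder); the `convert(..., keep_data_uris=True)` call mirrors the API tests at the end of this diff:

```python
from markitdown import MarkItDown

md = MarkItDown()

# By default, base64 image payloads are truncated to e.g.
# "data:image/png;base64..."; keep_data_uris=True preserves them in full.
result = md.convert("test.pptx", keep_data_uris=True)  # placeholder path
print(result.markdown)
```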

View File

@@ -6,7 +6,8 @@
> [!IMPORTANT]
> Breaking changes between 0.0.1 to 0.1.0:
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]~=0.1.0a1'` to have backward-compatible behavior.
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior.
> * convert\_stream() now requires a binary file-like object (e.g., a file opened in binary mode, or an io.BytesIO object). This is a breaking change from the previous version, which also accepted text file-like objects like io.StringIO (see the sketch following this list).
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
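
A minimal sketch of the new stream contract, assuming `MarkItDown` and `StreamInfo` are importable from the package root as in the tests below; the input file name is a placeholder:

```python
import io
from markitdown import MarkItDown, StreamInfo

md = MarkItDown()

# A binary file-like object is required (e.g. a file opened in "rb" mode,
# or io.BytesIO); text wrappers like io.StringIO are rejected.
with open("example.docx", "rb") as fh:  # placeholder file
    result = md.convert_stream(
        io.BytesIO(fh.read()), stream_info=StreamInfo(extension=".docx")
    )
print(result.markdown)
```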
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
@@ -37,7 +38,7 @@ are also highly token-efficient.
## Installation
To install MarkItDown, use pip: `pip install 'markitdown[all]~=0.1.0a1'`. Alternatively, you can install it from the source:
To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
```bash
git clone git@github.com:microsoft/markitdown.git
```

View File

@@ -27,7 +27,7 @@ dependencies = [
"beautifulsoup4",
"requests",
"markdownify",
"magika>=0.6.1rc3",
"magika~=0.6.1",
"charset-normalizer",
]
@@ -42,7 +42,7 @@ all = [
"olefile",
"pydub",
"SpeechRecognition",
"youtube-transcript-api",
"youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence",
"azure-identity"
]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.0a4"
__version__ = "0.1.1"

View File

@@ -4,6 +4,7 @@
import argparse
import sys
import codecs
import locale
from textwrap import dedent
from importlib.metadata import entry_points
from .__about__ import __version__
@@ -104,6 +105,12 @@ def main():
help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
)
parser.add_argument(
"--keep-data-uris",
action="store_true",
help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
@@ -181,9 +188,15 @@ def main():
markitdown = MarkItDown(enable_plugins=args.use_plugins)
if args.filename is None:
result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
result = markitdown.convert_stream(
sys.stdin.buffer,
stream_info=stream_info,
keep_data_uris=args.keep_data_uris,
)
else:
result = markitdown.convert(args.filename, stream_info=stream_info)
result = markitdown.convert(
args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
)
_handle_output(args, result)
@@ -192,9 +205,14 @@ def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.text_content)
f.write(result.markdown)
else:
print(result.text_content)
# Handle stdout encoding errors more gracefully
print(
result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
sys.stdout.encoding
)
)
def _exit_with_error(message: str):
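
The new flag is exercised from the shell in the CLI tests further below; a hypothetical invocation via `subprocess` (paths are placeholders):

```python
import subprocess

# Mirrors the CLI tests later in this diff: convert slides.pptx to out.md,
# keeping base64 data URIs intact in the output.
subprocess.run(
    ["python", "-m", "markitdown", "--keep-data-uris", "-o", "out.md", "slides.pptx"],
    check=True,
)
```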

View File

@@ -20,6 +20,7 @@ import charset_normalizer
import codecs
from ._stream_info import StreamInfo
from ._uri_utils import parse_data_uri, file_uri_to_path
from .converters import (
PlainTextConverter,
@@ -242,9 +243,10 @@ class MarkItDown:
# Local path or url
if isinstance(source, str):
if (
source.startswith("http://")
or source.startswith("https://")
or source.startswith("file://")
source.startswith("http:")
or source.startswith("https:")
or source.startswith("file:")
or source.startswith("data:")
):
# Rename the url argument to mock_url
# (Deprecated -- use stream_info)
@@ -253,7 +255,7 @@ class MarkItDown:
_kwargs["mock_url"] = _kwargs["url"]
del _kwargs["url"]
return self.convert_url(source, stream_info=stream_info, **_kwargs)
return self.convert_uri(source, stream_info=stream_info, **_kwargs)
else:
return self.convert_local(source, stream_info=stream_info, **kwargs)
# Path object
@@ -363,22 +365,80 @@ class MarkItDown:
url: str,
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None,
mock_url: Optional[str] = None,
**kwargs: Any,
) -> DocumentConverterResult:
"""Alias for convert_uri()"""
# convert_url will likely be deprecated in the future in favor of convert_uri
return self.convert_uri(
url,
stream_info=stream_info,
file_extension=file_extension,
mock_url=mock_url,
**kwargs,
)
def convert_uri(
self,
uri: str,
*,
stream_info: Optional[StreamInfo] = None,
file_extension: Optional[str] = None, # Deprecated -- use stream_info
mock_url: Optional[
str
] = None, # Mock the request as if it came from a different URL
**kwargs: Any,
) -> DocumentConverterResult: # TODO: fix kwargs type
# Send a HTTP request to the URL
response = self._requests_session.get(url, stream=True)
response.raise_for_status()
return self.convert_response(
response,
stream_info=stream_info,
file_extension=file_extension,
url=mock_url,
**kwargs,
)
) -> DocumentConverterResult:
uri = uri.strip()
# File URIs
if uri.startswith("file:"):
netloc, path = file_uri_to_path(uri)
if netloc and netloc != "localhost":
raise ValueError(
f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
)
return self.convert_local(
path,
stream_info=stream_info,
file_extension=file_extension,
url=mock_url,
**kwargs,
)
# Data URIs
elif uri.startswith("data:"):
mimetype, attributes, data = parse_data_uri(uri)
base_guess = StreamInfo(
mimetype=mimetype,
charset=attributes.get("charset"),
)
if stream_info is not None:
base_guess = base_guess.copy_and_update(stream_info)
return self.convert_stream(
io.BytesIO(data),
stream_info=base_guess,
file_extension=file_extension,
url=mock_url,
**kwargs,
)
# HTTP/HTTPS URIs
elif uri.startswith("http:") or uri.startswith("https:"):
response = self._requests_session.get(uri, stream=True)
response.raise_for_status()
return self.convert_response(
response,
stream_info=stream_info,
file_extension=file_extension,
url=mock_url,
**kwargs,
)
else:
raise ValueError(
f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
)
def convert_response(
self,

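A short sketch of the new routing, assuming the package-root import used elsewhere in this diff; the `file:` path is hypothetical:

```python
from markitdown import MarkItDown

md = MarkItDown()

# A percent-encoded data: URI; the charset attribute lets the plain-text
# converter accept it. The payload decodes to "Hello, World!".
result = md.convert_uri("data:text/plain;charset=utf-8,Hello%2C%20World%21")
print(result.markdown)

# file: URIs resolve to local paths (netloc must be empty or "localhost"):
# md.convert_uri("file:///path/to/file.txt")  # hypothetical path
```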
View File

@@ -0,0 +1,52 @@
import base64
import os
from typing import Tuple, Dict
from urllib.request import url2pathname
from urllib.parse import urlparse, unquote_to_bytes
def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
"""Convert a file URI to a local file path"""
parsed = urlparse(file_uri)
if parsed.scheme != "file":
raise ValueError(f"Not a file URL: {file_uri}")
netloc = parsed.netloc if parsed.netloc else None
path = os.path.abspath(url2pathname(parsed.path))
return netloc, path
def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
if not uri.startswith("data:"):
raise ValueError("Not a data URI")
header, _, data = uri.partition(",")
if not _:
raise ValueError("Malformed data URI, missing ',' separator")
meta = header[5:] # Strip 'data:'
parts = meta.split(";")
is_base64 = False
# Ends with base64?
if parts[-1] == "base64":
parts.pop()
is_base64 = True
mime_type = None # Normally this would default to text/plain but we won't assume
if len(parts) and len(parts[0]) > 0:
# First part is the mime type
mime_type = parts.pop(0)
attributes: Dict[str, str] = {}
for part in parts:
# Handle key=value pairs in the middle
if "=" in part:
key, value = part.split("=", 1)
attributes[key] = value
elif len(part) > 0:
attributes[part] = ""
content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
return mime_type, attributes, content
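
These helpers can be exercised directly (they are private, so the import path may change); the expected values below match the unit tests later in this diff:

```python
from markitdown._uri_utils import parse_data_uri, file_uri_to_path

# Base64 data URI with a charset attribute.
mime, attrs, data = parse_data_uri(
    "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ=="
)
assert mime == "text/plain"
assert attrs == {"charset": "utf-8"}
assert data == b"Hello, World!"

# File URI with an explicit localhost netloc.
netloc, path = file_uri_to_path("file://localhost/path/to/file.txt")
assert netloc == "localhost"
assert path == "/path/to/file.txt"
```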

View File

@@ -79,7 +79,7 @@ class BingSerpConverter(DocumentConverter):
slug.extract()
# Parse the algorithmic results
_markdownify = _CustomMarkdownify()
_markdownify = _CustomMarkdownify(**kwargs)
results = list()
for result in soup.find_all(class_="b_algo"):
if not hasattr(result, "find_all"):

View File

@@ -73,5 +73,5 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None)
return self._html_converter.convert_string(
mammoth.convert_to_html(file_stream, style_map=style_map).value
mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
)

View File

@@ -56,9 +56,9 @@ class HtmlConverter(DocumentConverter):
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify().convert_soup(soup)
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
assert isinstance(webpage_text, str)

View File

@@ -17,6 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX)
options["keep_data_uris"] = options.get("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
@@ -101,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return alt
# Remove dataURIs
if src.startswith("data:"):
if src.startswith("data:") and not self.options["keep_data_uris"]:
src = src.split(",")[0] + "..."
return "![%s](%s%s)" % (alt, src, title_part)
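
A sketch of the effect on image tags, using the internal `_CustomMarkdownify` directly (private API; the base64 payload is a stub):

```python
from markitdown.converters._markdownify import _CustomMarkdownify

html = '<img alt="logo" src="data:image/png;base64,iVBORw0KGgo=">'  # stub payload

# Default: the payload is truncated after the media type.
print(_CustomMarkdownify().convert(html))
# ![logo](data:image/png;base64...)

# Opt in to keeping the full data URI.
print(_CustomMarkdownify(keep_data_uris=True).convert(html))
# ![logo](data:image/png;base64,iVBORw0KGgo=)
```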

View File

@@ -17,12 +17,16 @@ except ImportError:
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/",
"application/json",
"application/markdown",
]
# Mimetypes to ignore (commonly confused extensions)
IGNORE_MIME_TYPE_PREFIXES = [
"text/vnd.in3d.spot", # .spo which is confused with xls, doc, etc.
"text/vnd.graphviz", # .dot which is confused with xls, doc, etc.
ACCEPTED_FILE_EXTENSIONS = [
".txt",
".text",
".md",
".markdown",
".json",
".jsonl",
]
@@ -38,9 +42,14 @@ class PlainTextConverter(DocumentConverter):
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
for prefix in IGNORE_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return False
# If we have a charset, we can safely assume it's text
# With Magika in the earlier stages, this handles most cases
if stream_info.charset is not None:
return True
# Otherwise, check the mimetype and extension
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
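
A sketch of the new acceptance rule, assuming `PlainTextConverter` and `StreamInfo` keep the import paths shown in this diff:

```python
import io
from markitdown import StreamInfo
from markitdown.converters import PlainTextConverter

conv = PlainTextConverter()

# A known charset alone is now enough to treat the stream as plain-text
# convertible, even when the mimetype is unfamiliar.
info = StreamInfo(mimetype="application/x-unknown", charset="utf-8")
assert conv.accepts(io.BytesIO(b"hello"), info)
```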

View File

@@ -140,13 +140,20 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip()
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
# If keep_data_uris is True, use base64 encoding for images
if kwargs.get("keep_data_uris", False):
blob = shape.image.blob
content_type = shape.image.content_type or "image/png"
b64_string = base64.b64encode(blob).decode("utf-8")
md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
else:
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += "\n![" + alt_text + "](" + filename + ")\n"
# Tables
if self._is_table(shape):
md_content += self._convert_table_to_markdown(shape.table)
md_content += self._convert_table_to_markdown(shape.table, **kwargs)
# Charts
if shape.has_chart:
@@ -193,7 +200,7 @@ class PptxConverter(DocumentConverter):
return True
return False
def _convert_table_to_markdown(self, table):
def _convert_table_to_markdown(self, table, **kwargs):
# Write the table as HTML, then convert it to Markdown
html_table = "<html><body><table>"
first_row = True
@@ -208,7 +215,10 @@ class PptxConverter(DocumentConverter):
first_row = False
html_table += "</table></body></html>"
return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
return (
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+ "\n"
)
def _convert_chart_to_markdown(self, chart):
try:

View File

@@ -28,6 +28,10 @@ CANDIDATE_FILE_EXTENSIONS = [
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
def __init__(self):
super().__init__()
self._kwargs = {}
def accepts(
self,
file_stream: BinaryIO,
@@ -82,6 +86,7 @@ class RssConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
self._kwargs = kwargs
doc = minidom.parse(file_stream)
feed_type = self._feed_type(doc)
@@ -166,7 +171,7 @@ class RssConverter(DocumentConverter):
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
return _CustomMarkdownify().convert_soup(soup)
return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
except BaseException as _:
return content

View File

@@ -7,20 +7,14 @@ from .._exceptions import MissingDependencyException
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
# Suppress some deprecation warnings from the speech_recognition library
# Suppress some warnings on library import
import warnings
warnings.filterwarnings(
"ignore", category=DeprecationWarning, module="speech_recognition"
)
warnings.filterwarnings(
"ignore",
category=SyntaxWarning,
module="pydub", # TODO: Migrate away from pydub
)
import speech_recognition as sr
import pydub
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=SyntaxWarning)
import speech_recognition as sr
import pydub
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()

View File

@@ -76,11 +76,11 @@ class WikipediaConverter(DocumentConverter):
main_title = title_elm.string
# Convert the page
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
body_elm
)
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
**kwargs
).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify().convert_soup(soup)
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
return DocumentConverterResult(
markdown=webpage_text,

View File

@@ -86,7 +86,9 @@ class XlsxConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(html_content).markdown.strip()
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)
@@ -146,7 +148,9 @@ class XlsConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
self._html_converter.convert_string(html_content).markdown.strip()
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)

View File

@@ -4,22 +4,21 @@ import time
import io
import re
import bs4
import warnings
from typing import Any, BinaryIO, Optional, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from ._markdownify import _CustomMarkdownify
# Optional YouTube transcription support
try:
warnings.filterwarnings(
"ignore",
category=SyntaxWarning,
module="youtube_transcript_api", # Patch submitted to youtube-transcript-api
)
from youtube_transcript_api import YouTubeTranscriptApi
# Suppress some warnings on library import
import warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=SyntaxWarning)
# Patch submitted upstream to fix the SyntaxWarning
from youtube_transcript_api import YouTubeTranscriptApi
IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
except ModuleNotFoundError:
@@ -148,6 +147,7 @@ class YouTubeConverter(DocumentConverter):
webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
ytt_api = YouTubeTranscriptApi()
transcript_text = ""
parsed_url = urlparse(stream_info.url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
@@ -159,7 +159,7 @@ class YouTubeConverter(DocumentConverter):
)
# Retry the transcript fetching operation
transcript = self._retry_operation(
lambda: YouTubeTranscriptApi.get_transcript(
lambda: ytt_api.fetch(
video_id, languages=youtube_transcript_languages
),
retries=3, # Retry 3 times
@@ -167,11 +167,8 @@ class YouTubeConverter(DocumentConverter):
)
if transcript:
transcript_text = " ".join(
[part["text"] for part in transcript]
[part.text for part in transcript]
) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception as e:
print(f"Error fetching transcript: {e}")
if transcript_text:

View File

@@ -25,8 +25,11 @@ GENERAL_TEST_VECTORS = [
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"data:image/png;base64...",
],
must_not_include=[
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
must_not_include=[],
),
FileTestVector(
filename="test.xlsx",
@@ -65,8 +68,9 @@ GENERAL_TEST_VECTORS = [
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
"![This phrase of the caption is Human-written.](Picture4.jpg)",
],
must_not_include=[],
must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
),
FileTestVector(
filename="test_outlook_msg.msg",
@@ -230,3 +234,45 @@ GENERAL_TEST_VECTORS = [
must_not_include=[],
),
]
DATA_URI_TEST_VECTORS = [
FileTestVector(
filename="test.docx",
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
charset=None,
url=None,
must_include=[
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"data:image/png;base64,iVBORw0KGgoAAAANSU",
],
must_not_include=[
"data:image/png;base64...",
],
),
FileTestVector(
filename="test.pptx",
mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
charset=None,
url=None,
must_include=[
"2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
"![This phrase of the caption is Human-written.]", # image caption
"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
],
must_not_include=[
"![This phrase of the caption is Human-written.](Picture4.jpg)",
],
),
]

View File

@@ -7,9 +7,17 @@ import locale
from typing import List
if __name__ == "__main__":
from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
from _test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
FileTestVector,
)
else:
from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector
from ._test_vectors import (
GENERAL_TEST_VECTORS,
DATA_URI_TEST_VECTORS,
FileTestVector,
)
from markitdown import (
MarkItDown,
@@ -149,6 +157,39 @@ def test_convert_url(shared_tmp_dir, test_vector):
assert test_string not in stdout
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
"""Test CLI functionality when keep_data_uris is enabled"""
output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
result = subprocess.run(
[
"python",
"-m",
"markitdown",
"--keep-data-uris",
"-o",
output_file,
os.path.join(TEST_FILES_DIR, test_vector.filename),
],
capture_output=True,
text=True,
)
assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
assert os.path.exists(output_file), f"Output file not created: {output_file}"
with open(output_file, "r") as f:
output_data = f.read()
for test_string in test_vector.must_include:
assert test_string in output_data
for test_string in test_vector.must_not_include:
assert test_string not in output_data
os.remove(output_file)
assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
if __name__ == "__main__":
import sys
import tempfile
@@ -156,6 +197,7 @@ if __name__ == "__main__":
"""Runs this file's tests from the command line."""
with tempfile.TemporaryDirectory() as tmp_dir:
# General tests
for test_function in [
test_output_to_stdout,
test_output_to_file,
@@ -169,4 +211,17 @@ if __name__ == "__main__":
)
test_function(tmp_dir, test_vector)
print("OK")
# Data URI tests
for test_function in [
test_output_to_file_with_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...",
end="",
)
test_function(tmp_dir, test_vector)
print("OK")
print("All tests passed!")

packages/markitdown/tests/test_files/test.docx (binary; mode changed: Normal file → Executable file)

Binary file not shown.

View File

@@ -5,6 +5,8 @@ import shutil
import openai
import pytest
from markitdown._uri_utils import parse_data_uri, file_uri_to_path
from markitdown import (
MarkItDown,
UnsupportedFormatException,
@@ -176,6 +178,79 @@ def test_stream_info_operations() -> None:
assert updated_stream_info.url == "url.1"
def test_data_uris() -> None:
# Test basic parsing of data URIs
data_uri = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ=="
mime_type, attributes, data = parse_data_uri(data_uri)
assert mime_type == "text/plain"
assert len(attributes) == 0
assert data == b"Hello, World!"
data_uri = "data:base64,SGVsbG8sIFdvcmxkIQ=="
mime_type, attributes, data = parse_data_uri(data_uri)
assert mime_type is None
assert len(attributes) == 0
assert data == b"Hello, World!"
data_uri = "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ=="
mime_type, attributes, data = parse_data_uri(data_uri)
assert mime_type == "text/plain"
assert len(attributes) == 1
assert attributes["charset"] == "utf-8"
assert data == b"Hello, World!"
data_uri = "data:,Hello%2C%20World%21"
mime_type, attributes, data = parse_data_uri(data_uri)
assert mime_type is None
assert len(attributes) == 0
assert data == b"Hello, World!"
data_uri = "data:text/plain,Hello%2C%20World%21"
mime_type, attributes, data = parse_data_uri(data_uri)
assert mime_type == "text/plain"
assert len(attributes) == 0
assert data == b"Hello, World!"
data_uri = "data:text/plain;charset=utf-8,Hello%2C%20World%21"
mime_type, attributes, data = parse_data_uri(data_uri)
assert mime_type == "text/plain"
assert len(attributes) == 1
assert attributes["charset"] == "utf-8"
assert data == b"Hello, World!"
def test_file_uris() -> None:
# Test file URI with an empty host
file_uri = "file:///path/to/file.txt"
netloc, path = file_uri_to_path(file_uri)
assert netloc is None
assert path == "/path/to/file.txt"
# Test file URI with no host
file_uri = "file:/path/to/file.txt"
netloc, path = file_uri_to_path(file_uri)
assert netloc is None
assert path == "/path/to/file.txt"
# Test file URI with localhost
file_uri = "file://localhost/path/to/file.txt"
netloc, path = file_uri_to_path(file_uri)
assert netloc == "localhost"
assert path == "/path/to/file.txt"
# Test file URI with query parameters
file_uri = "file:///path/to/file.txt?param=value"
netloc, path = file_uri_to_path(file_uri)
assert netloc is None
assert path == "/path/to/file.txt"
# Test file URI with fragment
file_uri = "file:///path/to/file.txt#fragment"
netloc, path = file_uri_to_path(file_uri)
assert netloc is None
assert path == "/path/to/file.txt"
def test_docx_comments() -> None:
markitdown = MarkItDown()
@@ -314,6 +389,8 @@ if __name__ == "__main__":
"""Runs this file's tests from the command line."""
for test in [
test_stream_info_operations,
test_data_uris,
test_file_uris,
test_docx_comments,
test_input_as_strings,
test_markitdown_remote,

View File

@@ -3,12 +3,14 @@ import os
import time
import pytest
import codecs
import base64
from pathlib import Path
if __name__ == "__main__":
from _test_vectors import GENERAL_TEST_VECTORS
from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
else:
from ._test_vectors import GENERAL_TEST_VECTORS
from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
from markitdown import (
MarkItDown,
@@ -108,8 +110,8 @@ def test_convert_stream_without_hints(test_vector):
reason="do not run tests that query external urls",
)
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_url(test_vector):
"""Test the conversion of a stream with no stream info."""
def test_convert_http_uri(test_vector):
"""Test the conversion of an HTTP:// or HTTPS:// URI."""
markitdown = MarkItDown()
time.sleep(1) # Ensure we don't hit rate limits
@@ -124,16 +126,96 @@ def test_convert_url(test_vector):
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_file_uri(test_vector):
"""Test the conversion of a file:// URI."""
markitdown = MarkItDown()
result = markitdown.convert(
Path(os.path.join(TEST_FILES_DIR, test_vector.filename)).as_uri(),
url=test_vector.url,
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
def test_convert_data_uri(test_vector):
"""Test the conversion of a data URI."""
markitdown = MarkItDown()
data = ""
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
data = base64.b64encode(stream.read()).decode("utf-8")
mimetype = test_vector.mimetype
data_uri = f"data:{mimetype};base64,{data}"
result = markitdown.convert(
data_uri,
url=test_vector.url,
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_keep_data_uris(test_vector):
"""Test API functionality when keep_data_uris is enabled"""
markitdown = MarkItDown()
# Test local file conversion
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, test_vector.filename),
keep_data_uris=True,
url=test_vector.url,
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
def test_convert_stream_keep_data_uris(test_vector):
"""Test stream conversion when keep_data_uris is enabled"""
markitdown = MarkItDown()
stream_info = StreamInfo(
extension=os.path.splitext(test_vector.filename)[1],
mimetype=test_vector.mimetype,
charset=test_vector.charset,
)
with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
result = markitdown.convert(
stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
)
for string in test_vector.must_include:
assert string in result.markdown
for string in test_vector.must_not_include:
assert string not in result.markdown
if __name__ == "__main__":
import sys
"""Runs this file's tests from the command line."""
# General tests
for test_function in [
test_guess_stream_info,
test_convert_local,
test_convert_stream_with_hints,
test_convert_stream_without_hints,
test_convert_url,
test_convert_http_uri,
test_convert_file_uri,
test_convert_data_uri,
]:
for test_vector in GENERAL_TEST_VECTORS:
print(
@@ -141,4 +223,17 @@ if __name__ == "__main__":
)
test_function(test_vector)
print("OK")
# Data URI tests
for test_function in [
test_convert_keep_data_uris,
test_convert_stream_keep_data_uris,
]:
for test_vector in DATA_URI_TEST_VECTORS:
print(
f"Running {test_function.__name__} on {test_vector.filename}...", end=""
)
test_function(test_vector)
print("OK")
print("All tests passed!")