Bump version and resolve a console encoding error. (#1149)

Add support for preserving base64 encoded images (#1140)
* Optionally preserve base64 strings in Markdown output (_CustomMarkdownify and pptx) * Add support for the parameter in the other converters * Fix linter * Use **kwargs to pass the keep_data_uris parameter. * Add module and CLI vector tests * Fixed formatting, and adjusted tests.
2025-03-21 09:27:25 -07:00 · 2025-03-20 18:50:23 -07:00 · 2025-03-20 12:25:56 -07:00
14 changed files with 223 additions and 29 deletions
--- a/packages/markitdown/src/markitdown/about.py
+++ b/packages/markitdown/src/markitdown/about.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.0a5"
+__version__ = "0.1.0a6"
--- a/packages/markitdown/src/markitdown/main.py
+++ b/packages/markitdown/src/markitdown/main.py
@@ -4,6 +4,7 @@
 import argparse
 import sys
 import codecs
+import locale
 from textwrap import dedent
 from importlib.metadata import entry_points
 from .__about__ import __version__
@@ -104,6 +105,12 @@ def main():
        help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
    )

+    parser.add_argument(
+        "--keep-data-uris",
+        action="store_true",
+        help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
+    )
+
    parser.add_argument("filename", nargs="?")
    args = parser.parse_args()

@@ -181,9 +188,15 @@ def main():
        markitdown = MarkItDown(enable_plugins=args.use_plugins)

    if args.filename is None:
-        result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
+        result = markitdown.convert_stream(
+            sys.stdin.buffer,
+            stream_info=stream_info,
+            keep_data_uris=args.keep_data_uris,
+        )
    else:
-        result = markitdown.convert(args.filename, stream_info=stream_info)
+        result = markitdown.convert(
+            args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
+        )

    _handle_output(args, result)

@@ -192,9 +205,14 @@ def _handle_output(args, result: DocumentConverterResult):
    """Handle output to stdout or file"""
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
-            f.write(result.text_content)
+            f.write(result.markdown)
    else:
-        print(result.text_content)
+        # Handle stdout encoding errors more gracefully
+        print(
+            result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
+                sys.stdout.encoding
+            )
+        )


 def _exit_with_error(message: str):
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@@ -79,7 +79,7 @@ class BingSerpConverter(DocumentConverter):
            slug.extract()

        # Parse the algorithmic results
-        _markdownify = _CustomMarkdownify()
+        _markdownify = _CustomMarkdownify(**kwargs)
        results = list()
        for result in soup.find_all(class_="b_algo"):
            if not hasattr(result, "find_all"):
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -73,5 +73,5 @@ class DocxConverter(HtmlConverter):

        style_map = kwargs.get("style_map", None)
        return self._html_converter.convert_string(
-            mammoth.convert_to_html(file_stream, style_map=style_map).value
+            mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
        )
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -56,9 +56,9 @@ class HtmlConverter(DocumentConverter):
        body_elm = soup.find("body")
        webpage_text = ""
        if body_elm:
-            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
        else:
-            webpage_text = _CustomMarkdownify().convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)

        assert isinstance(webpage_text, str)

--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -17,6 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):

    def __init__(self, **options: Any):
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
+        options["keep_data_uris"] = options.get("keep_data_uris", False)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

@@ -101,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
            return alt

        # Remove dataURIs
-        if src.startswith("data:"):
+        if src.startswith("data:") and not self.options["keep_data_uris"]:
            src = src.split(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -140,13 +140,20 @@ class PptxConverter(DocumentConverter):
                    alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
                    alt_text = re.sub(r"\s+", " ", alt_text).strip()

-                    # A placeholder name
-                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
-                    md_content += "\n![" + alt_text + "](" + filename + ")\n"
+                    # If keep_data_uris is True, use base64 encoding for images
+                    if kwargs.get("keep_data_uris", False):
+                        blob = shape.image.blob
+                        content_type = shape.image.content_type or "image/png"
+                        b64_string = base64.b64encode(blob).decode("utf-8")
+                        md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
+                    else:
+                        # A placeholder name
+                        filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                        md_content += "\n![" + alt_text + "](" + filename + ")\n"

                # Tables
                if self._is_table(shape):
-                    md_content += self._convert_table_to_markdown(shape.table)
+                    md_content += self._convert_table_to_markdown(shape.table, **kwargs)

                # Charts
                if shape.has_chart:
@@ -193,7 +200,7 @@ class PptxConverter(DocumentConverter):
            return True
        return False

-    def _convert_table_to_markdown(self, table):
+    def _convert_table_to_markdown(self, table, **kwargs):
        # Write the table as HTML, then convert it to Markdown
        html_table = "<html><body><table>"
        first_row = True
@@ -208,7 +215,10 @@ class PptxConverter(DocumentConverter):
            first_row = False
        html_table += "</table></body></html>"

-        return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
+        return (
+            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+            + "\n"
+        )

    def _convert_chart_to_markdown(self, chart):
        try:
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@@ -28,6 +28,10 @@ CANDIDATE_FILE_EXTENSIONS = [
 class RssConverter(DocumentConverter):
    """Convert RSS / Atom type to markdown"""

+    def __init__(self):
+        super().__init__()
+        self._kwargs = {}
+
    def accepts(
        self,
        file_stream: BinaryIO,
@@ -82,6 +86,7 @@ class RssConverter(DocumentConverter):
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
+        self._kwargs = kwargs
        doc = minidom.parse(file_stream)
        feed_type = self._feed_type(doc)

@@ -166,7 +171,7 @@ class RssConverter(DocumentConverter):
        try:
            # using bs4 because many RSS feeds have HTML-styled content
            soup = BeautifulSoup(content, "html.parser")
-            return _CustomMarkdownify().convert_soup(soup)
+            return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
        except BaseException as _:
            return content

--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@@ -76,11 +76,11 @@ class WikipediaConverter(DocumentConverter):
                main_title = title_elm.string

            # Convert the page
-            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
-                body_elm
-            )
+            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
+                **kwargs
+            ).convert_soup(body_elm)
        else:
-            webpage_text = _CustomMarkdownify().convert_soup(soup)
+            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)

        return DocumentConverterResult(
            markdown=webpage_text,
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -86,7 +86,9 @@ class XlsxConverter(DocumentConverter):
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
            md_content += (
-                self._html_converter.convert_string(html_content).markdown.strip()
+                self._html_converter.convert_string(
+                    html_content, **kwargs
+                ).markdown.strip()
                + "\n\n"
            )

@@ -146,7 +148,9 @@ class XlsConverter(DocumentConverter):
            md_content += f"## {s}\n"
            html_content = sheets[s].to_html(index=False)
            md_content += (
-                self._html_converter.convert_string(html_content).markdown.strip()
+                self._html_converter.convert_string(
+                    html_content, **kwargs
+                ).markdown.strip()
                + "\n\n"
            )

--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -25,8 +25,11 @@ GENERAL_TEST_VECTORS = [
            "# Abstract",
            "# Introduction",
            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+            "data:image/png;base64...",
+        ],
+        must_not_include=[
+            "data:image/png;base64,iVBORw0KGgoAAAANSU",
        ],
-        must_not_include=[],
    ),
    FileTestVector(
        filename="test.xlsx",
@@ -65,8 +68,9 @@ GENERAL_TEST_VECTORS = [
            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
            "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
            "2003",  # chart value
+            "![This phrase of the caption is Human-written.](Picture4.jpg)",
        ],
-        must_not_include=[],
+        must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"],
    ),
    FileTestVector(
        filename="test_outlook_msg.msg",
@@ -230,3 +234,45 @@ GENERAL_TEST_VECTORS = [
        must_not_include=[],
    ),
 ]
+
+
+DATA_URI_TEST_VECTORS = [
+    FileTestVector(
+        filename="test.docx",
+        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        charset=None,
+        url=None,
+        must_include=[
+            "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+            "49e168b7-d2ae-407f-a055-2167576f39a1",
+            "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+            "# Abstract",
+            "# Introduction",
+            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+            "data:image/png;base64,iVBORw0KGgoAAAANSU",
+        ],
+        must_not_include=[
+            "data:image/png;base64...",
+        ],
+    ),
+    FileTestVector(
+        filename="test.pptx",
+        mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        charset=None,
+        url=None,
+        must_include=[
+            "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
+            "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
+            "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
+            "1b92870d-e3b5-4e65-8153-919f4ff45592",
+            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+            "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
+            "2003",  # chart value
+            "![This phrase of the caption is Human-written.]",  # image caption
+            "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
+        ],
+        must_not_include=[
+            "![This phrase of the caption is Human-written.](Picture4.jpg)",
+        ],
+    ),
+]
--- a/packages/markitdown/tests/test_cli_vectors.py
+++ b/packages/markitdown/tests/test_cli_vectors.py
@@ -7,9 +7,17 @@ import locale
 from typing import List

 if __name__ == "__main__":
-    from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
+    from _test_vectors import (
+        GENERAL_TEST_VECTORS,
+        DATA_URI_TEST_VECTORS,
+        FileTestVector,
+    )
 else:
-    from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector
+    from ._test_vectors import (
+        GENERAL_TEST_VECTORS,
+        DATA_URI_TEST_VECTORS,
+        FileTestVector,
+    )

 from markitdown import (
    MarkItDown,
@@ -149,6 +157,39 @@ def test_convert_url(shared_tmp_dir, test_vector):
        assert test_string not in stdout


+@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
+def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
+    """Run the CLI with --keep-data-uris and -o, then check the written file"""
+
+    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
+    result = subprocess.run(
+        [
+            "python",
+            "-m",
+            "markitdown",
+            "--keep-data-uris",
+            "-o",
+            output_file,
+            os.path.join(TEST_FILES_DIR, test_vector.filename),
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    assert os.path.exists(output_file), f"Output file not created: {output_file}"
+
+    with open(output_file, "r", encoding="utf-8") as f:  # CLI writes UTF-8
+        output_data = f.read()
+        for test_string in test_vector.must_include:
+            assert test_string in output_data
+        for test_string in test_vector.must_not_include:
+            assert test_string not in output_data
+
+    os.remove(output_file)  # clean up so reruns start from a fresh file
+    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
+
+
 if __name__ == "__main__":
    import sys
    import tempfile
@@ -156,6 +197,7 @@ if __name__ == "__main__":
    """Runs this file's tests from the command line."""

    with tempfile.TemporaryDirectory() as tmp_dir:
+        # General tests
        for test_function in [
            test_output_to_stdout,
            test_output_to_file,
@@ -169,4 +211,17 @@ if __name__ == "__main__":
                )
                test_function(tmp_dir, test_vector)
                print("OK")
+
+        # Data URI tests
+        for test_function in [
+            test_output_to_file_with_data_uris,
+        ]:
+            for test_vector in DATA_URI_TEST_VECTORS:
+                print(
+                    f"Running {test_function.__name__} on {test_vector.filename}...",
+                    end="",
+                )
+                test_function(tmp_dir, test_vector)
+                print("OK")
+
    print("All tests passed!")
--- a/packages/markitdown/tests/test_files/test.docx
+++ b/packages/markitdown/tests/test_files/test.docx
--- a/packages/markitdown/tests/test_module_vectors.py
+++ b/packages/markitdown/tests/test_module_vectors.py
@@ -6,9 +6,9 @@ import codecs


 if __name__ == "__main__":
-    from _test_vectors import GENERAL_TEST_VECTORS
+    from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
 else:
-    from ._test_vectors import GENERAL_TEST_VECTORS
+    from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS

 from markitdown import (
    MarkItDown,
@@ -124,10 +124,52 @@ def test_convert_url(test_vector):
        assert string not in result.markdown


+@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
+def test_convert_with_data_uris(test_vector):
+    """Test API functionality when keep_data_uris is enabled"""
+    markitdown = MarkItDown()
+
+    # Test local file conversion
+    result = markitdown.convert(
+        os.path.join(TEST_FILES_DIR, test_vector.filename),
+        keep_data_uris=True,
+        url=test_vector.url,
+    )
+
+    for string in test_vector.must_include:
+        assert string in result.markdown
+    for string in test_vector.must_not_include:
+        assert string not in result.markdown
+
+
+@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
+def test_convert_stream_with_data_uris(test_vector):
+    """Test stream conversion with stream info when keep_data_uris is enabled"""
+    markitdown = MarkItDown()
+
+    stream_info = StreamInfo(
+        extension=os.path.splitext(test_vector.filename)[1],
+        mimetype=test_vector.mimetype,
+        charset=test_vector.charset,
+    )
+
+    with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
+        result = markitdown.convert(
+            stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
+        )
+
+        for string in test_vector.must_include:
+            assert string in result.markdown
+        for string in test_vector.must_not_include:
+            assert string not in result.markdown
+
+
 if __name__ == "__main__":
    import sys

    """Runs this file's tests from the command line."""
+
+    # General tests
    for test_function in [
        test_guess_stream_info,
        test_convert_local,
@@ -141,4 +183,17 @@ if __name__ == "__main__":
            )
            test_function(test_vector)
            print("OK")
+
+    # Data URI tests
+    for test_function in [
+        test_convert_with_data_uris,
+        test_convert_stream_with_data_uris,
+    ]:
+        for test_vector in DATA_URI_TEST_VECTORS:
+            print(
+                f"Running {test_function.__name__} on {test_vector.filename}...", end=""
+            )
+            test_function(test_vector)
+            print("OK")
+
    print("All tests passed!")
Author	SHA1	Message	Date
afourney	efc55b260d	Bump version and resolve a console encoding error. (#1149 )	2025-03-21 09:27:25 -07:00
Yuzhong Zhang	52432bd228	Add support for preserving base64 encoded images (#1140 ) * optional reserve base64 string in markdown _CustomMarkdownify and pptx * add other converter para support * fix linter * Use kwarg to pass keep_data_uri para. Add module cli vector tests * Fixed formatting, and adjusted tests.	2025-03-20 18:50:23 -07:00
afourney	c0a511ecff	Updated docx file to include an image. (#1146 )	2025-03-20 12:25:56 -07:00