diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index 6a5d01b..3d8e396 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -104,6 +104,12 @@ def main():
help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
)
+ parser.add_argument(
+ "--keep-data-uris",
+ action="store_true",
+ help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
+ )
+
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
@@ -181,9 +187,15 @@ def main():
markitdown = MarkItDown(enable_plugins=args.use_plugins)
if args.filename is None:
- result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info)
+ result = markitdown.convert_stream(
+ sys.stdin.buffer,
+ stream_info=stream_info,
+ keep_data_uris=args.keep_data_uris,
+ )
else:
- result = markitdown.convert(args.filename, stream_info=stream_info)
+ result = markitdown.convert(
+ args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
+ )
_handle_output(args, result)
diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
index 3527d28..f65b85f 100644
--- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@@ -79,7 +79,7 @@ class BingSerpConverter(DocumentConverter):
slug.extract()
# Parse the algorithmic results
- _markdownify = _CustomMarkdownify()
+ _markdownify = _CustomMarkdownify(**kwargs)
results = list()
for result in soup.find_all(class_="b_algo"):
if not hasattr(result, "find_all"):
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index c568acb..a9c469f 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -73,5 +73,5 @@ class DocxConverter(HtmlConverter):
style_map = kwargs.get("style_map", None)
return self._html_converter.convert_string(
- mammoth.convert_to_html(file_stream, style_map=style_map).value
+ mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
)
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
index 8a8203d..dabb0d7 100644
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -56,9 +56,9 @@ class HtmlConverter(DocumentConverter):
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
- webpage_text = _CustomMarkdownify().convert_soup(body_elm)
+ webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
else:
- webpage_text = _CustomMarkdownify().convert_soup(soup)
+ webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
assert isinstance(webpage_text, str)
diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index ae99c0b..d98bdfb 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -17,6 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX)
+ options["keep_data_uris"] = options.get("keep_data_uris", False)
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
@@ -101,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
return alt
# Remove dataURIs
- if src.startswith("data:"):
+ if src.startswith("data:") and not self.options["keep_data_uris"]:
src = src.split(",")[0] + "..."
return "" % (alt, src, title_part)
diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
index e855382..087da32 100644
--- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -140,13 +140,20 @@ class PptxConverter(DocumentConverter):
alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
alt_text = re.sub(r"\s+", " ", alt_text).strip()
- # A placeholder name
- filename = re.sub(r"\W", "", shape.name) + ".jpg"
- md_content += "\n\n"
+ # If keep_data_uris is True, use base64 encoding for images
+ if kwargs.get("keep_data_uris", False):
+ blob = shape.image.blob
+ content_type = shape.image.content_type or "image/png"
+ b64_string = base64.b64encode(blob).decode("utf-8")
+ md_content += f"\n\n"
+ else:
+ # A placeholder name
+ filename = re.sub(r"\W", "", shape.name) + ".jpg"
+ md_content += "\n\n"
# Tables
if self._is_table(shape):
- md_content += self._convert_table_to_markdown(shape.table)
+ md_content += self._convert_table_to_markdown(shape.table, **kwargs)
# Charts
if shape.has_chart:
@@ -193,7 +200,7 @@ class PptxConverter(DocumentConverter):
return True
return False
- def _convert_table_to_markdown(self, table):
+ def _convert_table_to_markdown(self, table, **kwargs):
# Write the table as HTML, then convert it to Markdown
html_table = "
"
first_row = True
@@ -208,7 +215,10 @@ class PptxConverter(DocumentConverter):
first_row = False
html_table += "
"
- return self._html_converter.convert_string(html_table).markdown.strip() + "\n"
+ return (
+ self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+ + "\n"
+ )
def _convert_chart_to_markdown(self, chart):
try:
diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py
index 7c80d01..6a0e4c1 100644
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@@ -28,6 +28,10 @@ CANDIDATE_FILE_EXTENSIONS = [
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
+ def __init__(self):
+ super().__init__()
+ self._kwargs = {}  # replaced by the caller's **kwargs on each conversion; forwarded to _CustomMarkdownify
+
def accepts(
self,
file_stream: BinaryIO,
@@ -82,6 +86,7 @@ class RssConverter(DocumentConverter):
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
+ self._kwargs = kwargs
doc = minidom.parse(file_stream)
feed_type = self._feed_type(doc)
@@ -166,7 +171,7 @@ class RssConverter(DocumentConverter):
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
- return _CustomMarkdownify().convert_soup(soup)
+ return _CustomMarkdownify(**self._kwargs).convert_soup(soup)
except BaseException as _:
return content
diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
index 39466c0..c0f7e0e 100644
--- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@@ -76,11 +76,11 @@ class WikipediaConverter(DocumentConverter):
main_title = title_elm.string
# Convert the page
- webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
- body_elm
- )
+ webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
+ **kwargs
+ ).convert_soup(body_elm)
else:
- webpage_text = _CustomMarkdownify().convert_soup(soup)
+ webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
return DocumentConverterResult(
markdown=webpage_text,
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
index 3d0e1ab..28f73a0 100644
--- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -86,7 +86,9 @@ class XlsxConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
- self._html_converter.convert_string(html_content).markdown.strip()
+ self._html_converter.convert_string(
+ html_content, **kwargs
+ ).markdown.strip()
+ "\n\n"
)
@@ -146,7 +148,9 @@ class XlsConverter(DocumentConverter):
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
- self._html_converter.convert_string(html_content).markdown.strip()
+ self._html_converter.convert_string(
+ html_content, **kwargs
+ ).markdown.strip()
+ "\n\n"
)
diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 8610108..4a7b54a 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -25,8 +25,11 @@ GENERAL_TEST_VECTORS = [
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+ "data:image/png;base64...",
+ ],
+ must_not_include=[
+ "",
],
- must_not_include=[],
),
FileTestVector(
filename="test.xlsx",
@@ -65,8 +68,9 @@ GENERAL_TEST_VECTORS = [
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
+ "",
],
- must_not_include=[],
+ must_not_include=[""],
),
FileTestVector(
filename="test_outlook_msg.msg",
@@ -230,3 +234,45 @@ GENERAL_TEST_VECTORS = [
must_not_include=[],
),
]
+
+
+DATA_URI_TEST_VECTORS = [  # expectations for conversions run with keep_data_uris enabled
+    FileTestVector(
+        filename="test.docx",
+        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        charset=None,
+        url=None,
+        must_include=[
+            "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+            "49e168b7-d2ae-407f-a055-2167576f39a1",
+            "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+            "# Abstract",
+            "# Introduction",
+            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+            "data:image/png;base64,",  # an untruncated data URI keeps the comma before its payload
+        ],
+        must_not_include=[
+            "data:image/png;base64...",
+        ],
+    ),
+    FileTestVector(
+        filename="test.pptx",
+        mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        charset=None,
+        url=None,
+        must_include=[
+            "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
+            "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
+            "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
+            "1b92870d-e3b5-4e65-8153-919f4ff45592",
+            "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+            "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
+            "2003",  # chart value
+            "![This phrase of the caption is Human-written.]",  # image caption
+            "![This phrase of the caption is Human-written.](data:image/",  # image embedded as a data URI
+        ],
+        must_not_include=[
+            ".jpg)",  # the placeholder "<shape name>.jpg" image link must be gone
+        ],
+    ),
+]
diff --git a/packages/markitdown/tests/test_cli_vectors.py b/packages/markitdown/tests/test_cli_vectors.py
index 64128d6..6030482 100644
--- a/packages/markitdown/tests/test_cli_vectors.py
+++ b/packages/markitdown/tests/test_cli_vectors.py
@@ -7,9 +7,17 @@ import locale
from typing import List
if __name__ == "__main__":
- from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector
+ from _test_vectors import (
+ GENERAL_TEST_VECTORS,
+ DATA_URI_TEST_VECTORS,
+ FileTestVector,
+ )
else:
- from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector
+ from ._test_vectors import (
+ GENERAL_TEST_VECTORS,
+ DATA_URI_TEST_VECTORS,
+ FileTestVector,
+ )
from markitdown import (
MarkItDown,
@@ -149,6 +157,39 @@ def test_convert_url(shared_tmp_dir, test_vector):
assert test_string not in stdout
+@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
+def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
+    """Run the CLI with --keep-data-uris and -o, then check the written file's contents."""
+
+    output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
+    result = subprocess.run(
+        [
+            "python",
+            "-m",
+            "markitdown",
+            "--keep-data-uris",
+            "-o",
+            output_file,
+            os.path.join(TEST_FILES_DIR, test_vector.filename),
+        ],
+        capture_output=True,
+        text=True,
+    )
+
+    assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
+    assert os.path.exists(output_file), f"Output file not created: {output_file}"
+
+    with open(output_file, "r", encoding="utf-8") as f:  # explicit: do not rely on the locale default
+        output_data = f.read()
+        for test_string in test_vector.must_include:
+            assert test_string in output_data
+        for test_string in test_vector.must_not_include:
+            assert test_string not in output_data
+
+    os.remove(output_file)
+    assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
+
+
if __name__ == "__main__":
import sys
import tempfile
@@ -156,6 +197,7 @@ if __name__ == "__main__":
"""Runs this file's tests from the command line."""
with tempfile.TemporaryDirectory() as tmp_dir:
+ # General tests
for test_function in [
test_output_to_stdout,
test_output_to_file,
@@ -169,4 +211,17 @@ if __name__ == "__main__":
)
test_function(tmp_dir, test_vector)
print("OK")
+
+ # Data URI tests
+ for test_function in [
+ test_output_to_file_with_data_uris,
+ ]:
+ for test_vector in DATA_URI_TEST_VECTORS:
+ print(
+ f"Running {test_function.__name__} on {test_vector.filename}...",
+ end="",
+ )
+ test_function(tmp_dir, test_vector)
+ print("OK")
+
print("All tests passed!")
diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py
index 9afffa5..09e4a2b 100644
--- a/packages/markitdown/tests/test_module_vectors.py
+++ b/packages/markitdown/tests/test_module_vectors.py
@@ -6,9 +6,9 @@ import codecs
if __name__ == "__main__":
- from _test_vectors import GENERAL_TEST_VECTORS
+ from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
else:
- from ._test_vectors import GENERAL_TEST_VECTORS
+ from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS
from markitdown import (
MarkItDown,
@@ -124,10 +124,52 @@ def test_convert_url(test_vector):
assert string not in result.markdown
+@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
+def test_convert_with_data_uris(test_vector):
+ """Convert a local test file with keep_data_uris=True and check required/forbidden substrings."""
+ markitdown = MarkItDown()
+
+ # Convert from a local file path with keep_data_uris enabled
+ result = markitdown.convert(
+ os.path.join(TEST_FILES_DIR, test_vector.filename),
+ keep_data_uris=True,
+ url=test_vector.url,
+ )
+
+ for string in test_vector.must_include:
+ assert string in result.markdown
+ for string in test_vector.must_not_include:
+ assert string not in result.markdown
+
+
+@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
+def test_convert_stream_with_data_uris(test_vector):
+ """Test converting an open binary stream, with explicit stream info, when keep_data_uris is enabled."""
+ markitdown = MarkItDown()
+
+ stream_info = StreamInfo(
+ extension=os.path.splitext(test_vector.filename)[1],
+ mimetype=test_vector.mimetype,
+ charset=test_vector.charset,
+ )
+
+ with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
+ result = markitdown.convert(
+ stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
+ )
+
+ for string in test_vector.must_include:
+ assert string in result.markdown
+ for string in test_vector.must_not_include:
+ assert string not in result.markdown
+
+
if __name__ == "__main__":
import sys
"""Runs this file's tests from the command line."""
+
+ # General tests
for test_function in [
test_guess_stream_info,
test_convert_local,
@@ -141,4 +183,17 @@ if __name__ == "__main__":
)
test_function(test_vector)
print("OK")
+
+ # Data URI tests
+ for test_function in [
+ test_convert_with_data_uris,
+ test_convert_stream_with_data_uris,
+ ]:
+ for test_vector in DATA_URI_TEST_VECTORS:
+ print(
+ f"Running {test_function.__name__} on {test_vector.filename}...", end=""
+ )
+ test_function(test_vector)
+ print("OK")
+
print("All tests passed!")