diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6a5d01b..3d8e396 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -104,6 +104,12 @@ def main(): help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.", ) + parser.add_argument( + "--keep-data-uris", + action="store_true", + help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -181,9 +187,15 @@ def main(): markitdown = MarkItDown(enable_plugins=args.use_plugins) if args.filename is None: - result = markitdown.convert_stream(sys.stdin.buffer, stream_info=stream_info) + result = markitdown.convert_stream( + sys.stdin.buffer, + stream_info=stream_info, + keep_data_uris=args.keep_data_uris, + ) else: - result = markitdown.convert(args.filename, stream_info=stream_info) + result = markitdown.convert( + args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris + ) _handle_output(args, result) diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py index 3527d28..f65b85f 100644 --- a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py +++ b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py @@ -79,7 +79,7 @@ class BingSerpConverter(DocumentConverter): slug.extract() # Parse the algorithmic results - _markdownify = _CustomMarkdownify() + _markdownify = _CustomMarkdownify(**kwargs) results = list() for result in soup.find_all(class_="b_algo"): if not hasattr(result, "find_all"): diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index c568acb..a9c469f 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -73,5 +73,5 @@ class DocxConverter(HtmlConverter): style_map = kwargs.get("style_map", None) return self._html_converter.convert_string( - mammoth.convert_to_html(file_stream, style_map=style_map).value + mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs ) diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index 8a8203d..dabb0d7 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -56,9 +56,9 @@ class HtmlConverter(DocumentConverter): body_elm = soup.find("body") webpage_text = "" if body_elm: - webpage_text = _CustomMarkdownify().convert_soup(body_elm) + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) else: - webpage_text = _CustomMarkdownify().convert_soup(soup) + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) assert isinstance(webpage_text, str) diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index ae99c0b..d98bdfb 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -17,6 +17,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): def __init__(self, **options: Any): options["heading_style"] = options.get("heading_style", markdownify.ATX) + options["keep_data_uris"] = options.get("keep_data_uris", False) # Explicitly cast options to the expected type if necessary super().__init__(**options) @@ -101,7 +102,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter): return alt # Remove dataURIs - if src.startswith("data:"): + if src.startswith("data:") and not self.options["keep_data_uris"]: src = src.split(",")[0] + "..." return "![%s](%s%s)" % (alt, src, title_part) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index e855382..087da32 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -140,13 +140,20 @@ class PptxConverter(DocumentConverter): alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text) alt_text = re.sub(r"\s+", " ", alt_text).strip() - # A placeholder name - filename = re.sub(r"\W", "", shape.name) + ".jpg" - md_content += "\n![" + alt_text + "](" + filename + ")\n" + # If keep_data_uris is True, use base64 encoding for images + if kwargs.get("keep_data_uris", False): + blob = shape.image.blob + content_type = shape.image.content_type or "image/png" + b64_string = base64.b64encode(blob).decode("utf-8") + md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n" + else: + # A placeholder name + filename = re.sub(r"\W", "", shape.name) + ".jpg" + md_content += "\n![" + alt_text + "](" + filename + ")\n" # Tables if self._is_table(shape): - md_content += self._convert_table_to_markdown(shape.table) + md_content += self._convert_table_to_markdown(shape.table, **kwargs) # Charts if shape.has_chart: @@ -193,7 +200,7 @@ class PptxConverter(DocumentConverter): return True return False - def _convert_table_to_markdown(self, table): + def _convert_table_to_markdown(self, table, **kwargs): # Write the table as HTML, then convert it to Markdown html_table = "" first_row = True @@ -208,7 +215,10 @@ class PptxConverter(DocumentConverter): first_row = False html_table += "
" - return self._html_converter.convert_string(html_table).markdown.strip() + "\n" + return ( + self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + + "\n" + ) def _convert_chart_to_markdown(self, chart): try: diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index 7c80d01..6a0e4c1 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -28,6 +28,10 @@ CANDIDATE_FILE_EXTENSIONS = [ class RssConverter(DocumentConverter): """Convert RSS / Atom type to markdown""" + def __init__(self): + super().__init__() + self._kwargs = {} + def accepts( self, file_stream: BinaryIO, @@ -82,6 +86,7 @@ class RssConverter(DocumentConverter): stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: + self._kwargs = kwargs doc = minidom.parse(file_stream) feed_type = self._feed_type(doc) @@ -166,7 +171,7 @@ class RssConverter(DocumentConverter): try: # using bs4 because many RSS feeds have HTML-styled content soup = BeautifulSoup(content, "html.parser") - return _CustomMarkdownify().convert_soup(soup) + return _CustomMarkdownify(**self._kwargs).convert_soup(soup) except BaseException as _: return content diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py index 39466c0..c0f7e0e 100644 --- a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py +++ b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py @@ -76,11 +76,11 @@ class WikipediaConverter(DocumentConverter): main_title = title_elm.string # Convert the page - webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup( - body_elm - ) + webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify( + **kwargs + ).convert_soup(body_elm) else: - webpage_text = _CustomMarkdownify().convert_soup(soup) + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) return DocumentConverterResult( markdown=webpage_text, diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 3d0e1ab..28f73a0 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -86,7 +86,9 @@ class XlsxConverter(DocumentConverter): md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) md_content += ( - self._html_converter.convert_string(html_content).markdown.strip() + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + "\n\n" ) @@ -146,7 +148,9 @@ class XlsConverter(DocumentConverter): md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) md_content += ( - self._html_converter.convert_string(html_content).markdown.strip() + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + "\n\n" ) diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py index 8610108..4a7b54a 100644 --- a/packages/markitdown/tests/_test_vectors.py +++ b/packages/markitdown/tests/_test_vectors.py @@ -25,8 +25,11 @@ GENERAL_TEST_VECTORS = [ "# Abstract", "# Introduction", "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "data:image/png;base64...", + ], + must_not_include=[ + "data:image/png;base64,iVBORw0KGgoAAAANSU", ], - must_not_include=[], ), FileTestVector( filename="test.xlsx", @@ -65,8 +68,9 @@ GENERAL_TEST_VECTORS = [ "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title "2003", # chart value + "![This phrase of the caption is Human-written.](Picture4.jpg)", ], - must_not_include=[], + must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"], ), FileTestVector( filename="test_outlook_msg.msg", @@ -230,3 +234,45 @@ GENERAL_TEST_VECTORS = [ must_not_include=[], ), ] + + +DATA_URI_TEST_VECTORS = [ + FileTestVector( + filename="test.docx", + mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + charset=None, + url=None, + must_include=[ + "314b0a30-5b04-470b-b9f7-eed2c2bec74a", + "49e168b7-d2ae-407f-a055-2167576f39a1", + "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", + "# Abstract", + "# Introduction", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "data:image/png;base64,iVBORw0KGgoAAAANSU", + ], + must_not_include=[ + "data:image/png;base64...", + ], + ), + FileTestVector( + filename="test.pptx", + mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation", + charset=None, + url=None, + must_include=[ + "2cdda5c8-e50e-4db4-b5f0-9722a649f455", + "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", + "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", + "1b92870d-e3b5-4e65-8153-919f4ff45592", + "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title + "2003", # chart value + "![This phrase of the caption is Human-written.]", # image caption + "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE", + ], + must_not_include=[ + "![This phrase of the caption is Human-written.](Picture4.jpg)", + ], + ), +] diff --git a/packages/markitdown/tests/test_cli_vectors.py b/packages/markitdown/tests/test_cli_vectors.py index 64128d6..6030482 100644 --- a/packages/markitdown/tests/test_cli_vectors.py +++ b/packages/markitdown/tests/test_cli_vectors.py @@ -7,9 +7,17 @@ import locale from typing import List if __name__ == "__main__": - from _test_vectors import GENERAL_TEST_VECTORS, FileTestVector + from _test_vectors import ( + GENERAL_TEST_VECTORS, + DATA_URI_TEST_VECTORS, + FileTestVector, + ) else: - from ._test_vectors import GENERAL_TEST_VECTORS, FileTestVector + from ._test_vectors import ( + GENERAL_TEST_VECTORS, + DATA_URI_TEST_VECTORS, + FileTestVector, + ) from markitdown import ( MarkItDown, @@ -149,6 +157,39 @@ def test_convert_url(shared_tmp_dir, test_vector): assert test_string not in stdout +@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS) +def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None: + """Test CLI functionality when keep_data_uris is enabled""" + + output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output") + result = subprocess.run( + [ + "python", + "-m", + "markitdown", + "--keep-data-uris", + "-o", + output_file, + os.path.join(TEST_FILES_DIR, test_vector.filename), + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + assert os.path.exists(output_file), f"Output file not created: {output_file}" + + with open(output_file, "r") as f: + output_data = f.read() + for test_string in test_vector.must_include: + assert test_string in output_data + for test_string in test_vector.must_not_include: + assert test_string not in output_data + + os.remove(output_file) + assert not os.path.exists(output_file), f"Output file not deleted: {output_file}" + + if __name__ == "__main__": import sys import tempfile @@ -156,6 +197,7 @@ if __name__ == "__main__": """Runs this file's tests from the command line.""" with tempfile.TemporaryDirectory() as tmp_dir: + # General tests for test_function in [ test_output_to_stdout, test_output_to_file, @@ -169,4 +211,17 @@ if __name__ == "__main__": ) test_function(tmp_dir, test_vector) print("OK") + + # Data URI tests + for test_function in [ + test_output_to_file_with_data_uris, + ]: + for test_vector in DATA_URI_TEST_VECTORS: + print( + f"Running {test_function.__name__} on {test_vector.filename}...", + end="", + ) + test_function(tmp_dir, test_vector) + print("OK") + print("All tests passed!") diff --git a/packages/markitdown/tests/test_module_vectors.py b/packages/markitdown/tests/test_module_vectors.py index 9afffa5..09e4a2b 100644 --- a/packages/markitdown/tests/test_module_vectors.py +++ b/packages/markitdown/tests/test_module_vectors.py @@ -6,9 +6,9 @@ import codecs if __name__ == "__main__": - from _test_vectors import GENERAL_TEST_VECTORS + from _test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS else: - from ._test_vectors import GENERAL_TEST_VECTORS + from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS from markitdown import ( MarkItDown, @@ -124,10 +124,52 @@ def test_convert_url(test_vector): assert string not in result.markdown +@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS) +def test_convert_with_data_uris(test_vector): + """Test API functionality when keep_data_uris is enabled""" + markitdown = MarkItDown() + + # Test local file conversion + result = markitdown.convert( + os.path.join(TEST_FILES_DIR, test_vector.filename), + keep_data_uris=True, + url=test_vector.url, + ) + + for string in test_vector.must_include: + assert string in result.markdown + for string in test_vector.must_not_include: + assert string not in result.markdown + + +@pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS) +def test_convert_stream_with_data_uris(test_vector): + """Test the conversion of a stream with no stream info.""" + markitdown = MarkItDown() + + stream_info = StreamInfo( + extension=os.path.splitext(test_vector.filename)[1], + mimetype=test_vector.mimetype, + charset=test_vector.charset, + ) + + with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: + result = markitdown.convert( + stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url + ) + + for string in test_vector.must_include: + assert string in result.markdown + for string in test_vector.must_not_include: + assert string not in result.markdown + + if __name__ == "__main__": import sys """Runs this file's tests from the command line.""" + + # General tests for test_function in [ test_guess_stream_info, test_convert_local, @@ -141,4 +183,17 @@ if __name__ == "__main__": ) test_function(test_vector) print("OK") + + # Data URI tests + for test_function in [ + test_convert_with_data_uris, + test_convert_stream_with_data_uris, + ]: + for test_vector in DATA_URI_TEST_VECTORS: + print( + f"Running {test_function.__name__} on {test_vector.filename}...", end="" + ) + test_function(test_vector) + print("OK") + print("All tests passed!")