diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 96997cf..17c9dab 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -582,6 +582,10 @@ class PptxConverter(HtmlConverter):
                         "\n" + self._convert(html_table).text_content.strip() + "\n"
                     )
 
+                # Charts
+                if shape.has_chart:
+                    md_content += self._convert_chart_to_markdown(shape.chart)
+
                 # Text areas
                 elif shape.has_text_frame:
                     if shape == title:
@@ -616,6 +620,51 @@ class PptxConverter(HtmlConverter):
             return True
         return False
 
+    def _is_chart(self, shape):
+        """Return True if ``shape`` is reported by python-pptx as a chart."""
+        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.CHART:
+            return True
+        return False
+
+    def _convert_chart_to_markdown(self, chart):
+        """Render a chart as a "### Chart" heading followed by a Markdown table.
+
+        The table has one row per category of the chart's first plot and one
+        column per series. A chart whose data cannot be read (no plots, or a
+        plot type python-pptx does not support) yields a placeholder so one
+        chart cannot abort conversion of the whole presentation.
+        """
+        md = "\n\n### Chart"
+        if chart.has_title:
+            md += f": {chart.chart_title.text_frame.text}"
+        md += "\n\n"
+
+        try:
+            category_names = [c.label for c in chart.plots[0].categories]
+            # Read each series once up front instead of once per table cell.
+            columns = [(s.name, s.values) for s in chart.series]
+        except (IndexError, ValueError, NotImplementedError):
+            # IndexError: the chart has no plots. The other two are what
+            # python-pptx raises for chart types it cannot model.
+            return md + "[unsupported chart]"
+
+        def cell(value):
+            # Keep the table well-formed: blank for missing data, and no raw
+            # pipes or newlines inside a cell.
+            if value is None:
+                return ""
+            return str(value).replace("|", "\\|").replace("\n", " ")
+
+        rows = [["Category"] + [name for name, _ in columns]]
+        for idx, category in enumerate(category_names):
+            # Pad short series so every row has one cell per header column.
+            values = [vals[idx] if idx < len(vals) else None for _, vals in columns]
+            rows.append([category] + values)
+
+        lines = ["| " + " | ".join(map(cell, row)) + " |" for row in rows]
+        separator = "|" + "|".join(["---"] * len(rows[0])) + "|"
+        return md + "\n".join([lines[0], separator] + lines[1:])
+
 
 class MediaConverter(DocumentConverter):
     """
diff --git a/tests/test_files/test.pptx b/tests/test_files/test.pptx
old mode 100755
new mode 100644
index 35eabf4..ea1bbcb
Binary files a/tests/test_files/test.pptx and b/tests/test_files/test.pptx differ
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 94fd886..ee08300 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -57,6 +57,8 @@ PPTX_TEST_STRINGS = [
     "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
     "1b92870d-e3b5-4e65-8153-919f4ff45592",
     "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
+    "2003",  # chart value
 ]
 
 BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"