diff --git a/README.md b/README.md index 0aa788c..40f4b82 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd At present, MarkItDown supports: - PDF -- PowerPoint +- PowerPoint (shapes are read in top-to-bottom, left-to-right order) - Word - Excel - Images (EXIF metadata and OCR) diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py index bea1226..bcde6c9 100644 --- a/packages/markitdown/src/markitdown/converters/_pptx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pptx_converter.py @@ -6,6 +6,7 @@ import re import html from typing import BinaryIO, Any +from operator import attrgetter from ._html_converter import HtmlConverter from ._llm_caption import llm_caption @@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter): # Group Shapes if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: - for subshape in shape.shapes: + sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left")) + for subshape in sorted_shapes: get_shape_content(subshape, **kwargs) - for shape in slide.shapes: + sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left")) + for shape in sorted_shapes: get_shape_content(shape, **kwargs) md_content = md_content.strip()