feat: sort pptx shapes to be parsed in top-to-bottom, left-to-right order (#1104)

* Sort PPTX shapes to be read in top-to-bottom, left-to-right order

Approach adapted from pptx2md/parser.py at 39bef65b31 (line 249)

* Update README.md
* Fix formatting
* Add missing import (`from operator import attrgetter`)
This commit is contained in:
Richard Ye
2025-03-07 18:45:14 -05:00
committed by GitHub
parent 82d84e3edd
commit 0229ff6cb7
2 changed files with 6 additions and 3 deletions

View File

@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
At present, MarkItDown supports: At present, MarkItDown supports:
- PDF - PDF
- PowerPoint - PowerPoint (reading in top-to-bottom, left-to-right order)
- Word - Word
- Excel - Excel
- Images (EXIF metadata and OCR) - Images (EXIF metadata and OCR)

View File

@@ -6,6 +6,7 @@ import re
import html import html
from typing import BinaryIO, Any from typing import BinaryIO, Any
from operator import attrgetter
from ._html_converter import HtmlConverter from ._html_converter import HtmlConverter
from ._llm_caption import llm_caption from ._llm_caption import llm_caption
@@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter):
# Group Shapes # Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
for subshape in shape.shapes: sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs) get_shape_content(subshape, **kwargs)
for shape in slide.shapes: sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
for shape in sorted_shapes:
get_shape_content(shape, **kwargs) get_shape_content(shape, **kwargs)
md_content = md_content.strip() md_content = md_content.strip()