feat: sort pptx shapes to be parsed in top-to-bottom, left-to-right order (#1104)
* Sort PPTX shapes to be read in top-to-bottom, left-to-right order
Adapted from pptx2md/parser.py (line 249) at commit 39bef65b31
* Update README.md
* Fixed formatting.
* Added missing import
This commit is contained in:
@@ -14,7 +14,7 @@ MarkItDown is a lightweight Python utility for converting various files to Markd
|
||||
At present, MarkItDown supports:
|
||||
|
||||
- PDF
|
||||
- PowerPoint
|
||||
- PowerPoint (reading in top-to-bottom, left-to-right order)
|
||||
- Word
|
||||
- Excel
|
||||
- Images (EXIF metadata and OCR)
|
||||
|
||||
@@ -6,6 +6,7 @@ import re
|
||||
import html
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
from operator import attrgetter
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from ._llm_caption import llm_caption
|
||||
@@ -160,10 +161,12 @@ class PptxConverter(DocumentConverter):
|
||||
|
||||
# Group Shapes
|
||||
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
|
||||
for subshape in shape.shapes:
|
||||
sorted_shapes = sorted(shape.shapes, key=attrgetter("top", "left"))
|
||||
for subshape in sorted_shapes:
|
||||
get_shape_content(subshape, **kwargs)
|
||||
|
||||
for shape in slide.shapes:
|
||||
sorted_shapes = sorted(slide.shapes, key=attrgetter("top", "left"))
|
||||
for shape in sorted_shapes:
|
||||
get_shape_content(shape, **kwargs)
|
||||
|
||||
md_content = md_content.strip()
|
||||
|
||||
Reference in New Issue
Block a user