Add optional style_map parameter to the MarkItDown constructor and pass it to converters by default

This commit is contained in:
Ville Puuska
2024-12-15 17:23:57 +02:00
parent 0704b0b6ff
commit 0a7203b876
2 changed files with 14 additions and 0 deletions

View File

@@ -856,6 +856,7 @@ class MarkItDown:
requests_session: Optional[requests.Session] = None,
mlm_client: Optional[Any] = None,
mlm_model: Optional[Any] = None,
style_map: Optional[str] = None,
):
if requests_session is None:
self._requests_session = requests.Session()
@@ -864,6 +865,7 @@ class MarkItDown:
self._mlm_client = mlm_client
self._mlm_model = mlm_model
self._style_map = style_map
self._page_converters: List[DocumentConverter] = []
@@ -1038,6 +1040,9 @@ class MarkItDown:
if "mlm_model" not in _kwargs and self._mlm_model is not None:
_kwargs["mlm_model"] = self._mlm_model
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
# If we hit an error log it and keep trying
try:
res = converter.convert(local_path, **_kwargs)

View File

@@ -150,6 +150,15 @@ def test_markitdown_local() -> None:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
result = markitdown_with_style_map.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
)
for test_string in DOCX_COMMENT_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content
# Test PPTX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
for test_string in PPTX_TEST_STRINGS: