From 3fcd48cdfc651cbf508071c8d2fb7d82aeb075de Mon Sep 17 00:00:00 2001
From: Sathindu <11785398+sathinduga@users.noreply.github.com>
Date: Fri, 28 Mar 2025 18:36:38 -0400
Subject: [PATCH] feat: render math equations in .docx documents (#1160)
* feat: math equation rendering in .docx files
* fix: import fix on .docx pre processing
* test: add test cases for docx equation rendering
* docs: add ThirdPartyNotices.md
* refactor: reformatted with black
---
packages/markitdown/ThirdPartyNotices.md | 232 ++++++++++
packages/markitdown/pyproject.toml | 3 +-
.../markitdown/converter_utils/__init__.py | 0
.../converter_utils/docx/__init__.py | 0
.../converter_utils/docx/math/__init__.py | 0
.../converter_utils/docx/math/latex_dict.py | 273 ++++++++++++
.../converter_utils/docx/math/omml.py | 400 ++++++++++++++++++
.../converter_utils/docx/pre_process.py | 156 +++++++
.../markitdown/converters/_docx_converter.py | 5 +-
.../tests/test_files/equations.docx | Bin 0 -> 15235 bytes
packages/markitdown/tests/test_module_misc.py | 14 +
11 files changed, 1081 insertions(+), 2 deletions(-)
create mode 100644 packages/markitdown/ThirdPartyNotices.md
create mode 100644 packages/markitdown/src/markitdown/converter_utils/__init__.py
create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/__init__.py
create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py
create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py
create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py
create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
create mode 100644 packages/markitdown/tests/test_files/equations.docx
diff --git a/packages/markitdown/ThirdPartyNotices.md b/packages/markitdown/ThirdPartyNotices.md
new file mode 100644
index 0000000..44edd8f
--- /dev/null
+++ b/packages/markitdown/ThirdPartyNotices.md
@@ -0,0 +1,232 @@
+# THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
+
+**Do Not Translate or Localize**
+
+This project incorporates components from the projects listed below. The original copyright notices and the licenses
+under which MarkItDown received such components are set forth below. MarkItDown reserves all rights not expressly
+granted herein, whether by implication, estoppel or otherwise.
+
+1. dwml (https://github.com/xiilei/dwml)
+
+dwml NOTICES AND INFORMATION BEGIN HERE
+
+-----------------------------------------
+
+NOTE 1: What follows is a verbatim copy of dwml's LICENSE file, as it appeared on March 28th, 2025 - including
+placeholders for the copyright owner and year.
+
+NOTE 2: The Apache License, Version 2.0, requires that modifications to the dwml source code be documented.
+The following section summarizes these changes. The full details are available in the MarkItDown source code
+repository under PR #1160 (https://github.com/microsoft/markitdown/pull/1160)
+
+This project incorporates the `dwml/latex_dict.py` and `dwml/omml.py` files without any additional logic modifications;
+the copies live in `packages/markitdown/src/markitdown/converter_utils/docx/math`. The only change is that the code has
+been reformatted with the `black` code formatter. From the `tests/docx.py` file, only the `DOCXML_ROOT` XML namespace
+declarations are used; the rest of that file is not used.
+
+-----------------------------------------
+
+Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright {yyyy} {name of copyright owner}
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+-----------------------------------------
+END OF dwml NOTICES AND INFORMATION
\ No newline at end of file
diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
index 9136108..79f67d2 100644
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -38,6 +38,7 @@ all = [
"pandas",
"openpyxl",
"xlrd",
+ "lxml",
"pdfminer.six",
"olefile",
"pydub",
@@ -47,7 +48,7 @@ all = [
"azure-identity"
]
pptx = ["python-pptx"]
-docx = ["mammoth"]
+docx = ["mammoth", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
diff --git a/packages/markitdown/src/markitdown/converter_utils/__init__.py b/packages/markitdown/src/markitdown/converter_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py b/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py
new file mode 100644
index 0000000..9b47382
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py
@@ -0,0 +1,273 @@
+# -*- coding: utf-8 -*-
+
+"""
+Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
+On 25/03/2025
+"""
+
+from __future__ import unicode_literals
+
+# Characters that escape_latex() in omml.py prefixes with a backslash unless
+# they are already preceded by one.
+CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
+
+# Empty string; used as the separator when joining converted fragments.
+BLANK = ""
+# A single backslash character.
+BACKSLASH = "\\"
+# Separator placed between the cells of one matrix row (see do_mr in omml.py).
+ALN = "&"
+
+# Maps the character found in an OMML ``m:chr`` property (accents and
+# group characters) to a ``str.format`` template; ``{0}`` receives the LaTeX of
+# the base expression. Literal braces are doubled because of ``str.format``.
+#
+# NOTE: differs from upstream dwml in three entries: the templates for U+030A
+# and U+030C had a stray trailing "}" (``str.format`` raised ValueError on
+# them), and U+20EE was misspelled "\underledtarrow".
+CHR = {
+    # Unicode : Latex Math Symbols
+    # Top accents
+    "\u0300": "\\grave{{{0}}}",
+    "\u0301": "\\acute{{{0}}}",
+    "\u0302": "\\hat{{{0}}}",
+    "\u0303": "\\tilde{{{0}}}",
+    "\u0304": "\\bar{{{0}}}",
+    "\u0305": "\\overbar{{{0}}}",
+    "\u0306": "\\breve{{{0}}}",
+    "\u0307": "\\dot{{{0}}}",
+    "\u0308": "\\ddot{{{0}}}",
+    "\u0309": "\\ovhook{{{0}}}",
+    "\u030a": "\\ocirc{{{0}}}",
+    "\u030c": "\\check{{{0}}}",
+    "\u0310": "\\candra{{{0}}}",
+    "\u0312": "\\oturnedcomma{{{0}}}",
+    "\u0315": "\\ocommatopright{{{0}}}",
+    "\u031a": "\\droang{{{0}}}",
+    "\u0338": "\\not{{{0}}}",
+    "\u20d0": "\\leftharpoonaccent{{{0}}}",
+    "\u20d1": "\\rightharpoonaccent{{{0}}}",
+    "\u20d2": "\\vertoverlay{{{0}}}",
+    "\u20d6": "\\overleftarrow{{{0}}}",
+    "\u20d7": "\\vec{{{0}}}",
+    "\u20db": "\\dddot{{{0}}}",
+    "\u20dc": "\\ddddot{{{0}}}",
+    "\u20e1": "\\overleftrightarrow{{{0}}}",
+    "\u20e7": "\\annuity{{{0}}}",
+    "\u20e9": "\\widebridgeabove{{{0}}}",
+    "\u20f0": "\\asteraccent{{{0}}}",
+    # Bottom accents
+    "\u0330": "\\wideutilde{{{0}}}",
+    "\u0331": "\\underbar{{{0}}}",
+    "\u20e8": "\\threeunderdot{{{0}}}",
+    "\u20ec": "\\underrightharpoondown{{{0}}}",
+    "\u20ed": "\\underleftharpoondown{{{0}}}",
+    "\u20ee": "\\underleftarrow{{{0}}}",
+    "\u20ef": "\\underrightarrow{{{0}}}",
+    # Over | group
+    "\u23b4": "\\overbracket{{{0}}}",
+    "\u23dc": "\\overparen{{{0}}}",
+    "\u23de": "\\overbrace{{{0}}}",
+    # Under| group
+    "\u23b5": "\\underbracket{{{0}}}",
+    "\u23dd": "\\underparen{{{0}}}",
+    "\u23df": "\\underbrace{{{0}}}",
+}
+
+# Maps the operator character of an OMML n-ary object (``m:naryPr``/``m:chr``)
+# to the corresponding LaTeX command; consulted by do_nary in omml.py.
+CHR_BO = {
+    # Big operators,
+    "\u2140": "\\Bbbsum",
+    "\u220f": "\\prod",
+    "\u2210": "\\coprod",
+    "\u2211": "\\sum",
+    "\u222b": "\\int",
+    "\u22c0": "\\bigwedge",
+    "\u22c1": "\\bigvee",
+    "\u22c2": "\\bigcap",
+    "\u22c3": "\\bigcup",
+    "\u2a00": "\\bigodot",
+    "\u2a01": "\\bigoplus",
+    "\u2a02": "\\bigotimes",
+}
+
+# Per-character translation table applied by do_r in omml.py to the text of an
+# ``m:t`` run: each character is looked up here and replaced by its LaTeX
+# spelling (characters that are not listed pass through unchanged). LaTeX
+# commands carry a trailing space so they do not run into the next character.
+#
+# NOTE: differs from upstream dwml in these entries, which were wrong there:
+# U+1D6FF is delta (was "\theta "), U+1D707 is mu (was "\m "), U+1D708 is nu
+# (was "\n "), U+1D70F is tau (was "\ta "), U+2193 is "\downarrow " (was
+# "\downright "). The duplicated "\u2192" key was dropped (same value), and
+# U+210E was added as the italic "h".
+T = {
+    # Greek letters
+    "\U0001d6fc": "\\alpha ",
+    "\U0001d6fd": "\\beta ",
+    "\U0001d6fe": "\\gamma ",
+    "\U0001d6ff": "\\delta ",
+    "\U0001d700": "\\epsilon ",
+    "\U0001d701": "\\zeta ",
+    "\U0001d702": "\\eta ",
+    "\U0001d703": "\\theta ",
+    "\U0001d704": "\\iota ",
+    "\U0001d705": "\\kappa ",
+    "\U0001d706": "\\lambda ",
+    "\U0001d707": "\\mu ",
+    "\U0001d708": "\\nu ",
+    "\U0001d709": "\\xi ",
+    "\U0001d70a": "\\omicron ",
+    "\U0001d70b": "\\pi ",
+    "\U0001d70c": "\\rho ",
+    "\U0001d70d": "\\varsigma ",
+    "\U0001d70e": "\\sigma ",
+    "\U0001d70f": "\\tau ",
+    "\U0001d710": "\\upsilon ",
+    "\U0001d711": "\\phi ",
+    "\U0001d712": "\\chi ",
+    "\U0001d713": "\\psi ",
+    "\U0001d714": "\\omega ",
+    "\U0001d715": "\\partial ",
+    "\U0001d716": "\\varepsilon ",
+    "\U0001d717": "\\vartheta ",
+    "\U0001d718": "\\varkappa ",
+    "\U0001d719": "\\varphi ",
+    "\U0001d71a": "\\varrho ",
+    "\U0001d71b": "\\varpi ",
+    # Relation symbols
+    "\u2190": "\\leftarrow ",
+    "\u2191": "\\uparrow ",
+    "\u2192": "\\rightarrow ",
+    "\u2193": "\\downarrow ",
+    "\u2194": "\\leftrightarrow ",
+    "\u2195": "\\updownarrow ",
+    "\u2196": "\\nwarrow ",
+    "\u2197": "\\nearrow ",
+    "\u2198": "\\searrow ",
+    "\u2199": "\\swarrow ",
+    "\u22ee": "\\vdots ",
+    "\u22ef": "\\cdots ",
+    "\u22f0": "\\adots ",
+    "\u22f1": "\\ddots ",
+    "\u2260": "\\ne ",
+    "\u2264": "\\leq ",
+    "\u2265": "\\geq ",
+    "\u2266": "\\leqq ",
+    "\u2267": "\\geqq ",
+    "\u2268": "\\lneqq ",
+    "\u2269": "\\gneqq ",
+    "\u226a": "\\ll ",
+    "\u226b": "\\gg ",
+    "\u2208": "\\in ",
+    "\u2209": "\\notin ",
+    "\u220b": "\\ni ",
+    "\u220c": "\\nni ",
+    # Ordinary symbols
+    "\u221e": "\\infty ",
+    # Binary relations
+    "\u00b1": "\\pm ",
+    "\u2213": "\\mp ",
+    # Italic, Latin, uppercase
+    "\U0001d434": "A",
+    "\U0001d435": "B",
+    "\U0001d436": "C",
+    "\U0001d437": "D",
+    "\U0001d438": "E",
+    "\U0001d439": "F",
+    "\U0001d43a": "G",
+    "\U0001d43b": "H",
+    "\U0001d43c": "I",
+    "\U0001d43d": "J",
+    "\U0001d43e": "K",
+    "\U0001d43f": "L",
+    "\U0001d440": "M",
+    "\U0001d441": "N",
+    "\U0001d442": "O",
+    "\U0001d443": "P",
+    "\U0001d444": "Q",
+    "\U0001d445": "R",
+    "\U0001d446": "S",
+    "\U0001d447": "T",
+    "\U0001d448": "U",
+    "\U0001d449": "V",
+    "\U0001d44a": "W",
+    "\U0001d44b": "X",
+    "\U0001d44c": "Y",
+    "\U0001d44d": "Z",
+    # Italic, Latin, lowercase
+    "\U0001d44e": "a",
+    "\U0001d44f": "b",
+    "\U0001d450": "c",
+    "\U0001d451": "d",
+    "\U0001d452": "e",
+    "\U0001d453": "f",
+    "\U0001d454": "g",
+    # U+1D455 is unassigned in Unicode; the italic "h" is U+210E.
+    "\u210e": "h",
+    "\U0001d456": "i",
+    "\U0001d457": "j",
+    "\U0001d458": "k",
+    "\U0001d459": "l",
+    "\U0001d45a": "m",
+    "\U0001d45b": "n",
+    "\U0001d45c": "o",
+    "\U0001d45d": "p",
+    "\U0001d45e": "q",
+    "\U0001d45f": "r",
+    "\U0001d460": "s",
+    "\U0001d461": "t",
+    "\U0001d462": "u",
+    "\U0001d463": "v",
+    "\U0001d464": "w",
+    "\U0001d465": "x",
+    "\U0001d466": "y",
+    "\U0001d467": "z",
+}
+
+# Function name (text of the ``m:fName`` run) -> LaTeX template. The ``{fe}``
+# placeholder is substituted with the function argument by do_func in omml.py
+# using ``str.replace`` (not ``str.format``).
+FUNC = {
+    "sin": "\\sin({fe})",
+    "cos": "\\cos({fe})",
+    "tan": "\\tan({fe})",
+    "arcsin": "\\arcsin({fe})",
+    "arccos": "\\arccos({fe})",
+    "arctan": "\\arctan({fe})",
+    "arccot": "\\arccot({fe})",
+    "sinh": "\\sinh({fe})",
+    "cosh": "\\cosh({fe})",
+    "tanh": "\\tanh({fe})",
+    "coth": "\\coth({fe})",
+    "sec": "\\sec({fe})",
+    "csc": "\\csc({fe})",
+}
+
+# Placeholder token used in the FUNC templates above.
+FUNC_PLACE = "{fe}"
+
+# Two backslashes: the separator joined between array / matrix rows.
+BRK = "\\\\"
+
+# Accent template used when the accent properties carry no ``chr`` value.
+CHR_DEFAULT = {
+    "ACC_VAL": "\\hat{{{0}}}",
+}
+
+# Bar position (``m:pos`` value) -> template; ``{0}`` receives the base.
+POS = {
+    "top": "\\overline{{{0}}}",  # not sure
+    "bot": "\\underline{{{0}}}",
+}
+
+# Bar template used when the bar properties carry no ``pos`` value.
+POS_DEFAULT = {
+    "BAR_VAL": "\\overline{{{0}}}",
+}
+
+# Subscript template; ``{0}`` receives the subscript content.
+SUB = "_{{{0}}}"
+
+# Superscript template; ``{0}`` receives the superscript content.
+SUP = "^{{{0}}}"
+
+# Fraction type (``m:type`` value) -> template with ``num`` / ``den`` fields.
+F = {
+    "bar": "\\frac{{{num}}}{{{den}}}",
+    "skw": r"^{{{num}}}/_{{{den}}}",
+    "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
+    "lin": "{{{num}}}/{{{den}}}",
+}
+# Fraction template used when no fraction type is given.
+F_DEFAULT = "\\frac{{{num}}}{{{den}}}"
+
+# Delimiter template: ``left`` / ``right`` are the (escaped) delimiter
+# characters and ``text`` is the enclosed expression.
+D = "\\left{left}{text}\\right{right}"
+
+# Delimiter fallbacks: ``left`` / ``right`` apply when begChr / endChr are not
+# specified; ``null`` is emitted when a delimiter is specified but empty.
+D_DEFAULT = {
+    "left": "(",
+    "right": ")",
+    "null": ".",
+}
+
+# Radical with an explicit degree.
+RAD = "\\sqrt[{deg}]{{{text}}}"
+
+# Radical without a degree (square root).
+RAD_DEFAULT = "\\sqrt{{{text}}}"
+
+# Equation array; ``text`` is the rows joined with BRK.
+ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}"
+
+# Base text of a lower-limit object -> template; ``lim`` is the limit
+# expression. do_limlow in omml.py rejects any base that is not listed here.
+LIM_FUNC = {
+    "lim": "\\lim_{{{lim}}}",
+    "max": "\\max_{{{lim}}}",
+    "min": "\\min_{{{lim}}}",
+}
+
+# (search, replacement) pair: do_lim in omml.py rewrites the first item to the
+# second inside limit expressions.
+LIM_TO = ("\\rightarrow", "\\to")
+
+# Upper-limit object: ``lim`` is set above ``text``.
+LIM_UPP = "\\overset{{{lim}}}{{{text}}}"
+
+# Matrix; ``text`` is the rows joined with BRK.
+M = "\\begin{{matrix}}{text}\\end{{matrix}}"
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py
new file mode 100644
index 0000000..03043a8
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py
@@ -0,0 +1,400 @@
+# -*- coding: utf-8 -*-
+
+"""
+Office Math Markup Language (OMML)
+Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
+On 25/03/2025
+"""
+
+import xml.etree.ElementTree as ET
+
+from .latex_dict import (
+ CHARS,
+ CHR,
+ CHR_BO,
+ CHR_DEFAULT,
+ POS,
+ POS_DEFAULT,
+ SUB,
+ SUP,
+ F,
+ F_DEFAULT,
+ T,
+ FUNC,
+ D,
+ D_DEFAULT,
+ RAD,
+ RAD_DEFAULT,
+ ARR,
+ LIM_FUNC,
+ LIM_TO,
+ LIM_UPP,
+ M,
+ BRK,
+ BLANK,
+ BACKSLASH,
+ ALN,
+ FUNC_PLACE,
+)
+
+OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
+
+
+def load(stream):
+    """
+    Parse the XML in *stream* and lazily yield one ``oMath2Latex`` converter
+    for every ``m:oMath`` element that is a direct child of the document root.
+    """
+    document = ET.parse(stream)
+    yield from map(oMath2Latex, document.findall(OMML_NS + "oMath"))
+
+
+def load_string(string):
+    """
+    Parse the XML text *string* and lazily yield one ``oMath2Latex`` converter
+    for every ``m:oMath`` element that is a direct child of the root element.
+    """
+    document_root = ET.fromstring(string)
+    yield from map(oMath2Latex, document_root.findall(OMML_NS + "oMath"))
+
+
+def escape_latex(strs):
+    """
+    Backslash-escape the LaTeX special characters listed in ``CHARS``.
+
+    Every doubled backslash in *strs* is first collapsed to a single one. A
+    special character is left alone when the character right before it is
+    already a backslash.
+    """
+    collapsed = strs.replace(r"\\", "\\")
+    pieces = []
+    previous = None
+    for ch in collapsed:
+        needs_escape = ch in CHARS and previous != BACKSLASH
+        pieces.append(BACKSLASH + ch if needs_escape else ch)
+        previous = ch
+    return BLANK.join(pieces)
+
+
+def get_val(key, default=None, store=CHR):
+    """
+    Translate *key* through the mapping *store*.
+
+    Returns *default* when *key* is None. Otherwise returns ``store[key]`` if
+    present, falling back to *key* itself; a falsy *store* also yields *key*.
+    """
+    if key is None:
+        return default
+    if store:
+        return store.get(key, key)
+    return key
+
+
+class Tag2Method(object):
+    """
+    Base class that dispatches OMML child elements to handler methods.
+
+    Subclasses supply a ``tag2meth`` mapping from a tag name (namespace
+    removed) to a plain function that is called as ``handler(self, elm)``.
+    """
+
+    def call_method(self, elm, stag=None):
+        """
+        run the handler registered for elm,return None when there is none
+        """
+        if stag is None:
+            stag = elm.tag.replace(OMML_NS, "")
+        handler = self.tag2meth.get(stag)
+        return handler(self, elm) if handler else None
+
+    def process_children_list(self, elm, include=None):
+        """
+        process children of the elm,return iterable
+
+        Yields ``(tag name, converted value, child element)`` for each child
+        in the OMML namespace, limited to the tag names in ``include`` when it
+        is given. Children that convert to None are skipped.
+        """
+        for child in list(elm):
+            if OMML_NS not in child.tag:
+                continue
+            short_tag = child.tag.replace(OMML_NS, "")
+            if include and (short_tag not in include):
+                continue
+            converted = self.call_method(child, stag=short_tag)
+            if converted is None:
+                # No registered handler (or it produced nothing): give the
+                # subclass fallback a chance.
+                converted = self.process_unknow(child, short_tag)
+            if converted is not None:
+                yield (short_tag, converted, child)
+
+    def process_children_dict(self, elm, include=None):
+        """
+        process children of the elm,return dict
+
+        Keys are tag names; when a tag occurs more than once the last one wins.
+        """
+        return {
+            short_tag: converted
+            for short_tag, converted, _child in self.process_children_list(
+                elm, include
+            )
+        }
+
+    def process_children(self, elm, include=None):
+        """
+        process children of the elm,return string
+        """
+        fragments = []
+        for _tag, converted, _child in self.process_children_list(elm, include):
+            if isinstance(converted, Tag2Method):
+                converted = str(converted)
+            fragments.append(converted)
+        return BLANK.join(fragments)
+
+    def process_unknow(self, elm, stag):
+        """
+        fallback for tags without a handler; the base class converts nothing
+        """
+        return None
+
+
+class Pr(Tag2Method):
+    """
+    common properties of element
+
+    Wraps an ``m:*Pr`` element. The ``m:val`` attribute of each ``chr``,
+    ``pos``, ``begChr``, ``endChr`` and ``type`` child is recorded and exposed
+    as an attribute of the same name; any attribute that was not recorded reads
+    as None. ``text`` (also returned by ``str()``) holds the output produced by
+    the children, which is the break sequence for every ``m:brk`` child.
+    """
+
+    text = ""
+
+    __val_tags = ("chr", "pos", "begChr", "endChr", "type")
+
+    __innerdict = None  # can't use the __dict__
+
+    def __init__(self, elm):
+        # The store must exist before the children are processed, because the
+        # handlers below write into it.
+        self.__innerdict = {}
+        self.text = self.process_children(elm)
+
+    def __str__(self):
+        return self.text
+
+    def __unicode__(self):
+        # Bound-method call: passing ``self`` again raised TypeError.
+        return self.__str__()
+
+    def __getattr__(self, name):
+        # Only reached when normal lookup fails, i.e. for recorded properties
+        # and for any unknown attribute (which yields None).
+        return self.__innerdict.get(name, None)
+
+    def do_brk(self, elm):
+        """
+        record the break and emit the break sequence
+        """
+        self.__innerdict["brk"] = BRK
+        return BRK
+
+    def do_common(self, elm):
+        """
+        record the m:val attribute of a known property tag,emit nothing
+        """
+        stag = elm.tag.replace(OMML_NS, "")
+        if stag in self.__val_tags:
+            t = elm.get("{0}val".format(OMML_NS))
+            self.__innerdict[stag] = t
+        return None
+
+    tag2meth = {
+        "brk": do_brk,
+        "chr": do_common,
+        "pos": do_common,
+        "begChr": do_common,
+        "endChr": do_common,
+        "type": do_common,
+    }
+
+
+class oMath2Latex(Tag2Method):
+    """
+    Convert oMath element of omml to latex
+
+    The conversion runs in the constructor; the result is available through
+    the ``latex`` property and ``str()``.
+
+    NOTE: differs from upstream dwml in these bug fixes:
+      * unsupported function / limit names raise ``NotImplementedError``
+        (``NotImplemented`` is not callable, so a TypeError was raised);
+      * ``__unicode__`` no longer passes ``self`` twice;
+      * an ``m:r`` without ``m:t`` converts to an empty string;
+      * a missing ``*Pr`` child or a missing ``m:chr`` falls back to the
+        default instead of raising.
+    """
+
+    _t_dict = T
+
+    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")
+
+    def __init__(self, element):
+        self._latex = self.process_children(element)
+
+    def __str__(self):
+        return self.latex
+
+    def __unicode__(self):
+        return self.__str__()
+
+    def process_unknow(self, elm, stag):
+        """
+        fallback for tags without a handler: container tags convert to the
+        concatenation of their children,``*Pr`` tags become a Pr object
+        """
+        if stag in self.__direct_tags:
+            return self.process_children(elm)
+        elif stag[-2:] == "Pr":
+            return Pr(elm)
+        else:
+            return None
+
+    @property
+    def latex(self):
+        return self._latex
+
+    def do_acc(self, elm):
+        """
+        the accent function
+        """
+        c_dict = self.process_children_dict(elm)
+        # accPr may be absent; getattr on None then yields the default accent.
+        pr = c_dict.get("accPr")
+        latex_s = get_val(
+            getattr(pr, "chr", None), default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
+        )
+        return latex_s.format(c_dict["e"])
+
+    def do_bar(self, elm):
+        """
+        the bar function
+        """
+        c_dict = self.process_children_dict(elm)
+        pr = c_dict.get("barPr")  # may be absent
+        latex_s = get_val(
+            getattr(pr, "pos", None), default=POS_DEFAULT.get("BAR_VAL"), store=POS
+        )
+        return getattr(pr, "text", BLANK) + latex_s.format(c_dict["e"])
+
+    def do_d(self, elm):
+        """
+        the delimiter object
+        """
+        c_dict = self.process_children_dict(elm)
+        pr = c_dict.get("dPr")  # may be absent
+        null = D_DEFAULT.get("null")
+        s_val = get_val(
+            getattr(pr, "begChr", None), default=D_DEFAULT.get("left"), store=T
+        )
+        e_val = get_val(
+            getattr(pr, "endChr", None), default=D_DEFAULT.get("right"), store=T
+        )
+        # An explicitly empty delimiter becomes the LaTeX null delimiter ".".
+        return getattr(pr, "text", BLANK) + D.format(
+            left=null if not s_val else escape_latex(s_val),
+            text=c_dict["e"],
+            right=null if not e_val else escape_latex(e_val),
+        )
+
+    def do_spre(self, elm):
+        """
+        the Pre-Sub-Superscript object -- Not support yet
+        """
+        pass
+
+    def do_sub(self, elm):
+        """
+        the subscript part
+        """
+        text = self.process_children(elm)
+        return SUB.format(text)
+
+    def do_sup(self, elm):
+        """
+        the superscript part
+        """
+        text = self.process_children(elm)
+        return SUP.format(text)
+
+    def do_f(self, elm):
+        """
+        the fraction object
+        """
+        c_dict = self.process_children_dict(elm)
+        pr = c_dict.get("fPr")  # may be absent
+        latex_s = get_val(getattr(pr, "type", None), default=F_DEFAULT, store=F)
+        return getattr(pr, "text", BLANK) + latex_s.format(
+            num=c_dict.get("num"), den=c_dict.get("den")
+        )
+
+    def do_func(self, elm):
+        """
+        the Function-Apply object (Examples:sin cos)
+        """
+        c_dict = self.process_children_dict(elm)
+        func_name = c_dict.get("fName")
+        return func_name.replace(FUNC_PLACE, c_dict.get("e"))
+
+    def do_fname(self, elm):
+        """
+        the func name
+
+        Raises NotImplementedError for a function name that is not in FUNC.
+        """
+        latex_chars = []
+        for stag, t, e in self.process_children_list(elm):
+            if stag == "r":
+                if FUNC.get(t):
+                    latex_chars.append(FUNC[t])
+                else:
+                    raise NotImplementedError("Not support func %s" % t)
+            else:
+                latex_chars.append(t)
+        t = BLANK.join(latex_chars)
+        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this
+
+    def do_groupchr(self, elm):
+        """
+        the Group-Character object
+        """
+        c_dict = self.process_children_dict(elm)
+        pr = c_dict.get("groupChrPr")  # may be absent
+        # OMML's default group character is U+23DF (rendered as \underbrace);
+        # without a default a missing m:chr made latex_s None.
+        latex_s = get_val(getattr(pr, "chr", None), default=CHR["\u23df"])
+        return getattr(pr, "text", BLANK) + latex_s.format(c_dict["e"])
+
+    def do_rad(self, elm):
+        """
+        the radical object
+        """
+        c_dict = self.process_children_dict(elm)
+        text = c_dict.get("e")
+        deg_text = c_dict.get("deg")
+        if deg_text:
+            return RAD.format(deg=deg_text, text=text)
+        else:
+            return RAD_DEFAULT.format(text=text)
+
+    def do_eqarr(self, elm):
+        """
+        the Array object
+        """
+        return ARR.format(
+            text=BRK.join(
+                [t for stag, t, e in self.process_children_list(elm, include=("e",))]
+            )
+        )
+
+    def do_limlow(self, elm):
+        """
+        the Lower-Limit object
+
+        Raises NotImplementedError when the base is not a key of LIM_FUNC.
+        """
+        t_dict = self.process_children_dict(elm, include=("e", "lim"))
+        latex_s = LIM_FUNC.get(t_dict["e"])
+        if not latex_s:
+            raise NotImplementedError("Not support lim %s" % t_dict["e"])
+        else:
+            return latex_s.format(lim=t_dict.get("lim"))
+
+    def do_limupp(self, elm):
+        """
+        the Upper-Limit object
+        """
+        t_dict = self.process_children_dict(elm, include=("e", "lim"))
+        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))
+
+    def do_lim(self, elm):
+        """
+        the lower limit of the limLow object and the upper limit of the limUpp function
+        """
+        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])
+
+    def do_m(self, elm):
+        """
+        the Matrix object
+        """
+        rows = []
+        for stag, t, e in self.process_children_list(elm):
+            if stag == "mPr":
+                pass
+            elif stag == "mr":
+                rows.append(t)
+        return M.format(text=BRK.join(rows))
+
+    def do_mr(self, elm):
+        """
+        a single row of the matrix m
+        """
+        return ALN.join(
+            [t for stag, t, e in self.process_children_list(elm, include=("e",))]
+        )
+
+    def do_nary(self, elm):
+        """
+        the n-ary object
+        """
+        res = []
+        bo = ""
+        for stag, t, e in self.process_children_list(elm):
+            if stag == "naryPr":
+                # OMML omits m:chr for the default operator, the integral;
+                # without a default this produced None and broke the join.
+                bo = get_val(t.chr, default=CHR_BO["\u222b"], store=CHR_BO)
+            else:
+                res.append(t)
+        return bo + BLANK.join(res)
+
+    def do_r(self, elm):
+        """
+        Get text from 'r' element,And try convert them to latex symbols
+        @todo text style support , (sty)
+        @todo \\text (latex pure text support)
+        """
+        # findtext returns None when the run has no m:t child.
+        text = elm.findtext("./{0}t".format(OMML_NS)) or BLANK
+        # Translate character by character; unknown characters pass through.
+        return escape_latex(BLANK.join(self._t_dict.get(s, s) for s in text))
+
+    # Tag name (namespace removed) -> handler, used by Tag2Method.call_method.
+    tag2meth = {
+        "acc": do_acc,
+        "r": do_r,
+        "bar": do_bar,
+        "sub": do_sub,
+        "sup": do_sup,
+        "f": do_f,
+        "func": do_func,
+        "fName": do_fname,
+        "groupChr": do_groupchr,
+        "d": do_d,
+        "rad": do_rad,
+        "eqArr": do_eqarr,
+        "limLow": do_limlow,
+        "limUpp": do_limupp,
+        "lim": do_lim,
+        "m": do_m,
+        "mr": do_mr,
+        "nary": do_nary,
+    }
diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
new file mode 100644
index 0000000..78552bc
--- /dev/null
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@@ -0,0 +1,156 @@
+import zipfile
+from io import BytesIO
+from typing import BinaryIO
+from xml.etree import ElementTree as ET
+
+from bs4 import BeautifulSoup, Tag
+
+from .math.omml import OMML_NS, oMath2Latex
+
+MATH_ROOT_TEMPLATE = "".join(
+ (
+ "',
+ "{0}",
+ )
+)
+
+
+def _convert_omath_to_latex(tag: Tag) -> str:
+    """
+    Convert an OMML (Office Math Markup Language) tag to LaTeX.
+
+    Args:
+        tag (Tag): A BeautifulSoup Tag object representing the OMML element.
+
+    Returns:
+        str: The LaTeX representation of the OMML element.
+    """
+    # Serialize the tag and substitute it into the root template so the
+    # result is a complete XML document string.
+    document_xml = MATH_ROOT_TEMPLATE.format(str(tag))
+    # Parse it and locate the 'oMath' element under the root.
+    omath = ET.fromstring(document_xml).find(OMML_NS + "oMath")
+    # oMath2Latex exposes the converted text through its `latex` attribute.
+    return oMath2Latex(omath).latex
+
+
+def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
+    """
+    Build the element that replaces an OMML (Office Math Markup Language) "oMath" element.
+
+    Args:
+        tag (Tag): A BeautifulSoup Tag object representing the "oMath" element.
+        block (bool, optional): If True, the LaTeX is wrapped in "$$" (block
+            mode) instead of "$" (inline mode). Defaults to False.
+
+    Returns:
+        Tag: A "w:r" Tag containing a "w:t" Tag whose string is the
+            delimited LaTeX.
+    """
+    delimiter = "$$" if block else "$"
+    latex = _convert_omath_to_latex(tag)
+    text_tag = Tag(name="w:t")
+    text_tag.string = f"{delimiter}{latex}{delimiter}"
+    run_tag = Tag(name="w:r")
+    run_tag.append(text_tag)
+    return run_tag
+
+
+def _replace_equations(tag: Tag):
+    """
+    Replace an OMML (Office Math Markup Language) element with its LaTeX equivalent.
+
+    Args:
+        tag (Tag): A BeautifulSoup Tag object representing the OMML element. Could be either "oMathPara" or "oMath".
+
+    Raises:
+        ValueError: If the tag is not supported.
+    """
+    if tag.name == "oMath":
+        # A lone 'oMath' is swapped for its inline-equation replacement.
+        tag.replace_with(_get_omath_tag_replacement(tag, block=False))
+        return
+    if tag.name != "oMathPara":
+        raise ValueError(f"Not supported tag: {tag.name}")
+    # An 'oMathPara' is swapped for a new paragraph tag that holds one block
+    # equation for each 'oMath' tag found inside it.
+    paragraph = Tag(name="w:p")
+    for omath in tag.find_all("oMath"):
+        replacement = _get_omath_tag_replacement(omath, block=True)
+        paragraph.append(replacement)
+    tag.replace_with(paragraph)
+
+
+def _pre_process_math(content: bytes) -> bytes:
+    """
+    Pre-process the math content of a DOCX XML file by converting OMML (Office Math Markup Language) elements to LaTeX.
+    The pre-processed content can directly replace the original XML inside the DOCX file.
+
+    Args:
+        content (bytes): The XML content of the DOCX file as bytes.
+
+    Returns:
+        bytes: The processed content with OMML elements replaced by their LaTeX equivalents, encoded as bytes.
+    """
+    document = BeautifulSoup(content.decode(), features="xml")
+    # 'oMathPara' goes first: it is replaced together with the 'oMath' tags inside it, before the 'oMath' pass runs.
+    for tag_name in ("oMathPara", "oMath"):
+        for element in document.find_all(tag_name):
+            _replace_equations(element)
+    return str(document).encode()
+
+
+def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
+    """
+    Pre-processes a DOCX file with provided steps.
+
+    The process works by unzipping the DOCX file in memory, transforming specific XML files
+    (such as converting OMML elements to LaTeX), and then zipping everything back into a
+    DOCX file without writing to disk. A file whose pre-processing fails is copied unchanged.
+
+    Args:
+        input_docx (BinaryIO): A binary input stream representing the DOCX file.
+
+    Returns:
+        BinaryIO: A binary output stream representing the processed DOCX file.
+    """
+    output_docx = BytesIO()
+    # The files that need to be pre-processed from .docx
+    pre_process_enable_files = [
+        "word/document.xml",
+        "word/footnotes.xml",
+        "word/endnotes.xml",
+    ]
+    with zipfile.ZipFile(input_docx, mode="r") as zip_input:
+        files = {name: zip_input.read(name) for name in zip_input.namelist()}
+        archive_comment = zip_input.comment
+    with zipfile.ZipFile(output_docx, mode="w") as zip_output:
+        zip_output.comment = archive_comment
+        for name, content in files.items():
+            if name in pre_process_enable_files:
+                try:
+                    # In the future, if there are more pre-processing steps, they can be added here
+                    content = _pre_process_math(content)
+                except Exception:
+                    # Best effort: on any processing error keep the original content.
+                    # (Not a bare `except:`, so KeyboardInterrupt/SystemExit propagate.)
+                    pass
+            # Written outside the `try` so a failed write is never retried as a second entry.
+            zip_output.writestr(name, content)
+    output_docx.seek(0)
+    return output_docx
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index a9c469f..b320695 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -3,6 +3,7 @@ import sys
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
+from ..converter_utils.docx.pre_process import pre_process_docx
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
@@ -72,6 +73,8 @@ class DocxConverter(HtmlConverter):
)
style_map = kwargs.get("style_map", None)
+ pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
- mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
+ mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
+ **kwargs,
)
diff --git a/packages/markitdown/tests/test_files/equations.docx b/packages/markitdown/tests/test_files/equations.docx
new file mode 100644
index 0000000000000000000000000000000000000000..6a05cd77f62aeb61624be36c05e12773177a8071
GIT binary patch
literal 15235
zcmeHuWpEuyvhEQxGfS3aF|#a|EM|+D$zo<^W(JE{7BgDR%*@QpPcwJ-&8&BCyomSr
z-agS4r@QLQrp&CYsxM_FKtNFe-~dPf06+v_jImNS0s;W?!2kdh03@)wpp~V)p{2dH
zf{V4GohGfbg*jmkC@}dC0Psiq|K0uvzk%BLVY6;}#IN@OZ+?^Y3*yb>`8|U+rMJj1Xq
zp0&<>B+ii=;|rJk;2`K^LjuGWk5fI_NRVKwPnt~2YQ#Sa(5rAJZ#L+u#(
z!cRGr5nSvutY15qffzCeU|xS)#8prG7Bj;U$wOG4@DNpX@0P2y&us0Td~>Kg|2g({
z@tVIa7PSau(Y?hWi1$OY-rqq0vj1Y9_%YZG=N~oFAJzx+VV>HyhURv3w11TUYnK0m
zZSil9UKZQ+VV1Cb=YFq#lWlTKU6{Gjbovv^n2QjQ>SB^8%L``nZ?9Yn3&5I(x+23<
z({U57cIm0rek6PX8lgYgn-t32Io2JE!ecZJEPYjVlj##zc5v_
z&~a?=n1|rx(N2_IN};>mLKxFe6yr1cBsIC2Hj4B2$(~F`^$1Yh;WtqqwEUHm9c}O^?wlgT}sSojh{Q?I`RG3803`
zh6mavNR*PJ9Tq`DI~sn5PkbkS-w$qbkNm
z#w|L`^O`igh5>6j@)9s;%gf71*y1jV;qGsOeZ90f+uYsVB`v>pnkSF0Ak5~0f3B`D
zsRk8?ArFW{Vy=2VeJcYCmgQP$j}-3=#CNZHP>7KVd+>
zRY>5(7xJbYaDpsw?m~qiMx#S-2bpF0Tw!L?jK2TL7f+NTi4A-c?T0P&z6P719hA!T
z81RHV9s(lxPZ(GaD@66vk_^sh8vr6fcJ_SsNL#jAiH;&jH9YWEnVIOFuBbZrJSXg6
z5pf-YAmg!UMt8K?p8pYeBr)dI5ZGr(`Zw`JO5mMqO3v+0B%QYv1b4@7hB_s1ke7>V
zpm67(xIigqYmgRbx)|UEY1td?t)F!1PQ4QEF=pJv-fv>=z4@7{G&Gb0=Ba;BA4{Yj
z&>r%L;YpOSBy@ToBL&S|=dOJDAvo!rpX8%)D&L6)UF~=*YSlPnX=f2*4+8@?WH4qI1@VMG*q1a{(fchYv?d}nZ>Avjg8E$Y_D;egk?l&^%0XX41Ta`+d7
z){(0?m*LI*kIdL#gA)CCBgS
z7^A`DwLz(?b63BhG`tl%k4BAgzA}ViAt7J`1w|#Z0mFcEqn%>05xLbVJlqvD85
zmQ~H2Nk+(fD>i|8l>xrflVXaUdwnCR$v+5P+@W0Gnanau-qOw=mBUQ6s0(s6MJ=z$
z$60M9ZyM_GFd6*G%q&q;R@^>5VcC-BMa1^24rfWE4WUAtm*5q%g$mM>qT$79>`<=m
z1@xV`4QuZ5O`bdec)asUOV!?JaY(G!c>V36RJ;&rM%+tVDpP@%{Ql5+4kOsyEpG3y2sH#?
zMf1KRx-+-Sa72tvi!QTS-6-vb%Ay$asy=Sit`czhK{4g#O{i)7L>ZV#Asy)H-l
z^5eRV1-P07EU=*3!AcE?p!aSQESb#IthkIHh}d;u=icdgY-AxGIgnv>WCk=(@{1v}aR`u6*(P>hrY1o0EjEfcM8;-Zcm3p`ryo>dwPte*H
zb@L_%!eIBJsI1pS-Zh4So<$<s6+V|bNjTWTKqMRcjV?ey&6VP2PLKkQd16B1JvqRi^_7^?)O>tk&i=Y8JNpEC
zj`;9C)6@0*@KDw6cF8Z=8>NT6KyH{q(+1BwhaqEQz>32u$WNdfNTbsP7%D<`Se5Wh
zWBK(auGCrtlw62!NW>kYKAkVvxlmoAbMh9uq?0r!Pf~2gem_Y-Z)QUyBot{bZ299o(N@A|aAewcgrFR3wSB%Qe(s6IJJ>>_
zBb(>D2R5KzziG-}0$sgiZY8jx$B1t*_p4Kl7G>H)_}f+en^F0;B5W%-X;i`~3w-LU
z)@SwA;l%O3t3QZ$Xx=}8Yxx2D<}R`}D_Jp{N!(yF<{OMdTg{nS
zwl(+{7U_L%Mt&6C=~aV_qQDXLcBDmR2i8N%rNEWY_<|nqhrh?*0|U`RpQGW}qE$=+
z@^j1eII5m~g`b`{!#?6sDQOGjA$e1!HXTR4G|s2h{c`K%^17zKBZJ$;_U-|rGU}C2
zW#>+3$9Pxk{o!_f-3#dL={9IyrlV@x1^xYKpHBPrt-qrf1H9ZStIhT8WYO+e
zOzY?dISj!TqK!37hVK{Vc$ivAZ{d?ul0cvy)ey%WJ4}93eg6BeIzOyv2z}5sAW|G#
z`wf~afqsrh!Tk~eN`o;6T7wk;apt2qlr_?vmSRz!gT5YUkQv)kUBQEtF}HJKYhB6e
zXFRKtGGyZe9~Jzh3GabQ1T=%0L1ci`5>=%;55i>>KV1yCXE@+9XBk9{3O5lCxU(jI
zt*%6he`7VbvqrEduPQ#zMnR#t2Pa`o2*xyOh2IAbRO+KAQQhG|^@-S-^o@Y?vm@d@
zB^KcK@?~!rTrq<2JCh>6PMkf3FhEKb{d_SJxDpg^6vNGV3{D?jo^FF`P?dF=0xJgz
zUtAgcV-H@=;t&FJxA_@gIIv}Wec8znyBZ=GJB4XEk#A-t3WhOYprCbBfVj6Ft6o^d
z_zWkzEeg@6(ApQV@i_%fM!$QH;tHE{)cpvr(S>$V@?9T;r&qzDJW90*Hx!f|;FLD{b;-
z2iu~){N4ke!G!woK+juWJ(hr;WHk5)qF$wyM1y>fwHIYheH?n4`fkBqfz+z(3c0HQ
zZpJsDSGI(xuUmR%_+KtON-|pgt5vsmHd(3CXbn+bE-_fS*`wfNxFo12hMV`QD<0S9
zSzMkUt5ciGjA%=-S_UABjHZBvwL!{h-D(r*@Ar&Dp*WCXeD9iQJsMY@<9tqL;Am>w
zO+rB)8`H-+A7PJ-JBuE0!#2phtI`Ydj4vML>&E#do2aww8x$L`39m1_r4#R+OU9<0
z(&Pc5wd>^-&xXaHW7pKU+aNVd=V`RH>nkA17aNFTQbtJR>Z6N(a*h*aD?1WVr=p|7
z|#3ys5c<#g_(@e7`g!)g|ec3cL||q=?aWx&OtTVJvC4m}%%D)qAVQWf48#pC*JubRNd(ekgepE*MBE{j@n#<(iiTKac%O(Y(ulG0tlBa3+CaEj$g@jztBVRbBI$%ejP>b8WN
z)hxem!^t*(B??xyEYWJzJfYPguenGP>QIgT;Ow%w40(b<=k^^DYtYzx9
zWYu@M=nf{qwwH#vvpiAa)ok$8p;csx{1c?atihSjOO*`zu+x3B*YT@IBW2ys
za8s4MW_VgN-PgDFo!yQO2o<$G-BBFo;Zf=G3F-!j0P&yGP0geUppqLXQ(pgD`u7>Jo8^oOusi>&E5~Srp}-J=UJqh
zxb>?apa8%L+}}pYc82!$rk2Kbe{_De%Ij9ktcacjmG4~_*SN`vp@+$u1K%~L=d{&OogfHsw_hgYb!B)ui6k1;qtXE2d%1A5RAvc
zEtXAFKk*A#2`Gr3BDm=IjO`7u$-#TH2g?SNqRthEkt43F3r65`F#h<>!eRCTntn{`
zaTiffEE)SPB2B&3gMD8P;(_BZa<28hY=WhiaEU$2b!}neag#EO7y&mD8MH-u^c%*$
zjqoZ8`k|U2J85^-sb46YF%HJ$+JUp*X5s8boSyNVP6d=6N$Dq7}@*
z4utE00^x3D@WBeFC19rEViXk|;peOOQpvgsRgQRV;VrG3?--2s>F3$q@X*-;HHh&?K^xt!C
zt2^X5!*y8=Dz}nfN7X5SVvd@;20(hE0%+^@Wn_u(`=Ng_$g|`fMx>|2seF^S2u9Rc
zN#vO6OXDE^>I243DEl3UoZ!<#*Ulq|FvrTY)lNg=1A%NH`t5l(H9?{V#TXB6DfiCP
zy!6z}^Fyoq=6B*G$IBEu2YH8B6Xd|@&+-wD!TT*SH3x#gWRP>GXPLy=Oes={ZK69;
zuMPyrqrmfNx1YYV!CF2=7av1(l^~?hkm<(X5QP;p-MTq7i@MXDMCfq9z4V-aHIlQQ
z##w$hW=?rL%VDcmM5cXyU7>!pW;B(l5z9O5NMn^ofmm_8t8pBA6P4t9wO-b-ZT$j;{%jQrl
zsyEAbUeOO9v?TR&1EnQmZDO8+lfPrgw<)%ER-x1^c4Y_1y^9`;Zn3R6tE*|5k
zvAqq)Q?*4}Txhu^bt}K(?2U1=-M*l>qucILqWnc-w@9svn3GnR3-~qmNb3*DbJ*JD
zuVmL{Cz9h0j2&@cO6d006GOkG(N$IR4Be6kfGheTFwq=~cfktd39R3;yS0N;Nd~w}
zE=roXvs;~sO@4mka1OsjSs7&~gMLd>ATd4=)1KAJ8&$UmjcipKw`VCxDBXvBk|@TD
z+&1nj=;;Xix_*S^W4nT{x}-U$EVU6YidlxXavTa1_Nq(I7HpTfdPEoVBSyrs8dB+$
z%SP~;`^U&hfZcm|7-=lK6~lLI)`E51euRS36kc0oGHjROP8^yW8Rirkynd+ye)`u2
z3;AcZ)2ILJ&aOJzuvJECCYOi3;~00T8t+qy&rm)G>8M3Qpq|0OO;C*?SZul0jY%T%{iQPFce4x-n
z#gtGTj7Yg1Pp`37Lyg|L_r)eD-@C>pK5sPw7;i62)DI)=S3qLE5N%OjIZ
z3Y^8K3dD_Cs3K~6hIKcKNI)aU4$i^EhnTG&C3iMk1fcjSOlXA;*@_+W;*v)IiZMdW
zp+BXEKZqCi%O!8pMsFKr3?{aFD-$M=v^5k&Po-6{7tb4px<*r&$pxqY%>;JrssRh%
zM@dA-(w0Tn2U7;p0Yz$)ss`C*=4jpFY3R0S(ke*WZ^G25YXv!`CNF=cSx@{7`xY*W
zO4wG_7!FE+D2j`2y6m7CBu&DvS3xi`i*#e4DDey0U61_p&$)Wj`fLbkn~v2l!Z_o+
zWJ`1Ys|vx=`k^rA7G^S-=^JYoWizm`-(}2sus6a1{F0wNshNq*vB=L#)orRn4D&fg
zE6Ani$mTK^KzKQs03@@Dko~isMf}AGDu$opZaCA|i1X~LdMIdHa!iAtLn8Q4+JLG!
zbmIFpdA~{8YEHetJL~#!Q^RXh&1i%D49!gRh`5_-IEicK*6cEa@m%m9`pI$a6bf^&
zLmjCF&w=4A(nU?o@v1?V#x`;{x|YwTVFT_n$XtBb7-Y699YN@7TQ
z2Yd7W?xYOw+p87t3O8Npb9hn==)my)@I&upPKnZZRMfC@S5lcfr4m;X@7UKWgM=5g
z6?{6?OT#u-1)AzE0T+%u+z9IR^efU`A@C`
z#)N*aXGXed;zt-`!`o^})#E6b+w9w<#cf_)pnB{MBle(fy1Z_Y`AP|$VyKPF?PkQLb#3^>
zDutZ+<;z|e^JWK)^IOx-pwTm(l|A7yT$RfH)wK!dq+EQ$;IS2hc5y3}*J#StO)lfW
z%-XTNowdc=I`+xT)SOYrhNJ_}e5#N3g
z=fsXIL@NFfpW0Fodvu`707$;er)Q)#UM^U^)u3%TV(flmR^~_%2@~8Xg$Z_i(jPWl
zK9PNwJq+0WCaMF$)Lqfp}M030!fGj4CI3t0Mq31jeN<55kK5g6h
z6IIl{X^c_Y|2F1x5A)o`5??ng9USw>#n2yrU~$KaGz=z#xeO62A$i0OIm>7kW>PxZD76~
zxph^l3soY%MLuO2o-^Y6qG3^njjxLX)G8JhD+zvD35xV02WO0yftq49J+uk8tOJ)c
zYpL|fA~S%*B)@q>JT7p3?WZ%*a5bG@%vvxtiW671U~2`muHI$B;_u^u^uNU~Ims5Sy>F+x4c0J+V^Bh(csGP%Ek;X`(^0
z|<}Q|t$M^sYzl$;Eb(RQxF&WeR>W{b(Tpf0CdS)6CeLZ0z&(7thZsF%v&L
zC>xsy^0$P7=gpfK{osKu23wuUt`WLSM7Ljq(27$ra*$`?ppuy^R4R-+uAQArp21-xZ(U_pBWA
z*}%Yrp7Jv@CK?=ecqj;*uz2Ui7kk{w&4xpEo$m8l1|1xt*x%0rMC9w0mAmj9bHLX{T3;-=L_-0@(Q*q;T#F`;^CcdpSCWB^
z&+bDiBN(3~oHsehI?BtEevay6wT#Y>9|Q5Np<&g9||TCMoFV4YN>}bfe;S%q(02{GUP~2eibWH&Yt^*QBhwVz8=*!9YK)ew00?YBhBX-td%nGJyKlEGk&dU5MlC*
z*dx)zBJ>?(qdC~A%_8(tkO5UNa%?$xIPeQ%TM&pRWUVX_s9F(-FtjIRysVE)v@8*{
zS`i{JjRK?(Iwh|6ra$zljX=O1kw5_XAG=Amz&{}WXk)gqu=X9FL?PH0ozkBV22J2c
zKuLcX$o~uW(H06V83GC|=SKiE(S$$%qDgkD2wZo-|KoE133n70@c(f=%}Y}{xhKmS
z-rjLn*YkyQufrkMBjtNJOANCaf)dLFg3|XI$Rq*x5H$iW+(Xw~0#VLk`Ru|X?xa~w
z4G#&>qhI^u%BwODjRzpw%?4Qk6Z4Wl4~;)blfPj(1+;#L3T?Gf&TC23nc+AaYOf$Y
z0PoinQ$DV4r}*k9(%DEg%HCIqe_dgInk_ekUbT4F#kBQl%*DRxifU^tT?n=Zl=6g5i=ldH;;oVnYxh`uCy
zq*5UWW;V(@ANv!i9GuRt;y~Fh1;LLRUA-Q8IHZqSGGkr=#96@SbTddaNfv+TTx>@}
zl@cQ0M6+P;d{>>IlK`kfYyq%uaM{0#0{y;F&msazun2X2fPIU=={G-w>68#DV9x#!
z7ykB#Kta%tCPcwz02JvDX$9B<4szwja`i>R3@6Fk;JT4S6@reiHiL@{utkZu$DL+_
z{E%gdgvu=Yjb?+Sqfrw{+`IzX*sqY}JZ<#Odatr3fv>W<3fm?5tR>w6Mg4b;(_bO^|J&*h>{1c_0+E~Uui`#MiHwhiq`V&@
ziX+T4>Sl!hDaYlXm499&HtSz1>*W6J4i5d8#h|o+9NG-NqD{8ZozYNXpy9mP{u+1b
zl8Eux6%%qRIhQkbac`)5N%%D6bB?U0P@h2!Lo4o<
z3!~R!Xk>GRWvWadon>slplEY-3WUWh^i+*Tv)z=-IOvuv*9v@ZH%n60HsQCrbTf7l!clgoH*
zqiRwTh0B=R*Er5=#8Z87-=}++7O|17^$7JLBVfLW)FH(Jr2G309G#msjFfXkd)!5d
z<{8va4}SY#S&kaZg`?e@dY^2@yTH)4n2^s3eNkKrv@4>;(>&p*Gz3qgVx#ZM1rCDS
zMmj?2Zp6RVeh8x=?XQannIv$vuQ1SqkEk_RFz5_Bq+%v>U18Frg+~BcL7J%IGE=1U
zetUNW5ul-~g9=JhpizI3zh@B`viu1E1-WlfqTrv
zmtce8k?Y3=Y?La)!p{T<>Oq;Eg4KF^OYUXqC{=oe5FG??G1oeKE$&>B!OoT6?7g5I
zCdLg2DH!6UdBh8Z;ul=1MJOY9GeDWHw~04>gF%mfb@h&uL8p{JA;*sPN6a~vn@N_`|0p84&NUN_@20!
z02RKE-_?ef#x!O8n&%ki_P%IDt!c&4@LK+*nRWRfX5Z7pplvB5$%j
zmJl&n*CIUD?m9zD38v^W3x<8>`4f496=$pFu2JjTOY*Z78Uz?7HTvcp
zJaX?(mBO0fRvNS7{jv${=})Q)R55<872F21igBY)#4YVpGFIlEYZ%=VXvAN<0-*;K
zv^VAF3ujtDEyx`sc3lG{?56I(+(y;k9c6G@(GfAoX^?!5$nQn^fvl#h+f8IFmP`9i
zV4`>^#<(?<3vdEvkV=Pz-dX0q-g@STleD}&5FDX@x#jOm8X{BkSUUBY-Wo-lf#8Xa
z_2I7Z7*{N^`do$^p($zq{RT|)TI!sU`SHDY`Uhbor=XLGQzdCEH`4Qg{dx*9+d1Ja
zhjmT=1huO6k)YvV3P;JU5=ET2W3Li(JBZ)F)7vm!)LhmO!8^o%nrisG_9SZc!40AL
zSOr1-=Tw7}q29N@ErO&^uUh^hLZFbIUJ
zN_%#`=b+U61k#G&)l5R(XPCO5chf#Z?$8Q4x`IeWuUpMmnu4I&lAXGET7hjngLZ98
zlW+sUDLsv4iscsO`;~};i+J6XzJ0I|T>){jQC{gRZ??gNMNLk1XG23JN=Zb9$#iVQ
z#1_nSkcN}`g`T>gyeM1XiQM8-`J1;XuZ}y|2>Z!1R#Z8^pwWCxP8QRbFY@;3?`YFN
zJrx)Xp6s(oX~}C<=@&UC{TXAyeHweJgBmUr=p|X~hr#_Y1<_V5pKq3H53#rooC8yh
zf8zKhmMcd8Dwq7#D~|oa&U@@BfYMzL@3%1Z6X+#8S^CtfUDi5t1)*e;MLh>k9nmdG
zQMSIWB@m>=b=6Dv@%sOrqp=RB%_93ja^{2j+nlb^M=pYbuAaH!p9GEg<;W#^#K3c^
zTfCeVItDQv)!ND<{36#$Ern$W`bfdDh|>XmX{+@rJVL5w8r3={unWD%WWN|SQ5tX?AfK@UIqV2}sWC`+SYOw97
z)N)egxqY(0dC?hY%E?Zj`ZoXi*#eVrBV2%6qQiz52XOck&=gZ=@}(wkU!EaJ^5SlA4uP@zPH;}Qi#7uuU)5QFXpJD=+1At-YI1YA
z+ituyxh>n`e6)Zgma4L6q-%HvSe5cciKM~j6jn~)`>8g;!2)e1ah&F|82@irn>t_d
zPdqPQ8^aJD$BxGmS;@a+{P=$DXqS7cAQ$pOCSrDDH;@v}J(vt9bTn7&-ak5>z2z5M
z6v{neX%WyF;wC@xPOP}xIRd3Q7$3Wdzu_$Zx-78<%qCYe?|F>~>XY^RXn+U7ib2a(fKw=84xbX0a1f)Jy
zI+5`GJDZ(BtgG6K3{Vk#*{00!vvJ&}#~NZEyC&i+jyHbjr$Cf0pE#67UWZ>8&!vm(
zk^)(agdWD-=b|^{KZ+0E-Ya?#6lkle52)&9t@?mS8jmYLs&gFyILS3vrpntkxgWNodi?dbHaZ2wfh54!#T^2Z$|pjMd9p$4{CNIYGj7*d
z)tNx2tZc^DKhE3vhcjv35c-ad$95Ld_@{5aXbHYxrk$~duI};Hc-KRnZNA>hZ_{4y2~Dw
zQ*iohphA=LXXkH{J
zfZ8X=%z6gq(ch=d{`%n`|6OxsQ9jApe`xN|M=j#tHCNZ#`VYVPzl!^zwI5r${IK)~
zF|w8T5<%{)eEQRmlKKi+sve)DnTbpqw>TB{dP5wc_e{8KqF@}J(-Bt@rtjTPxh#uc
z0*f*zj7{>tEzOm_Y=EkaiGLdi-8PGZTd(qYLITPGeJqi=tb*}@>j}Eec-1*M@u?C?8q4!9MfJ&
z5|MGm6*q4~f_l_vGc6d(M_=d^IEr&Rv#W|!@F&nMz1z;Gek9flLn1;`wBQ)_umR6Dwg
z@QBd;`qYxIGVa$#;MBEva&V_7djD)Ruk?1)~`@@t)%UAcPm(L>3}%GA^gJ2YQ6Oww4ddr9%Mq@vrmvpQ~(-(MI%kbS)G
z|I$<-V49DB>py23{`pz{R{V$DLs^M`74Wb5P=CV$vOXO5-?F2A2mYQY@)xw@gBkj7
zX(PYG|CNdS7Z?D@fc*#jzu_hSF6nnN>R+;U;Qt$H>hB_cr}O+JV)ujI`scI!o!s*~
z{P*Sazu>jl|A7C?V*2m+-6D0{;%%#@9O+
M`p{5w_CJpP58I?#egFUf
literal 0
HcmV?d00001
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 33e2c44..1819183 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3 -m pytest
import io
import os
+import re
import shutil
import openai
import pytest
@@ -262,6 +263,19 @@ def test_docx_comments() -> None:
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
+def test_docx_equations() -> None:
+    """Check that equations.docx yields $...$ (inline) and $$...$$ (block) equations."""
+    source_path = os.path.join(TEST_FILES_DIR, "equations.docx")
+    text = MarkItDown().convert(source_path).text_content
+
+    # The inline equation m=1 must appear wrapped in single dollar signs.
+    assert "$m=1$" in text, "Inline equation $m=1$ not found"
+
+    # At least one non-empty span delimited by $$ ... $$ must be present.
+    has_block_equation = re.search(r"\$\$(.+?)\$\$", text) is not None
+    assert has_block_equation, "No block equations found in the document."
+
+
def test_input_as_strings() -> None:
markitdown = MarkItDown()