From 3fcd48cdfc651cbf508071c8d2fb7d82aeb075de Mon Sep 17 00:00:00 2001 From: Sathindu <11785398+sathinduga@users.noreply.github.com> Date: Fri, 28 Mar 2025 18:36:38 -0400 Subject: [PATCH] feat: render math equations in .docx documents (#1160) * feat: math equation rendering in .docx files * fix: import fix on .docx pre processing * test: add test cases for docx equation rendering * docs: add ThirdPartyNotices.md * refactor: reformatted with black --- packages/markitdown/ThirdPartyNotices.md | 232 ++++++++++ packages/markitdown/pyproject.toml | 3 +- .../markitdown/converter_utils/__init__.py | 0 .../converter_utils/docx/__init__.py | 0 .../converter_utils/docx/math/__init__.py | 0 .../converter_utils/docx/math/latex_dict.py | 273 ++++++++++++ .../converter_utils/docx/math/omml.py | 400 ++++++++++++++++++ .../converter_utils/docx/pre_process.py | 156 +++++++ .../markitdown/converters/_docx_converter.py | 5 +- .../tests/test_files/equations.docx | Bin 0 -> 15235 bytes packages/markitdown/tests/test_module_misc.py | 14 + 11 files changed, 1081 insertions(+), 2 deletions(-) create mode 100644 packages/markitdown/ThirdPartyNotices.md create mode 100644 packages/markitdown/src/markitdown/converter_utils/__init__.py create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/__init__.py create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py create mode 100644 packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py create mode 100644 packages/markitdown/tests/test_files/equations.docx diff --git a/packages/markitdown/ThirdPartyNotices.md b/packages/markitdown/ThirdPartyNotices.md new file mode 100644 index 0000000..44edd8f --- /dev/null +++ b/packages/markitdown/ThirdPartyNotices.md @@ -0,0 +1,232 @@ +# THIRD-PARTY SOFTWARE 
NOTICES AND INFORMATION + +**Do Not Translate or Localize** + +This project incorporates components from the projects listed below. The original copyright notices and the licenses +under which MarkItDown received such components are set forth below. MarkItDown reserves all rights not expressly +granted herein, whether by implication, estoppel or otherwise. + +1.dwml (https://github.com/xiilei/dwml) + +dwml NOTICES AND INFORMATION BEGIN HERE + +----------------------------------------- + +NOTE 1: What follows is a verbatim copy of dwml's LICENSE file, as it appeared on March 28th, 2025 - including +placeholders for the copyright owner and year. + +NOTE 2: The Apache License, Version 2.0, requires that modifications to the dwml source code be documented. +The following section summarizes these changes. The full details are available in the MarkItDown source code +repository under PR #1160 (https://github.com/microsoft/markitdown/pull/1160) + +This project incorporates `dwml/latex_dict.py` and `dwml/omml.py` files without any additional logic modifications (which +lives in `packages/markitdown/src/markitdown/converter_utils/docx/math` location). However, we have reformatted the code +according to `black` code formatter. From `tests/docx.py` file, we have used `DOCXML_ROOT` XML namespaces and the rest of +the file is not used. + +----------------------------------------- + +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +----------------------------------------- +END OF dwml NOTICES AND INFORMATION \ No newline at end of file diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 9136108..79f67d2 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -38,6 +38,7 @@ all = [ "pandas", "openpyxl", "xlrd", + "lxml", "pdfminer.six", "olefile", "pydub", @@ -47,7 +48,7 @@ all = [ "azure-identity" ] pptx = ["python-pptx"] -docx = ["mammoth"] +docx = ["mammoth", "lxml"] xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] pdf = ["pdfminer.six"] diff --git a/packages/markitdown/src/markitdown/converter_utils/__init__.py b/packages/markitdown/src/markitdown/converter_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py b/packages/markitdown/src/markitdown/converter_utils/docx/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py new file mode 100644 index 0000000..9b47382 --- /dev/null 
# -*- coding: utf-8 -*-
# File: packages/markitdown/src/markitdown/converter_utils/docx/math/latex_dict.py

"""
Lookup tables used to translate Office Math (OMML) into LaTeX.

Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
On 25/03/2025

Values containing ``{0}``, ``{num}``, ``{text}`` ... are ``str.format``
templates: a literal brace is written ``{{`` / ``}}`` and the placeholder
itself is a single-brace field, so ``"\\hat{{{0}}}".format("x")`` yields
``\\hat{x}``.
"""

from __future__ import unicode_literals

# Characters that are prefixed with a backslash when emitted as LaTeX text.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

BLANK = ""
BACKSLASH = "\\"
ALN = "&"

# Accent / grouping characters -> one-argument LaTeX template (``{0}``).
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    "\u0300": "\\grave{{{0}}}",
    "\u0301": "\\acute{{{0}}}",
    "\u0302": "\\hat{{{0}}}",
    "\u0303": "\\tilde{{{0}}}",
    "\u0304": "\\bar{{{0}}}",
    "\u0305": "\\overbar{{{0}}}",
    "\u0306": "\\breve{{{0}}}",
    "\u0307": "\\dot{{{0}}}",
    "\u0308": "\\ddot{{{0}}}",
    "\u0309": "\\ovhook{{{0}}}",
    # The two entries below used to end in a stray "}" which made
    # str.format() raise "Single '}' encountered in format string".
    "\u030a": "\\ocirc{{{0}}}",
    "\u030c": "\\check{{{0}}}",
    "\u0310": "\\candra{{{0}}}",
    "\u0312": "\\oturnedcomma{{{0}}}",
    "\u0315": "\\ocommatopright{{{0}}}",
    "\u031a": "\\droang{{{0}}}",
    "\u0338": "\\not{{{0}}}",
    "\u20d0": "\\leftharpoonaccent{{{0}}}",
    "\u20d1": "\\rightharpoonaccent{{{0}}}",
    "\u20d2": "\\vertoverlay{{{0}}}",
    "\u20d6": "\\overleftarrow{{{0}}}",
    "\u20d7": "\\vec{{{0}}}",
    "\u20db": "\\dddot{{{0}}}",
    "\u20dc": "\\ddddot{{{0}}}",
    "\u20e1": "\\overleftrightarrow{{{0}}}",
    "\u20e7": "\\annuity{{{0}}}",
    "\u20e9": "\\widebridgeabove{{{0}}}",
    "\u20f0": "\\asteraccent{{{0}}}",
    # Bottom accents
    "\u0330": "\\wideutilde{{{0}}}",
    "\u0331": "\\underbar{{{0}}}",
    "\u20e8": "\\threeunderdot{{{0}}}",
    "\u20ec": "\\underrightharpoondown{{{0}}}",
    "\u20ed": "\\underleftharpoondown{{{0}}}",
    "\u20ee": "\\underleftarrow{{{0}}}",  # was misspelt "underledtarrow"
    "\u20ef": "\\underrightarrow{{{0}}}",
    # Over | group
    "\u23b4": "\\overbracket{{{0}}}",
    "\u23dc": "\\overparen{{{0}}}",
    "\u23de": "\\overbrace{{{0}}}",
    # Under| group
    "\u23b5": "\\underbracket{{{0}}}",
    "\u23dd": "\\underparen{{{0}}}",
    "\u23df": "\\underbrace{{{0}}}",
}

# Big (n-ary) operator characters -> LaTeX command.
CHR_BO = {
    # Big operators,
    "\u2140": "\\Bbbsum",
    "\u220f": "\\prod",
    "\u2210": "\\coprod",
    "\u2211": "\\sum",
    "\u222b": "\\int",
    "\u22c0": "\\bigwedge",
    "\u22c1": "\\bigvee",
    "\u22c2": "\\bigcap",
    "\u22c3": "\\bigcup",
    "\u2a00": "\\bigodot",
    "\u2a01": "\\bigoplus",
    "\u2a02": "\\bigotimes",
}

# Single text characters -> LaTeX replacement. Commands carry a trailing
# space so that they do not fuse with the character that follows them.
T = {
    # Greek letters (Mathematical Italic Small block, U+1D6FC..)
    "\U0001d6fc": "\\alpha ",
    "\U0001d6fd": "\\beta ",
    "\U0001d6fe": "\\gamma ",
    "\U0001d6ff": "\\delta ",  # was "\\theta " (U+1D6FF is italic delta)
    "\U0001d700": "\\epsilon ",
    "\U0001d701": "\\zeta ",
    "\U0001d702": "\\eta ",
    "\U0001d703": "\\theta ",
    "\U0001d704": "\\iota ",
    "\U0001d705": "\\kappa ",
    "\U0001d706": "\\lambda ",
    "\U0001d707": "\\mu ",  # was the non-existent "\\m "
    "\U0001d708": "\\nu ",  # was the non-existent "\\n "
    "\U0001d709": "\\xi ",
    "\U0001d70a": "\\omicron ",
    "\U0001d70b": "\\pi ",
    "\U0001d70c": "\\rho ",
    "\U0001d70d": "\\varsigma ",
    "\U0001d70e": "\\sigma ",
    "\U0001d70f": "\\tau ",  # was the non-existent "\\ta "
    "\U0001d710": "\\upsilon ",
    "\U0001d711": "\\phi ",
    "\U0001d712": "\\chi ",
    "\U0001d713": "\\psi ",
    "\U0001d714": "\\omega ",
    "\U0001d715": "\\partial ",
    "\U0001d716": "\\varepsilon ",
    "\U0001d717": "\\vartheta ",
    "\U0001d718": "\\varkappa ",
    "\U0001d719": "\\varphi ",
    "\U0001d71a": "\\varrho ",
    "\U0001d71b": "\\varpi ",
    # Relation symbols
    "\u2190": "\\leftarrow ",
    "\u2191": "\\uparrow ",
    "\u2192": "\\rightarrow ",  # previously also listed a second time above
    "\u2193": "\\downarrow ",  # was the non-existent "\\downright "
    "\u2194": "\\leftrightarrow ",
    "\u2195": "\\updownarrow ",
    "\u2196": "\\nwarrow ",
    "\u2197": "\\nearrow ",
    "\u2198": "\\searrow ",
    "\u2199": "\\swarrow ",
    "\u22ee": "\\vdots ",
    "\u22ef": "\\cdots ",
    "\u22f0": "\\adots ",
    "\u22f1": "\\ddots ",
    "\u2260": "\\ne ",
    "\u2264": "\\leq ",
    "\u2265": "\\geq ",
    "\u2266": "\\leqq ",
    "\u2267": "\\geqq ",
    "\u2268": "\\lneqq ",
    "\u2269": "\\gneqq ",
    "\u226a": "\\ll ",
    "\u226b": "\\gg ",
    "\u2208": "\\in ",
    "\u2209": "\\notin ",
    "\u220b": "\\ni ",
    "\u220c": "\\nni ",
    # Ordinary symbols
    "\u221e": "\\infty ",
    # Binary relations
    "\u00b1": "\\pm ",
    "\u2213": "\\mp ",
    # Italic, Latin, uppercase
    "\U0001d434": "A",
    "\U0001d435": "B",
    "\U0001d436": "C",
    "\U0001d437": "D",
    "\U0001d438": "E",
    "\U0001d439": "F",
    "\U0001d43a": "G",
    "\U0001d43b": "H",
    "\U0001d43c": "I",
    "\U0001d43d": "J",
    "\U0001d43e": "K",
    "\U0001d43f": "L",
    "\U0001d440": "M",
    "\U0001d441": "N",
    "\U0001d442": "O",
    "\U0001d443": "P",
    "\U0001d444": "Q",
    "\U0001d445": "R",
    "\U0001d446": "S",
    "\U0001d447": "T",
    "\U0001d448": "U",
    "\U0001d449": "V",
    "\U0001d44a": "W",
    "\U0001d44b": "X",
    "\U0001d44c": "Y",
    "\U0001d44d": "Z",
    # Italic, Latin, lowercase
    "\U0001d44e": "a",
    "\U0001d44f": "b",
    "\U0001d450": "c",
    "\U0001d451": "d",
    "\U0001d452": "e",
    "\U0001d453": "f",
    "\U0001d454": "g",
    # U+1D455 is unassigned: Unicode encodes italic "h" as U+210E
    # (PLANCK CONSTANT), so map that code point instead.
    "\u210e": "h",
    "\U0001d456": "i",
    "\U0001d457": "j",
    "\U0001d458": "k",
    "\U0001d459": "l",
    "\U0001d45a": "m",
    "\U0001d45b": "n",
    "\U0001d45c": "o",
    "\U0001d45d": "p",
    "\U0001d45e": "q",
    "\U0001d45f": "r",
    "\U0001d460": "s",
    "\U0001d461": "t",
    "\U0001d462": "u",
    "\U0001d463": "v",
    "\U0001d464": "w",
    "\U0001d465": "x",
    "\U0001d466": "y",
    "\U0001d467": "z",
}

# Function name -> LaTeX with a literal "{fe}" marker (FUNC_PLACE) that is
# substituted by plain string replacement, not by str.format.
FUNC = {
    "sin": "\\sin({fe})",
    "cos": "\\cos({fe})",
    "tan": "\\tan({fe})",
    "arcsin": "\\arcsin({fe})",
    "arccos": "\\arccos({fe})",
    "arctan": "\\arctan({fe})",
    "arccot": "\\arccot({fe})",
    "sinh": "\\sinh({fe})",
    "cosh": "\\cosh({fe})",
    "tanh": "\\tanh({fe})",
    "coth": "\\coth({fe})",
    "sec": "\\sec({fe})",
    "csc": "\\csc({fe})",
}

FUNC_PLACE = "{fe}"

# LaTeX line break (two backslashes).
BRK = "\\\\"

CHR_DEFAULT = {
    "ACC_VAL": "\\hat{{{0}}}",
}

POS = {
    "top": "\\overline{{{0}}}",  # not sure
    "bot": "\\underline{{{0}}}",
}

POS_DEFAULT = {
    "BAR_VAL": "\\overline{{{0}}}",
}

SUB = "_{{{0}}}"

SUP = "^{{{0}}}"

# Fraction type -> template with ``num`` / ``den`` fields.
F = {
    "bar": "\\frac{{{num}}}{{{den}}}",
    "skw": r"^{{{num}}}/_{{{den}}}",
    "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
    "lin": "{{{num}}}/{{{den}}}",
}
F_DEFAULT = "\\frac{{{num}}}{{{den}}}"

# Delimiter template with ``left`` / ``text`` / ``right`` fields.
D = "\\left{left}{text}\\right{right}"

# ---- latex_dict.py (continued) ----------------------------------------

# Fallback delimiter characters; "null" is LaTeX's invisible delimiter.
D_DEFAULT = {
    "left": "(",
    "right": ")",
    "null": ".",
}

RAD = "\\sqrt[{deg}]{{{text}}}"

RAD_DEFAULT = "\\sqrt{{{text}}}"

ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}"

LIM_FUNC = {
    "lim": "\\lim_{{{lim}}}",
    "max": "\\max_{{{lim}}}",
    "min": "\\min_{{{lim}}}",
}

# (search, replacement) pair applied to limit text.
LIM_TO = ("\\rightarrow", "\\to")

LIM_UPP = "\\overset{{{lim}}}{{{text}}}"

M = "\\begin{{matrix}}{text}\\end{{matrix}}"


# ---- new file: packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py ----
# -*- coding: utf-8 -*-

"""
Office Math Markup Language (OMML)
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
On 25/03/2025
"""

import xml.etree.ElementTree as ET

from .latex_dict import (
    CHARS,
    CHR,
    CHR_BO,
    CHR_DEFAULT,
    POS,
    POS_DEFAULT,
    SUB,
    SUP,
    F,
    F_DEFAULT,
    T,
    FUNC,
    D,
    D_DEFAULT,
    RAD,
    RAD_DEFAULT,
    ARR,
    LIM_FUNC,
    LIM_TO,
    LIM_UPP,
    M,
    BRK,
    BLANK,
    BACKSLASH,
    ALN,
    FUNC_PLACE,
)

# ElementTree-style ("{uri}") prefix of every OMML element and attribute.
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"


def load(stream):
    """Yield an ``oMath2Latex`` for each ``oMath`` child of the parsed root.

    Args:
        stream: File name or file object accepted by ``ET.parse``.
    """
    tree = ET.parse(stream)
    for omath in tree.findall(OMML_NS + "oMath"):
        yield oMath2Latex(omath)


def load_string(string):
    """Same as :func:`load` but parses the XML from a string."""
    root = ET.fromstring(string)
    for omath in root.findall(OMML_NS + "oMath"):
        yield oMath2Latex(omath)


def escape_latex(strs):
    """Backslash-escape every character of ``CHARS`` found in *strs*.

    A character directly preceded by a backslash is left alone so that
    already-escaped text is not escaped twice. Doubled backslashes are
    first collapsed into a single one.
    """
    last = None
    new_chr = []
    strs = strs.replace(r"\\", "\\")
    for c in strs:
        if (c in CHARS) and (last != BACKSLASH):
            new_chr.append(BACKSLASH + c)
        else:
            new_chr.append(c)
        last = c
    return BLANK.join(new_chr)


def get_val(key, default=None, store=CHR):
    """Translate *key* through *store*.

    Returns *default* when *key* is ``None``; otherwise ``store[key]`` when
    present, or *key* itself when it is unknown or *store* is empty.
    """
    if key is None:
        return default
    return key if not store else store.get(key, key)


class Tag2Method(object):
    """Dispatch an OMML element to the handler registered for its tag.

    Subclasses provide ``tag2meth``, a dict mapping a local tag name to a
    plain function taking ``(self, elm)``.
    """

    def call_method(self, elm, stag=None):
        """Run the handler for *elm*; return ``None`` when none is registered."""
        getmethod = self.tag2meth.get
        if stag is None:
            stag = elm.tag.replace(OMML_NS, "")
        method = getmethod(stag)
        if method:
            # tag2meth stores undecorated functions, hence the explicit self.
            return method(self, elm)
        else:
            return None

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields ``(tag, result, element)`` for every OMML-namespaced child
        that produced a result; children outside *include* (when given)
        are skipped.
        """
        for _e in list(elm):
            if OMML_NS not in _e.tag:
                continue
            stag = _e.tag.replace(OMML_NS, "")
            if include and (stag not in include):
                continue
            t = self.call_method(_e, stag=stag)
            if t is None:
                t = self.process_unknow(_e, stag)
                if t is None:
                    continue
            yield (stag, t, _e)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        When a tag occurs several times the last result wins.
        """
        latex_chars = dict()
        for stag, t, e in self.process_children_list(elm, include):
            latex_chars[stag] = t
        return latex_chars

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string
        """
        return BLANK.join(
            (
                t if not isinstance(t, Tag2Method) else str(t)
                for stag, t, e in self.process_children_list(elm, include)
            )
        )

    def process_unknow(self, elm, stag):
        """Hook for tags without a handler; the base class ignores them."""
        return None


class Pr(Tag2Method):
    """common properties of element

    Collects the ``val`` attribute of selected property children. Each is
    readable as an attribute (``pr.chr``, ``pr.pos`` ...) and is ``None``
    when the child was absent.
    """

    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # Was self.__str__(self): a bound method called with an extra
        # argument, which raises TypeError.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached for names not found normally: unknown -> None.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get("{0}val".format(OMML_NS))
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }


class oMath2Latex(Tag2Method):
    """
    Convert oMath element of omml to latex
    """

    _t_dict = T

    # Container tags whose LaTeX is simply the concatenation of their children.
    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")

    def __init__(self, element):
        self._latex = self.process_children(element)

    def __str__(self):
        return self.latex

    def __unicode__(self):
        # Was self.__str__(self), which raises TypeError.
        return self.__str__()

    def process_unknow(self, elm, stag):
        if stag in self.__direct_tags:
            return self.process_children(elm)
        elif stag[-2:] == "Pr":
            return Pr(elm)
        else:
            return None

    @property
    def latex(self):
        return self._latex

    def do_acc(self, elm):
        """
        the accent function
        """
        c_dict = self.process_children_dict(elm)
        latex_s = get_val(
            c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
        )
        return latex_s.format(c_dict["e"])

    def do_bar(self, elm):
        """
        the bar function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["barPr"]
        latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
        return pr.text + latex_s.format(c_dict["e"])

    def do_d(self, elm):
        """
        the delimiter object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["dPr"]
        null = D_DEFAULT.get("null")
        s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
        e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
        # An empty begChr/endChr means "no delimiter" -> LaTeX "." placeholder.
        return pr.text + D.format(
            left=null if not s_val else escape_latex(s_val),
            text=c_dict["e"],
            right=null if not e_val else escape_latex(e_val),
        )

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        pass

    def do_sub(self, elm):
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """
        the fraction object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["fPr"]
        latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
        return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get("fName")
        # str.replace rejects None, so a missing argument becomes "".
        return func_name.replace(FUNC_PLACE, c_dict.get("e") or BLANK)

    def do_fname(self, elm):
        """
        the func name

        Raises:
            NotImplementedError: for a function name missing from ``FUNC``.
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "r":
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    # ``NotImplemented`` is a constant, not an exception;
                    # calling it raised an unrelated TypeError.
                    raise NotImplementedError("Not support func %s" % t)
            else:
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["groupChrPr"]
        # Without m:chr the old code called None.format(). Fall back to the
        # under-brace (U+23DF); NOTE(review): believed to be the OMML
        # default for groupChr -- confirm against ECMA-376.
        latex_s = get_val(pr.chr, default=CHR.get("\u23df"))
        return pr.text + latex_s.format(c_dict["e"])

    def do_rad(self, elm):
        """
        the radical object
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get("e")
        deg_text = c_dict.get("deg")
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        else:
            return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object
        """
        return ARR.format(
            text=BRK.join(
                [t for stag, t, e in self.process_children_list(elm, include=("e",))]
            )
        )

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        Raises:
            NotImplementedError: when the base is not a key of ``LIM_FUNC``.
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
            raise NotImplementedError("Not support lim %s" % t_dict["e"])
        else:
            return latex_s.format(lim=t_dict.get("lim"))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object
        """
        rows = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "mPr":
                pass
            elif stag == "mr":
                rows.append(t)
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=("e",))]
        )

    def do_nary(self, elm):
        """
        the n-ary object
        """
        res = []
        bo = ""
        for stag, t, e in self.process_children_list(elm):
            if stag == "naryPr":
                # With m:chr omitted get_val returned None and the final
                # concatenation raised TypeError. Fall back to the integral
                # (U+222B); NOTE(review): believed to be the OMML default
                # for nary -- confirm against ECMA-376.
                bo = get_val(t.chr, default=CHR_BO.get("\u222b"), store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def do_r(self, elm):
        """
        Get text from 'r' element,And try convert them to latex symbols
        @todo text style support , (sty)
        @todo \\text (latex pure text support)
        """
        # findtext returns None for a run without an m:t child.
        text = elm.findtext("./{0}t".format(OMML_NS)) or BLANK
        _str = [self._t_dict.get(s, s) for s in text]
        return escape_latex(BLANK.join(_str))

    tag2meth = {
        "acc": do_acc,
        "r": do_r,
        "bar": do_bar,
        "sub": do_sub,
        "sup": do_sup,
        "f": do_f,
        "func": do_func,
        "fName": do_fname,
        "groupChr": do_groupchr,
        "d": do_d,
        "rad": do_rad,
        "eqArr": do_eqarr,
        "limLow": do_limlow,
        "limUpp": do_limupp,
        "lim": do_lim,
        "m": do_m,
        "mr": do_mr,
        "nary": do_nary,
    }


# ---- new file: packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py ----
import zipfile
from io import BytesIO
from typing import BinaryIO
from xml.etree import ElementTree as ET

from bs4 import BeautifulSoup, Tag

from .math.omml import OMML_NS, oMath2Latex

# Wrapper document that declares the namespace prefixes Word writes into
# document.xml, so that a serialized fragment (which uses prefixes such as
# "m:" and "w:" without declaring them) parses on its own.
MATH_ROOT_TEMPLATE = "".join(
    (
        "<w:document ",
        'xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" ',
        'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" ',
        'xmlns:o="urn:schemas-microsoft-com:office:office" ',
        'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" ',
        'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" ',
        'xmlns:v="urn:schemas-microsoft-com:vml" ',
        'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" ',
        'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" ',
        'xmlns:w10="urn:schemas-microsoft-com:office:word" ',
        'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" ',
        'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" ',
        'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" ',
        'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" ',
        'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" ',
        'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">',
        "{0}</w:document>",
    )
)


def _convert_omath_to_latex(tag: Tag) -> str:
    """
    Converts an OMML (Office Math Markup Language) tag to LaTeX format.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the OMML element.

    Returns:
        str: The LaTeX representation of the OMML element.
    """
    # Format the tag into a complete XML document string
    math_root = ET.fromstring(MATH_ROOT_TEMPLATE.format(str(tag)))
    # Find the 'oMath' element within the XML document
    math_element = math_root.find(OMML_NS + "oMath")
    # Convert the 'oMath' element to LaTeX using the oMath2Latex function
    latex = oMath2Latex(math_element).latex
    return latex


def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
    """
    Creates a replacement tag for an OMML (Office Math Markup Language) element.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the "oMath" element.
        block (bool, optional): If True, the LaTeX will be wrapped in double dollar signs for block mode. Defaults to False.

    Returns:
        Tag: A BeautifulSoup Tag object representing the replacement element.
    """
    # Convert once and pick the delimiter afterwards.
    latex = _convert_omath_to_latex(tag)
    t_tag = Tag(name="w:t")
    t_tag.string = f"$${latex}$$" if block else f"${latex}$"
    r_tag = Tag(name="w:r")
    r_tag.append(t_tag)
    return r_tag


def _replace_equations(tag: Tag):
    """
    Replaces OMML (Office Math Markup Language) elements with their LaTeX equivalents.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the OMML element. Could be either "oMathPara" or "oMath".

    Raises:
        ValueError: If the tag is not supported.
    """
    if tag.name == "oMathPara":
        # Create a new paragraph tag
        p_tag = Tag(name="w:p")
        # Replace each 'oMath' child tag with its LaTeX equivalent as block equations
        for child_tag in tag.find_all("oMath"):
            p_tag.append(_get_omath_tag_replacement(child_tag, block=True))
        # Replace the original 'oMathPara' tag with the new paragraph tag
        tag.replace_with(p_tag)
    elif tag.name == "oMath":
        # Replace the 'oMath' tag with its LaTeX equivalent as inline equation
        tag.replace_with(_get_omath_tag_replacement(tag, block=False))
    else:
        raise ValueError(f"Not supported tag: {tag.name}")


def _pre_process_math(content: bytes) -> bytes:
    """
    Pre-processes the math content in a DOCX -> XML file by converting OMML (Office Math Markup Language) elements to LaTeX.
    This preprocessed content can be directly replaced in the DOCX file -> XMLs.

    Args:
        content (bytes): The XML content of the DOCX file as bytes.

    Returns:
        bytes: The processed content with OMML elements replaced by their LaTeX equivalents, encoded as bytes.
    """
    soup = BeautifulSoup(content.decode(), features="xml")
    # Paragraph-level equations first: this also consumes the oMath elements
    # nested in them, so the second pass only sees inline equations.
    for tag in soup.find_all("oMathPara"):
        _replace_equations(tag)
    for tag in soup.find_all("oMath"):
        _replace_equations(tag)
    return str(soup).encode()


def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
    """
    Pre-processes a DOCX file with provided steps.

    The process works by unzipping the DOCX file in memory, transforming specific XML files
    (such as converting OMML elements to LaTeX), and then zipping everything back into a
    DOCX file without writing to disk.

    Args:
        input_docx (BinaryIO): A binary input stream representing the DOCX file.

    Returns:
        BinaryIO: A binary output stream representing the processed DOCX file.
    """
    output_docx = BytesIO()
    # The files that need to be pre-processed from .docx
    pre_process_enable_files = [
        "word/document.xml",
        "word/footnotes.xml",
        "word/endnotes.xml",
    ]
    with zipfile.ZipFile(input_docx, mode="r") as zip_input:
        files = {name: zip_input.read(name) for name in zip_input.namelist()}
        with zipfile.ZipFile(output_docx, mode="w") as zip_output:
            zip_output.comment = zip_input.comment
            for name, content in files.items():
                if name in pre_process_enable_files:
                    try:
                        # Pre-process the content
                        updated_content = _pre_process_math(content)
                        # In the future, if there are more pre-processing steps, they can be added here
                        zip_output.writestr(name, updated_content)
                    except Exception:
                        # Best effort: if there is an error in processing the content,
                        # write the original content. ``Exception`` rather than a bare
                        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
                        zip_output.writestr(name, content)
                else:
                    zip_output.writestr(name, content)
    output_docx.seek(0)
    return output_docx


# ---- remaining patch hunk (not Python source of this module) -----------
# diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
# index a9c469f..b320695 100644
# --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
# +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
# @@ -3,6 +3,7 @@ import sys
#  from typing import BinaryIO, Any
#
#  from ._html_converter import HtmlConverter
# +from ..converter_utils.docx.pre_process import pre_process_docx
#  from .._base_converter import DocumentConverter, DocumentConverterResult
#  from .._stream_info import StreamInfo
#  from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# @@ -72,6 +73,8 @@ class DocxConverter(HtmlConverter):
#              )
#
#          style_map = kwargs.get("style_map", None)
# +        pre_process_stream = pre_process_docx(file_stream)
#          return self._html_converter.convert_string(
# -            mammoth.convert_to_html(file_stream, style_map=style_map).value, **kwargs
# +            mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
# +            **kwargs,
#          )
# diff --git
a/packages/markitdown/tests/test_files/equations.docx b/packages/markitdown/tests/test_files/equations.docx new file mode 100644 index 0000000000000000000000000000000000000000..6a05cd77f62aeb61624be36c05e12773177a8071 GIT binary patch literal 15235 zcmeHuWpEuyvhEQxGfS3aF|#a|EM|+D$zo<^W(JE{7BgDR%*@QpPcwJ-&8&BCyomSr z-agS4r@QLQrp&CYsxM_FKtNFe-~dPf06+v_jImNS0s;W?!2kdh03@)wpp~V)p{2dH zf{V4GohGfbg*jmkC@}dC0Psiq|K0uvzk%BLVY6;}#IN@OZ+?^Y3*yb>`8|U+rMJj1Xq zp0&<>B+ii=;|rJk;2`K^LjuGWk5fI_NRVKwPnt~2YQ#Sa(5rAJZ#L+u#( z!cRGr5nSvutY15qffzCeU|xS)#8prG7Bj;U$wOG4@DNpX@0P2y&us0Td~>Kg|2g({ z@tVIa7PSau(Y?hWi1$OY-rqq0vj1Y9_%YZG=N~oFAJzx+VV>HyhURv3w11TUYnK0m zZSil9UKZQ+VV1Cb=YFq#lWlTKU6{Gjbovv^n2QjQ>SB^8%L``nZ?9Yn3&5I(x+23< z({U57cIm0rek6PX8&#lgYgn-t32Io2JE!ecZJEPYjVlj##zc5v_ z&~a?=n1|rx(N2_IN};>mLKxFe6yr1cBsIC2Hj4B2$(~F`^$1Yh;WtqqwEUHm9c}O^?wlgT}sSojh{Q?I`RG3803` zh6mavNR*PJ9Tq`DI~sn5PkbkS-w$qbkNm z#w|L`^O`igh5>6j@)9s;%gf71*y1jV;qGsOeZ90f+uYsVB`v>pnkSF0Ak5~0f3B`D zsRk8?ArFW{Vy=2VeJcYCmgQP$j}-3=#CNZHP>7KVd+> zRY>5(7xJbYaDpsw?m~qiMx#S-2bpF0Tw!L?jK2TL7f+NTi4A-c?T0P&z6P719hA!T z81RHV9s(lxPZ(GaD@66vk_^sh8vr6fcJ_SsNL#jAiH;&jH9YWEnVIOFuBbZrJSXg6 z5pf-YAmg!UMt8K?p8pYeBr)dI5ZGr(`Zw`JO5mMqO3v+0B%QYv1b4@7hB_s1ke7>V zpm67(xIigqYmgRbx)|UEY1td?t)F!1PQ4QEF=pJv-fv>=z4@7{G&Gb0=Ba;BA4{Yj z&>r%L;YpOSBy@ToBL&S|=dOJDAvo!rpX8%)D&L6)UF~=*YSlPnX=f2*4+8@?WH4qI1@VMG*q1a{(fchYv?d}nZ>Avjg8E$Y_D;egk?l&^%0XX41Ta`+d7 z){(0?m*LI*kIdL#gA)CCBgS z7^A`DwLz(?b63BhG`tl%k4BAgzA}ViAt7J`1w|#Z0mFcEqn%>05xLbVJlqvD85 zmQ~H2Nk+(fD>i|8l>xrflVXaUdwnCR$v+5P+@W0Gnanau-qOw=mBUQ6s0(s6MJ=z$ z$60M9ZyM_GFd6*G%q&q;R@^>5VcC-BMa1^24rfWE4WUAtm*5q%g$mM>qT$79>`<=m z1@xV`4QuZ5O`bdec)asUOV!?JaY(G!c>V36RJ;&rM%+tVDpP@%{Ql5+4kOsyEpG3y2sH#? 
zMf1KRx-+-Sa72tvi!QTS-6-vb%Ay$asy=Sit`czhK{4g#O{i)7L>ZV#Asy)H-l z^5eRV1-P07EU=*3!AcE?p!aSQESb#IthkIHh}d;u=icdgY-AxGIgnv>WCk=(@{1v}aR`u6*(P>hrY1o0EjEfcM8;-Zcm3p`ryo>dwPte*H zb@L_%!eIBJsI1pS-Zh4So<$<s6+V|bNjTWTKqMRcjV?ey&6VP2PLKkQd16B1JvqRi^_7^?)O>tk&i=Y8JNpEC zj`;9C)6@0*@KDw6cF8Z=8>NT6KyH{q(+1BwhaqEQz>32u$WNdfNTbsP7%D<`Se5Wh zWBK(auGCrtlw62!NW>kYKAkVvxlmoAbMh9uq?0r!Pf~2gem_Y-Z)QUyBot{bZ299o(N@A|aAewcgrFR3wSB%Qe(s6IJJ>>_ zBb(>D2R5KzziG-}0$sgiZY8jx$B1t*_p4Kl7G>H)_}f+en^F0;B5W%-X;i`~3w-LU z)@SwA;l%O3t3QZ$Xx=}8Yxx2D<}R`}D_Jp{N!(yF<{OMdTg{nS zwl(+{7U_L%Mt&6C=~aV_qQDXLcBDmR2i8N%rNEWY_<|nqhrh?*0|U`RpQGW}qE$=+ z@^j1eII5m~g`b`{!#?6sDQOGjA$e1!HXTR4G|s2h{c`K%^17zKBZJ$;_U-|rGU}C2 zW#>+3$9Pxk{o!_f-3#dL={9IyrlV@x1^xYKpHBPrt-qrf1H9ZStIhT8WYO+e zOzY?dISj!TqK!37hVK{Vc$ivAZ{d?ul0cvy)ey%WJ4}93eg6BeIzOyv2z}5sAW|G# z`wf~afqsrh!Tk~eN`o;6T7wk;apt2qlr_?vmSRz!gT5YUkQv)kUBQEtF}HJKYhB6e zXFRKtGGyZe9~Jzh3GabQ1T=%0L1ci`5>=%;55i>>KV1yCXE@+9XBk9{3O5lCxU(jI zt*%6he`7VbvqrEduPQ#zMnR#t2Pa`o2*xyOh2IAbRO+KAQQhG|^@-S-^o@Y?vm@d@ zB^KcK@?~!rTrq<2JCh>6PMkf3FhEKb{d_SJxDpg^6vNGV3{D?jo^FF`P?dF=0xJgz zUtAgcV-H@=;t&FJxA_@gIIv}Wec8znyBZ=GJB4XEk#A-t3WhOYprCbBfVj6Ft6o^d z_zWkzEeg@6(ApQV@i_%fM!$QH;tHE{)cpvr(S>$V@?9T;r&qzDJW90*Hx!f|;FLD{b;- z2iu~){N4ke!G!woK+juWJ(hr;WHk5)qF$wyM1y>fwHIYheH?n4`fkBqfz+z(3c0HQ zZpJsDSGI(xuUmR%_+KtON-|pgt5vsmHd(3CXbn+bE-_fS*`wfNxFo12hMV`QD<0S9 zSzMkUt5ciGjA%=-S_UABjHZBvwL!{h-D(r*@Ar&Dp*WCXeD9iQJsMY@<9tqL;Am>w zO+rB)8`H-+A7PJ-JBuE0!#2phtI`Ydj4vML>&E#do2aww8x$L`39m1_r4#R+OU9<0 z(&Pc5wd>^-&xXaHW7pKU+aNVd=V`RH>nkA17aNFTQbtJR>Z6N(a*h*aD?1WVr=p|7 z*JubRNd(ekgepE*MBE{j@n#<(iiTKac%O(Y(ulG0tlBa3+CaEj$g@jztBVRbBI$%ejP>b8WN z)hxem!^t*(B??xyEYWJzJfYPguenGP>QIgT;Ow%w40(b<=k^^DYtYzx9 zWYu@M=nf{qwwH#vvpiAa)ok$8p;csx{1c?atihSjOO*`zu+x3B*YT@IBW2ys za8s4MW_VgN-PgDFo!yQO2o<$G-BBFo;Zf=G3F-!j0P&yGP0geUppqLXQ(pgD`u7>Jo8^oOusi>&E5~Srp}-J=UJqh zxb>?apa8%L+}}pYc82!$rk2Kbe{_De%Ij9ktcacjmG4~_*SN`vp@+$u1K%~L=d{&OogfHsw_hgYb!B)ui6k1;qtXE2d%1A5RAvc zEtXAFKk*A#2`Gr3BDm=IjO`7u$-#TH2g?SNqRthEkt43F3r65`F#h<>!eRCTntn{` 
zaTiffEE)SPB2B&3gMD8P;(_BZa<28hY=WhiaEU$2b!}neag#EO7y&mD8MH-u^c%*$ zjqoZ8`k|U2J85^-sb46YF%HJ$+JUp*X5s8boSyNVP6d=6N$Dq7}@* z4utE00^x3D@WBeFC19rEViXk|;peOOQpvgsRgQRV;VrG3?--2s>F3$q@X*-;HHh&?K^xt!C zt2^X5!*y8=Dz}nfN7X5SVvd@;20(hE0%+^@Wn_u(`=Ng_$g|`fMx>|2seF^S2u9Rc zN#vO6OXDE^>I243DEl3UoZ!<#*Ulq|FvrTY)lNg=1A%NH`t5l(H9?{V#TXB6DfiCP zy!6z}^Fyoq=6B*G$IBEu2YH8B6Xd|@&+-wD!TT*SH3x#gWRP>GXPLy=Oes={ZK69; zuMPyrqrmfNx1YYV!CF2=7av1(l^~?hkm<(X5QP;p-MTq7i@MXDMCfq9z4V-aHIlQQ z##w$hW=?rL%VDcmM5cXyU7>!pW;B(l5z9O5NMn^ofmm_8t8pBA6P4t9wO-b-ZT$j;{%jQrl zsyEAbUeOO9v?TR&1EnQmZDO8+lfPrgw<)%ER-x1^c4Y_1y^9`;Zn3R6tE*|5k zvAqq)Q?*4}Txhu^bt}K(?2U1=-M*l>qucILqWnc-w@9svn3GnR3-~qmNb3*DbJ*JD zuVmL{Cz9h0j2&@cO6d006GOkG(N$IR4Be6kfGheTFwq=~cfktd39R3;yS0N;Nd~w} zE=roXvs;~sO@4mka1OsjSs7&~gMLd>ATd4=)1KAJ8&$UmjcipKw`VCxDBXvBk|@TD z+&1nj=;;Xix_*S^W4nT{x}-U$EVU6YidlxXavTa1_Nq(I7HpTfdPEoVBSyrs8dB+$ z%SP~;`^U&hfZcm|7-=lK6~lLI)`E51euRS36kc0oGHjROP8^yW8Rirkynd+ye)`u2 z3;AcZ)2ILJ&aOJzuvJECCYOi3;~00T8t+qy&rm)G>8M3Qpq|0OO;C*?SZul0jY%T%{iQPFce4x-n z#gtGTj7Yg1Pp`37Lyg|L_r)eD-@C>pK5sPw7;i62)DI)=S3qLE5N%OjIZ z3Y^8K3dD_Cs3K~6hIKcKNI)aU4$i^EhnTG&C3iMk1fcjSOlXA;*@_+W;*v)IiZMdW zp+BXEKZqCi%O!8pMsFKr3?{aFD-$M=v^5k&Po-6{7tb4px<*r&$pxqY%>;JrssRh% zM@dA-(w0Tn2U7;p0Yz$)ss`C*=4jpFY3R0S(ke*WZ^G25YXv!`CNF=cSx@{7`xY*W zO4wG_7!FE+D2j`2y6m7CBu&DvS3xi`i*#e4DDey0U61_p&$)Wj`fLbkn~v2l!Z_o+ zWJ`1Ys|vx=`k^rA7G^S-=^JYoWizm`-(}2sus6a1{F0wNshNq*vB=L#)orRn4D&fg zE6Ani$mTK^KzKQs03@@Dko~isMf}AGDu$opZaCA|i1X~LdMIdHa!iAtLn8Q4+JLG! 
zbmIFpdA~{8YEHetJL~#!Q^RXh&1i%D49!gRh`5_-IEicK*6cEa@m%m9`pI$a6bf^& zLmjCF&w=4A(nU?o@v1?V#x`;{x|YwTVFT_n$XtBb7-Y699YN@7TQ z2Yd7W?xYOw+p87t3O8Npb9hn==)my)@I&upPKnZZRMfC@S5lcfr4m;X@7UKWgM=5g z6?{6?OT#u-1)AzE0T+%u+z9IR^efU`A@C` z#)N*aXGXed;zt-`!`o^})#E6b+w9w<#cf_)pnB{MBle(fy1Z_Y`AP|$VyKPF?PkQLb#3^> zDutZ+<;z|e^JWK)^IOx-pwTm(l|A7yT$RfH)wK!dq+EQ$;IS2hc5y3}*J#StO)lfW z%-XTNowdc=I`+xT)SOYrhNJ_}e5#N3g z=fsXIL@NFfpW0Fodvu`707$;er)Q)#UM^U^)u3%TV(flmR^~_%2@~8Xg$Z_i(jPWl zK9PNwJq+0WCaMF$)Lqfp}M030!fGj4CI3t0Mq31jeN<55kK5g6h z6IIl{X^c_Y|2F1x5A)o`5??ng9USw>#n2yrU~$KaGz=z#xeO62A$i0OIm>7kW>PxZD76~ zxph^l3soY%MLuO2o-^Y6qG3^njjxLX)G8JhD+zvD35xV02WO0yftq49J+uk8tOJ)c zYpL|fA~S%*B)@q>JT7p3?WZ%*a5bG@%vvxtiW671U~2`muHI$B;_u^u^uNU~Ims5Sy>F+x4c0J+V^Bh(csGP%Ek;X`(^0 z|<}Q|t$M^sYzl$;Eb(RQxF&WeR>W{b(Tpf0CdS)6CeLZ0z&(7thZsF%v&L zC>xsy^0$P7=gpfK{osKu23wuUt`WLSM7Ljq(27$ra*$`?ppuy^R4R-+uAQArp21-xZ(U_pBWA z*}%Yrp7Jv@CK?=ecqj;*uz2Ui7kk{w&4xpEo$m8l1|1xt*x%0rMC9w0mAmj9bHLX{T3;-=L_-0@(Q*q;T#F`;^CcdpSCWB^ z&+bDiBN(3~oHsehI?BtEevay6wT#Y>9|Q5Np<&g9||TCMoFV4YN>}bfe;S%q(02{GUP~2eibWH&Yt^*QBhwVz8=*!9YK)ew00?YBhBX-td%nGJyKlEGk&dU5MlC* z*dx)zBJ>?(qdC~A%_8(tkO5UNa%?$xIPeQ%TM&pRWUVX_s9F(-FtjIRysVE)v@8*{ zS`i{JjRK?(Iwh|6ra$zljX=O1kw5_XAG=Amz&{}WXk)gqu=X9FL?PH0ozkBV22J2c zKuLcX$o~uW(H06V83GC|=SKiE(S$$%qDgkD2wZo-|KoE133n70@c(f=%}Y}{xhKmS z-rjLn*YkyQufrkMBjtNJOANCaf)dLFg3|XI$Rq*x5H$iW+(Xw~0#VLk`Ru|X?xa~w z4G#&>qhI^u%BwODjRzpw%?4Qk6Z4Wl4~;)blfPj(1+;#L3T?Gf&TC23nc+AaYOf$Y z0PoinQ$DV4r}*k9(%DEg%HCIqe_dgInk_ekUbT4F#kBQl%*DRxifU^tT?n=Zl=6g5i=ldH;;oVnYxh`uCy zq*5UWW;V(@ANv!i9GuRt;y~Fh1;LLRUA-Q8IHZqSGGkr=#96@SbTddaNfv+TTx>@} zl@cQ0M6+P;d{>>IlK`kfYyq%uaM{0#0{y;F&msazun2X2fPIU=={G-w>68#DV9x#! 
z7ykB#Kta%tCPcwz02JvDX$9B<4szwja`i>R3@6Fk;JT4S6@reiHiL@{utkZu$DL+_ z{E%gdgvu=Yjb?+Sqfrw{+`IzX*sqY}JZ<#Odatr3fv>W<3fm?5tR>w6Mg4b;(_bO^|J&*h>{1c_0+E~Uui`#MiHwhiq`V&@ ziX+T4>Sl!hDaYlXm499&HtSz1>*W6J4i5d8#h|o+9NG-NqD{8ZozYNXpy9mP{u+1b zl8Eux6%%qRIhQkbac`)5N%%D6bB?U0P@h2!Lo4o< z3!~R!Xk>GRWvWadon>slplEY-3WUWh^i+*Tv)z=-IOvuv*9v@ZH%n60HsQCrbTf7l!clgoH* zqiRwTh0B=R*Er5=#8Z87-=}++7O|17^$7JLBVfLW)FH(Jr2G309G#msjFfXkd)!5d z<{8va4}SY#S&kaZg`?e@dY^2@yTH)4n2^s3eNkKrv@4>;(>&p*Gz3qgVx#ZM1rCDS zMmj?2Zp6RVeh8x=?XQannIv$vuQ1SqkEk_RFz5_Bq+%v>U18Frg+~BcL7J%IGE=1U zetUNW5ul-~g9=JhpizI3zh@B`viu1E1-WlfqTrv zmtce8k?Y3=Y?La)!p{T<>Oq;Eg4KF^OYUXqC{=oe5FG??G1oeKE$&>B!OoT6?7g5I zCdLg2DH!6UdBh8Z;ul=1MJOY9GeDWHw~04>gF%mfb@h&uL8p{JA;*sPN6a~vn@N_`|0p84&NUN_@20! z02RKE-_?ef#x!O8n&%ki_P%IDt!c&4@LK+*nRWRfX5Z7pplvB5$%j zmJl&n*CIUD?m9zD38v^W3x<8>`4f496=$pFu2JjTOY*Z78Uz?7HTvcp zJaX?(mBO0fRvNS7{jv${=})Q)R55<872F21igBY)#4YVpGFIlEYZ%=VXvAN<0-*;K zv^VAF3ujtDEyx`sc3lG{?56I(+(y;k9c6G@(GfAoX^?!5$nQn^fvl#h+f8IFmP`9i zV4`>^#<(?<3vdEvkV=Pz-dX0q-g@STleD}&5FDX@x#jOm8X{BkSUUBY-Wo-lf#8Xa z_2I7Z7*{N^`do$^p($zq{RT|)TI!sU`SHDY`Uhbor=XLGQzdCEH`4Qg{dx*9+d1Ja zhjmT=1huO6k)YvV3P;JU5=ET2W3Li(JBZ)F)7vm!)LhmO!8^o%nrisG_9SZc!40AL zSOr1-=Tw7}q29N@ErO&^uUh^hLZFbIUJ zN_%#`=b+U61k#G&)l5R(XPCO5chf#Z?$8Q4x`IeWuUpMmnu4I&lAXGET7hjngLZ98 zlW+sUDLsv4iscsO`;~};i+J6XzJ0I|T>){jQC{gRZ??gNMNLk1XG23JN=Zb9$#iVQ z#1_nSkcN}`g`T>gyeM1XiQM8-`J1;XuZ}y|2>Z!1R#Z8^pwWCxP8QRbFY@;3?`YFN zJrx)Xp6s(oX~}C<=@&UC{TXAyeHweJgBmUr=p|X~hr#_Y1<_V5pKq3H53#rooC8yh zf8zKhmMcd8Dwq7#D~|oa&U@@BfYMzL@3%1Z6X+#8S^CtfUDi5t1)*e;MLh>k9nmdG zQMSIWB@m>=b=6Dv@%sOrqp=RB%_93ja^{2j+nlb^M=pYbuAaH!p9GEg<;W#^#K3c^ zTfCeVItDQv)!ND<{36#$Ern$W`bfdDh|>XmX{+@rJVL5w8r3={unWD%WWN|SQ5tX?AfK@UIqV2}sWC`+SYOw97 z)N)egxqY(0dC?hY%E?Zj`ZoXi*#eVrBV2%6qQiz52XOck&=gZ=@}(wkU!EaJ^5SlA4uP@zPH;}Qi#7uuU)5QFXpJD=+1At-YI1YA z+ituyxh>n`e6)Zgma4L6q-%HvSe5cciKM~j6jn~)`>8g;!2)e1ah&F|82@irn>t_d zPdqPQ8^aJD$BxGmS;@a+{P=$DXqS7cAQ$pOCSrDDH;@v}J(vt9bTn7&-ak5>z2z5M 
z6v{neX%WyF;wC@xPOP}xIRd3Q7$3Wdzu_$Zx-78<%qCYe?|F>~>XY^RXn+U7ib2a(fKw=84xbX0a1f)Jy zI+5`GJDZ(BtgG6K3{Vk#*{00!vvJ&}#~NZEyC&i+jyHbjr$Cf0pE#67UWZ>8&!vm( zk^)(agdWD-=b|^{KZ+0E-Ya?#6lkle52)&9t@?mS8jmYLs&gFyILS3vrpntkxgWNodi?dbHaZ2wfh54!#T^2Z$|pjMd9p$4{CNIYGj7*d z)tNx2tZc^DKhE3vhcjv35c-ad$95Ld_@{5aXbHYxrk$~duI};Hc-KRnZNA>hZ_{4y2~Dw zQ*iohphA=LXXkH{J zfZ8X=%z6gq(ch=d{`%n`|6OxsQ9jApe`xN|M=j#tHCNZ#`VYVPzl!^zwI5r${IK)~ zF|w8T5<%{)eEQRmlKKi+sve)DnTbpqw>TB{dP5wc_e{8KqF@}J(-Bt@rtjTPxh#uc z0*f*zj7{>tEzOm_Y=EkaiGLdi-8PGZTd(qYLITPGeJqi=tb*}@>j}Eec-1*M@u?C?8q4!9MfJ& z5|MGm6*q4~f_l_vGc6d(M_=d^IEr&Rv#W|!@F&nMz1z;Gek9flLn1;`wBQ)_umR6Dwg z@QBd;`qYxIGVa$#;MBEva&V_7djD)Ruk?1)~`@@t)%UAcPm(L>3}%GA^gJ2YQ6Oww4ddr9%Mq@vrmvpQ~(-(MI%kbS)G z|I$<-V49DB>py23{`pz{R{V$DLs^M`74Wb5P=CV$vOXO5-?F2A2mYQY@)xw@gBkj7 zX(PYG|CNdS7Z?D@fc*#jzu_hSF6nnN>R+;U;Qt$H>hB_cr}O+JV)ujI`scI!o!s*~ z{P*Sazu>jl|A7C?V*2m+-6D0{;%%#@9O+ M`p{5w_CJpP58I?#egFUf literal 0 HcmV?d00001 diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 33e2c44..1819183 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 -m pytest import io import os +import re import shutil import openai import pytest @@ -262,6 +263,19 @@ def test_docx_comments() -> None: validate_strings(result, DOCX_COMMENT_TEST_STRINGS) +def test_docx_equations() -> None: + markitdown = MarkItDown() + docx_file = os.path.join(TEST_FILES_DIR, "equations.docx") + result = markitdown.convert(docx_file) + + # Check for inline equation m=1 (wrapped with single $) is present + assert "$m=1$" in result.text_content, "Inline equation $m=1$ not found" + + # Find block equations wrapped with double $$ and check if they are present + block_equations = re.findall(r"\$\$(.+?)\$\$", result.text_content) + assert block_equations, "No block equations found in the document." + + def test_input_as_strings() -> None: markitdown = MarkItDown()