diff --git a/packages/markitdown/ThirdPartyNotices.md b/packages/markitdown/ThirdPartyNotices.md new file mode 100644 index 0000000..44edd8f --- /dev/null +++ b/packages/markitdown/ThirdPartyNotices.md @@ -0,0 +1,232 @@ +# THIRD-PARTY SOFTWARE NOTICES AND INFORMATION + +**Do Not Translate or Localize** + +This project incorporates components from the projects listed below. The original copyright notices and the licenses +under which MarkItDown received such components are set forth below. MarkItDown reserves all rights not expressly +granted herein, whether by implication, estoppel or otherwise. + +1.dwml (https://github.com/xiilei/dwml) + +dwml NOTICES AND INFORMATION BEGIN HERE + +----------------------------------------- + +NOTE 1: What follows is a verbatim copy of dwml's LICENSE file, as it appeared on March 28th, 2025 - including +placeholders for the copyright owner and year. + +NOTE 2: The Apache License, Version 2.0, requires that modifications to the dwml source code be documented. +The following section summarizes these changes. The full details are available in the MarkItDown source code +repository under PR #1160 (https://github.com/microsoft/markitdown/pull/1160) + +This project incorporates `dwml/latex_dict.py` and `dwml/omml.py` files without any additional logic modifications (which +lives in `packages/markitdown/src/markitdown/converter_utils/docx/math` location). However, we have reformatted the code +according to `black` code formatter. From `tests/docx.py` file, we have used `DOCXML_ROOT` XML namespaces and the rest of +the file is not used. + +----------------------------------------- + +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
# -*- coding: utf-8 -*-

"""
Lookup tables and ``str.format`` templates used to turn OMML into LaTeX.

Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
On 25/03/2025

Local corrections relative to the adapted file:

* ``CHR``: removed a stray closing brace from the ``\\ocirc`` and ``\\check``
  templates (``str.format`` rejected them with ``ValueError``) and fixed the
  ``\\underleftarrow`` spelling.
* ``T``: italic small delta/mu/nu/tau and the down arrow now map to
  ``\\delta``, ``\\mu``, ``\\nu``, ``\\tau`` and ``\\downarrow``; the
  duplicated ``\\u2192`` key was dropped.
"""

from __future__ import unicode_literals

# Characters that must be backslash-escaped when emitted as LaTeX text.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

BLANK = ""
BACKSLASH = "\\"
ALN = "&"

# Every value below is a ``str.format`` template: ``{{`` / ``}}`` are literal
# braces and ``{0}`` is the operand, so braces must always come in pairs.
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    "\u0300": "\\grave{{{0}}}",
    "\u0301": "\\acute{{{0}}}",
    "\u0302": "\\hat{{{0}}}",
    "\u0303": "\\tilde{{{0}}}",
    "\u0304": "\\bar{{{0}}}",
    "\u0305": "\\overbar{{{0}}}",
    "\u0306": "\\breve{{{0}}}",
    "\u0307": "\\dot{{{0}}}",
    "\u0308": "\\ddot{{{0}}}",
    "\u0309": "\\ovhook{{{0}}}",
    "\u030a": "\\ocirc{{{0}}}",
    "\u030c": "\\check{{{0}}}",
    "\u0310": "\\candra{{{0}}}",
    "\u0312": "\\oturnedcomma{{{0}}}",
    "\u0315": "\\ocommatopright{{{0}}}",
    "\u031a": "\\droang{{{0}}}",
    "\u0338": "\\not{{{0}}}",
    "\u20d0": "\\leftharpoonaccent{{{0}}}",
    "\u20d1": "\\rightharpoonaccent{{{0}}}",
    "\u20d2": "\\vertoverlay{{{0}}}",
    "\u20d6": "\\overleftarrow{{{0}}}",
    "\u20d7": "\\vec{{{0}}}",
    "\u20db": "\\dddot{{{0}}}",
    "\u20dc": "\\ddddot{{{0}}}",
    "\u20e1": "\\overleftrightarrow{{{0}}}",
    "\u20e7": "\\annuity{{{0}}}",
    "\u20e9": "\\widebridgeabove{{{0}}}",
    "\u20f0": "\\asteraccent{{{0}}}",
    # Bottom accents
    "\u0330": "\\wideutilde{{{0}}}",
    "\u0331": "\\underbar{{{0}}}",
    "\u20e8": "\\threeunderdot{{{0}}}",
    "\u20ec": "\\underrightharpoondown{{{0}}}",
    "\u20ed": "\\underleftharpoondown{{{0}}}",
    "\u20ee": "\\underleftarrow{{{0}}}",
    "\u20ef": "\\underrightarrow{{{0}}}",
    # Over | group
    "\u23b4": "\\overbracket{{{0}}}",
    "\u23dc": "\\overparen{{{0}}}",
    "\u23de": "\\overbrace{{{0}}}",
    # Under| group
    "\u23b5": "\\underbracket{{{0}}}",
    "\u23dd": "\\underparen{{{0}}}",
    "\u23df": "\\underbrace{{{0}}}",
}

CHR_BO = {
    # Big operators,
    "\u2140": "\\Bbbsum",
    "\u220f": "\\prod",
    "\u2210": "\\coprod",
    "\u2211": "\\sum",
    "\u222b": "\\int",
    "\u22c0": "\\bigwedge",
    "\u22c1": "\\bigvee",
    "\u22c2": "\\bigcap",
    "\u22c3": "\\bigcup",
    "\u2a00": "\\bigodot",
    "\u2a01": "\\bigoplus",
    "\u2a02": "\\bigotimes",
}

# Single-character translations; the trailing space keeps a LaTeX command
# from fusing with whatever character follows it.
T = {
    # Greek letters (mathematical italic small alpha .. pi symbol)
    "\U0001d6fc": "\\alpha ",
    "\U0001d6fd": "\\beta ",
    "\U0001d6fe": "\\gamma ",
    "\U0001d6ff": "\\delta ",
    "\U0001d700": "\\epsilon ",
    "\U0001d701": "\\zeta ",
    "\U0001d702": "\\eta ",
    "\U0001d703": "\\theta ",
    "\U0001d704": "\\iota ",
    "\U0001d705": "\\kappa ",
    "\U0001d706": "\\lambda ",
    "\U0001d707": "\\mu ",
    "\U0001d708": "\\nu ",
    "\U0001d709": "\\xi ",
    "\U0001d70a": "\\omicron ",
    "\U0001d70b": "\\pi ",
    "\U0001d70c": "\\rho ",
    "\U0001d70d": "\\varsigma ",
    "\U0001d70e": "\\sigma ",
    "\U0001d70f": "\\tau ",
    "\U0001d710": "\\upsilon ",
    "\U0001d711": "\\phi ",
    "\U0001d712": "\\chi ",
    "\U0001d713": "\\psi ",
    "\U0001d714": "\\omega ",
    "\U0001d715": "\\partial ",
    "\U0001d716": "\\varepsilon ",
    "\U0001d717": "\\vartheta ",
    "\U0001d718": "\\varkappa ",
    "\U0001d719": "\\varphi ",
    "\U0001d71a": "\\varrho ",
    "\U0001d71b": "\\varpi ",
    # Relation symbols
    "\u2190": "\\leftarrow ",
    "\u2191": "\\uparrow ",
    "\u2192": "\\rightarrow ",
    "\u2193": "\\downarrow ",
    "\u2194": "\\leftrightarrow ",
    "\u2195": "\\updownarrow ",
    "\u2196": "\\nwarrow ",
    "\u2197": "\\nearrow ",
    "\u2198": "\\searrow ",
    "\u2199": "\\swarrow ",
    "\u22ee": "\\vdots ",
    "\u22ef": "\\cdots ",
    "\u22f0": "\\adots ",
    "\u22f1": "\\ddots ",
    "\u2260": "\\ne ",
    "\u2264": "\\leq ",
    "\u2265": "\\geq ",
    "\u2266": "\\leqq ",
    "\u2267": "\\geqq ",
    "\u2268": "\\lneqq ",
    "\u2269": "\\gneqq ",
    "\u226a": "\\ll ",
    "\u226b": "\\gg ",
    "\u2208": "\\in ",
    "\u2209": "\\notin ",
    "\u220b": "\\ni ",
    "\u220c": "\\nni ",
    # Ordinary symbols
    "\u221e": "\\infty ",
    # Binary relations
    "\u00b1": "\\pm ",
    "\u2213": "\\mp ",
    # Italic, Latin, uppercase
    "\U0001d434": "A",
    "\U0001d435": "B",
    "\U0001d436": "C",
    "\U0001d437": "D",
    "\U0001d438": "E",
    "\U0001d439": "F",
    "\U0001d43a": "G",
    "\U0001d43b": "H",
    "\U0001d43c": "I",
    "\U0001d43d": "J",
    "\U0001d43e": "K",
    "\U0001d43f": "L",
    "\U0001d440": "M",
    "\U0001d441": "N",
    "\U0001d442": "O",
    "\U0001d443": "P",
    "\U0001d444": "Q",
    "\U0001d445": "R",
    "\U0001d446": "S",
    "\U0001d447": "T",
    "\U0001d448": "U",
    "\U0001d449": "V",
    "\U0001d44a": "W",
    "\U0001d44b": "X",
    "\U0001d44c": "Y",
    "\U0001d44d": "Z",
    # Italic, Latin, lowercase
    "\U0001d44e": "a",
    "\U0001d44f": "b",
    "\U0001d450": "c",
    "\U0001d451": "d",
    "\U0001d452": "e",
    "\U0001d453": "f",
    "\U0001d454": "g",
    "\U0001d456": "i",
    "\U0001d457": "j",
    "\U0001d458": "k",
    "\U0001d459": "l",
    "\U0001d45a": "m",
    "\U0001d45b": "n",
    "\U0001d45c": "o",
    "\U0001d45d": "p",
    "\U0001d45e": "q",
    "\U0001d45f": "r",
    "\U0001d460": "s",
    "\U0001d461": "t",
    "\U0001d462": "u",
    "\U0001d463": "v",
    "\U0001d464": "w",
    "\U0001d465": "x",
    "\U0001d466": "y",
    "\U0001d467": "z",
}

# Function templates; ``{fe}`` is substituted with plain ``str.replace`` (see
# FUNC_PLACE), not ``str.format``, hence the single braces.
FUNC = {
    "sin": "\\sin({fe})",
    "cos": "\\cos({fe})",
    "tan": "\\tan({fe})",
    "arcsin": "\\arcsin({fe})",
    "arccos": "\\arccos({fe})",
    "arctan": "\\arctan({fe})",
    "arccot": "\\arccot({fe})",
    "sinh": "\\sinh({fe})",
    "cosh": "\\cosh({fe})",
    "tanh": "\\tanh({fe})",
    "coth": "\\coth({fe})",
    "sec": "\\sec({fe})",
    "csc": "\\csc({fe})",
}

FUNC_PLACE = "{fe}"

BRK = "\\\\"

CHR_DEFAULT = {
    "ACC_VAL": "\\hat{{{0}}}",
}

POS = {
    "top": "\\overline{{{0}}}",  # not sure
    "bot": "\\underline{{{0}}}",
}

POS_DEFAULT = {
    "BAR_VAL": "\\overline{{{0}}}",
}

SUB = "_{{{0}}}"

SUP = "^{{{0}}}"

F = {
    "bar": "\\frac{{{num}}}{{{den}}}",
    "skw": r"^{{{num}}}/_{{{den}}}",
    "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
    "lin": "{{{num}}}/{{{den}}}",
}
F_DEFAULT = "\\frac{{{num}}}{{{den}}}"

D = "\\left{left}{text}\\right{right}"

D_DEFAULT = {
    "left": "(",
    "right": ")",
    "null": ".",
}

RAD = "\\sqrt[{deg}]{{{text}}}"

RAD_DEFAULT = "\\sqrt{{{text}}}"

ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}"

LIM_FUNC = {
    "lim": "\\lim_{{{lim}}}",
    "max": "\\max_{{{lim}}}",
    "min": "\\min_{{{lim}}}",
}

LIM_TO = ("\\rightarrow", "\\to")

LIM_UPP = "\\overset{{{lim}}}{{{text}}}"

M = "\\begin{{matrix}}{text}\\end{{matrix}}"
class Tag2Method(object):
    """
    Base class that dispatches OMML elements to handler methods by tag name.

    Subclasses provide a ``tag2meth`` mapping from the namespace-stripped tag
    name to a plain function taking ``(self, elm)``.
    """

    def call_method(self, elm, stag=None):
        """
        Run the handler registered for ``elm`` and return its result.

        ``stag`` is the namespace-stripped tag name; when omitted it is derived
        from ``elm.tag``. Returns ``None`` if no handler is registered.
        """
        dispatch = self.tag2meth
        tag_name = elm.tag.replace(OMML_NS, "") if stag is None else stag
        handler = dispatch.get(tag_name)
        if not handler:
            return None
        return handler(self, elm)

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields ``(stag, converted, child)`` for each child in the OMML
        namespace that produced a non-``None`` result. ``include``, when
        truthy, restricts processing to the listed tag names.
        """
        for child in list(elm):
            if OMML_NS not in child.tag:
                continue
            short_tag = child.tag.replace(OMML_NS, "")
            if include and short_tag not in include:
                continue
            converted = self.call_method(child, stag=short_tag)
            if converted is None:
                # No registered handler (or it produced nothing): give the
                # subclass hook a chance before dropping the child.
                converted = self.process_unknow(child, short_tag)
            if converted is not None:
                yield (short_tag, converted, child)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        Keys are tag names; a later child with the same tag overwrites an
        earlier one.
        """
        return {
            stag: converted
            for stag, converted, _child in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string
        """
        pieces = []
        for _stag, converted, _child in self.process_children_list(elm, include):
            if isinstance(converted, Tag2Method):
                # Nested converter objects are rendered through ``str()``.
                pieces.append(str(converted))
            else:
                pieces.append(converted)
        return BLANK.join(pieces)

    def process_unknow(self, elm, stag):
        """Hook for tags without a handler; the base implementation ignores them."""
        return None
def do_fname(self, elm):
    """
    the func name

    Builds the LaTeX template for a function name (``fName``) element.
    Text runs (``r``) must name a function listed in ``FUNC``; any other
    child is appended as already converted. The returned string always
    contains ``FUNC_PLACE`` so that ``do_func`` can substitute the argument.

    Raises:
        NotImplementedError: if a text run names a function that is not in
            ``FUNC``.
    """
    latex_chars = []
    for stag, t, e in self.process_children_list(elm):
        if stag == "r":
            if FUNC.get(t):
                latex_chars.append(FUNC[t])
            else:
                # ``NotImplemented`` is a non-callable constant; calling it
                # raised an unrelated TypeError instead of this message.
                raise NotImplementedError("Not support func %s" % t)
        else:
            latex_chars.append(t)
    t = BLANK.join(latex_chars)
    return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this
def do_r(self, elm):
    """
    Get text from 'r' element,And try convert them to latex symbols

    Each character of the run's ``t`` child is looked up in ``self._t_dict``
    (characters without an entry are kept as they are) and the joined result
    is passed through ``escape_latex``. A run without a ``t`` child yields an
    empty string.

    @todo text style support , (sty)
    @todo LaTeX pure-text support (the ``text`` command)
    """
    # findtext() returns None when there is no matching child; iterating None
    # raised TypeError, so fall back to an empty string instead.
    text = elm.findtext("./{0}t".format(OMML_NS), default="")
    _str = [self._t_dict.get(s, s) for s in text]
    return escape_latex(BLANK.join(_str))
def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
    """
    Build the ``w:r`` run that stands in for an "oMath" element.

    Args:
        tag (Tag): A BeautifulSoup Tag object representing the "oMath" element.
        block (bool, optional): When True the LaTeX is delimited by ``$$``
            (block mode); otherwise by a single ``$``. Defaults to False.

    Returns:
        Tag: A ``w:r`` Tag containing one ``w:t`` Tag whose string is the
        delimited LaTeX.
    """
    text_tag = Tag(name="w:t")
    latex = _convert_omath_to_latex(tag)
    if block:
        text_tag.string = f"$${latex}$$"
    else:
        text_tag.string = f"${latex}$"
    run_tag = Tag(name="w:r")
    run_tag.append(text_tag)
    return run_tag
def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
    """
    Pre-processes a DOCX file with provided steps.

    The DOCX archive is read fully into memory, selected XML parts are
    transformed (currently: OMML equations converted to LaTeX), and every
    entry is written to a new in-memory archive. Nothing is written to disk.

    Pre-processing is best-effort: if transforming a part raises, that part is
    copied through unchanged so the document can still be converted.

    Args:
        input_docx (BinaryIO): A binary input stream representing the DOCX file.

    Returns:
        BinaryIO: A binary output stream, positioned at offset 0, containing
        the processed DOCX file.
    """
    # The archive members that need to be pre-processed.
    pre_process_enable_files = frozenset(
        (
            "word/document.xml",
            "word/footnotes.xml",
            "word/endnotes.xml",
        )
    )
    output_docx = BytesIO()
    with zipfile.ZipFile(input_docx, mode="r") as zip_input:
        archive_comment = zip_input.comment
        files = {name: zip_input.read(name) for name in zip_input.namelist()}

    with zipfile.ZipFile(output_docx, mode="w") as zip_output:
        zip_output.comment = archive_comment
        for name, content in files.items():
            updated_content = content
            if name in pre_process_enable_files:
                try:
                    # In the future, further pre-processing steps can be
                    # chained here.
                    updated_content = _pre_process_math(content)
                except Exception:
                    # Deliberate fallback to the original bytes. Only
                    # ``Exception`` is caught so KeyboardInterrupt and
                    # SystemExit still propagate.
                    updated_content = content
            # Written outside the ``try`` so a member is never emitted twice.
            zip_output.writestr(name, updated_content)
    output_docx.seek(0)
    return output_docx
def test_docx_equations() -> None:
    """Check that equations in a DOCX file appear as LaTeX in the converted text."""
    converter = MarkItDown()
    equations_path = os.path.join(TEST_FILES_DIR, "equations.docx")
    text = converter.convert(equations_path).text_content

    # The inline equation m=1 must appear wrapped in single dollar signs.
    assert "$m=1$" in text, "Inline equation $m=1$ not found"

    # At least one block equation wrapped in double dollar signs must be present.
    block_pattern = re.compile(r"\$\$(.+?)\$\$")
    assert block_pattern.findall(text), "No block equations found in the document."