Switched from the stdlib minidom parser to defusedxml. (#1259)

2025-05-21 09:47:14 -07:00
parent 041be54471
commit bbcf876b18
3 changed files with 10 additions and 9 deletions
--- a/packages/markitdown/pyproject.toml
+++ b/packages/markitdown/pyproject.toml
@@ -29,6 +29,7 @@ dependencies = [
  "markdownify",
  "magika~=0.6.1",
  "charset-normalizer",
+  "defusedxml",
 ]

 [project.optional-dependencies]
--- a/packages/markitdown/src/markitdown/converters/_epub_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py
@@ -1,6 +1,7 @@
 import os
 import zipfile
-import xml.dom.minidom as minidom
+from defusedxml import minidom
+from xml.dom.minidom import Document

 from typing import BinaryIO, Any, Dict, List

@@ -128,7 +129,7 @@ class EpubConverter(HtmlConverter):
                markdown="\n\n".join(markdown_content), title=metadata["title"]
            )

-    def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
+    def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
        """Convenience function to extract a single occurrence of a tag (e.g., title)."""
        texts = self._get_all_texts_from_nodes(dom, tag_name)
        if len(texts) > 0:
@@ -136,9 +137,7 @@ class EpubConverter(HtmlConverter):
        else:
            return None

-    def _get_all_texts_from_nodes(
-        self, dom: minidom.Document, tag_name: str
-    ) -> List[str]:
+    def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
        """Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
        texts: List[str] = []
        for node in dom.getElementsByTagName(tag_name):
--- a/packages/markitdown/src/markitdown/converters/_rss_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@@ -1,4 +1,5 @@
-from xml.dom import minidom
+from defusedxml import minidom
+from xml.dom.minidom import Document, Element
 from typing import BinaryIO, Any, Union
 from bs4 import BeautifulSoup

@@ -97,7 +98,7 @@ class RssConverter(DocumentConverter):
        else:
            raise ValueError("Unknown feed type")

-    def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
+    def _parse_atom_type(self, doc: Document) -> DocumentConverterResult:
        """Parse the type of an Atom feed.

        Returns None if the feed type is not recognized or something goes wrong.
@@ -129,7 +130,7 @@ class RssConverter(DocumentConverter):
            title=title,
        )

-    def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
+    def _parse_rss_type(self, doc: Document) -> DocumentConverterResult:
        """Parse the type of an RSS feed.

        Returns None if the feed type is not recognized or something goes wrong.
@@ -176,7 +177,7 @@ class RssConverter(DocumentConverter):
            return content

    def _get_data_by_tag_name(
-        self, element: minidom.Element, tag_name: str
+        self, element: Element, tag_name: str
    ) -> Union[str, None]:
        """Get data from first child element with the given tag name.
        Returns None when no such element is found.