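Switched XML parsing from the stdlib xml.dom.minidom parser to defusedxml.minidom; the stdlib module is now imported only for the Document/Element type annotations. (#1259)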

This commit is contained in:
afourney
2025-05-21 09:47:14 -07:00
committed by GitHub
parent 041be54471
commit bbcf876b18
3 changed files with 10 additions and 9 deletions

View File

@@ -29,6 +29,7 @@ dependencies = [
"markdownify",
"magika~=0.6.1",
"charset-normalizer",
"defusedxml",
]
[project.optional-dependencies]

View File

@@ -1,6 +1,7 @@
import os
import zipfile
import xml.dom.minidom as minidom
from defusedxml import minidom
from xml.dom.minidom import Document
from typing import BinaryIO, Any, Dict, List
@@ -128,7 +129,7 @@ class EpubConverter(HtmlConverter):
markdown="\n\n".join(markdown_content), title=metadata["title"]
)
def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name)
if len(texts) > 0:
@@ -136,9 +137,7 @@ class EpubConverter(HtmlConverter):
else:
return None
def _get_all_texts_from_nodes(
self, dom: minidom.Document, tag_name: str
) -> List[str]:
def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
"""Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
texts: List[str] = []
for node in dom.getElementsByTagName(tag_name):

View File

@@ -1,4 +1,5 @@
from xml.dom import minidom
from defusedxml import minidom
from xml.dom.minidom import Document, Element
from typing import BinaryIO, Any, Union
from bs4 import BeautifulSoup
@@ -97,7 +98,7 @@ class RssConverter(DocumentConverter):
else:
raise ValueError("Unknown feed type")
def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
def _parse_atom_type(self, doc: Document) -> DocumentConverterResult:
"""Parse the type of an Atom feed.
Returns None if the feed type is not recognized or something goes wrong.
@@ -129,7 +130,7 @@ class RssConverter(DocumentConverter):
title=title,
)
def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
def _parse_rss_type(self, doc: Document) -> DocumentConverterResult:
"""Parse the type of an RSS feed.
Returns None if the feed type is not recognized or something goes wrong.
@@ -176,7 +177,7 @@ class RssConverter(DocumentConverter):
return content
def _get_data_by_tag_name(
self, element: minidom.Element, tag_name: str
self, element: Element, tag_name: str
) -> Union[str, None]:
"""Get data from first child element with the given tag name.
Returns None when no such element is found.