Switched from the stdlib minidom parser to defusedxml. (#1259)

This commit is contained in:
afourney
2025-05-21 09:47:14 -07:00
committed by GitHub
parent 041be54471
commit bbcf876b18
3 changed files with 10 additions and 9 deletions

View File

@@ -29,6 +29,7 @@ dependencies = [
"markdownify", "markdownify",
"magika~=0.6.1", "magika~=0.6.1",
"charset-normalizer", "charset-normalizer",
"defusedxml",
] ]
[project.optional-dependencies] [project.optional-dependencies]

View File

@@ -1,6 +1,7 @@
import os import os
import zipfile import zipfile
import xml.dom.minidom as minidom from defusedxml import minidom
from xml.dom.minidom import Document
from typing import BinaryIO, Any, Dict, List from typing import BinaryIO, Any, Dict, List
@@ -128,7 +129,7 @@ class EpubConverter(HtmlConverter):
markdown="\n\n".join(markdown_content), title=metadata["title"] markdown="\n\n".join(markdown_content), title=metadata["title"]
) )
def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None: def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
"""Convenience function to extract a single occurrence of a tag (e.g., title).""" """Convenience function to extract a single occurrence of a tag (e.g., title)."""
texts = self._get_all_texts_from_nodes(dom, tag_name) texts = self._get_all_texts_from_nodes(dom, tag_name)
if len(texts) > 0: if len(texts) > 0:
@@ -136,9 +137,7 @@ class EpubConverter(HtmlConverter):
else: else:
return None return None
def _get_all_texts_from_nodes( def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
self, dom: minidom.Document, tag_name: str
) -> List[str]:
"""Helper function to extract all occurrences of a tag (e.g., multiple authors).""" """Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
texts: List[str] = [] texts: List[str] = []
for node in dom.getElementsByTagName(tag_name): for node in dom.getElementsByTagName(tag_name):

View File

@@ -1,4 +1,5 @@
from xml.dom import minidom from defusedxml import minidom
from xml.dom.minidom import Document, Element
from typing import BinaryIO, Any, Union from typing import BinaryIO, Any, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -97,7 +98,7 @@ class RssConverter(DocumentConverter):
else: else:
raise ValueError("Unknown feed type") raise ValueError("Unknown feed type")
def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult: def _parse_atom_type(self, doc: Document) -> DocumentConverterResult:
"""Parse the type of an Atom feed. """Parse the type of an Atom feed.
Returns None if the feed type is not recognized or something goes wrong. Returns None if the feed type is not recognized or something goes wrong.
@@ -129,7 +130,7 @@ class RssConverter(DocumentConverter):
title=title, title=title,
) )
def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult: def _parse_rss_type(self, doc: Document) -> DocumentConverterResult:
"""Parse the type of an RSS feed. """Parse the type of an RSS feed.
Returns None if the feed type is not recognized or something goes wrong. Returns None if the feed type is not recognized or something goes wrong.
@@ -176,7 +177,7 @@ class RssConverter(DocumentConverter):
return content return content
def _get_data_by_tag_name( def _get_data_by_tag_name(
self, element: minidom.Element, tag_name: str self, element: Element, tag_name: str
) -> Union[str, None]: ) -> Union[str, None]:
"""Get data from first child element with the given tag name. """Get data from first child element with the given tag name.
Returns None when no such element is found. Returns None when no such element is found.