Switched from the stdlib minidom parser to defusedxml. (#1259)
This commit is contained in:
@@ -29,6 +29,7 @@ dependencies = [
|
||||
"markdownify",
|
||||
"magika~=0.6.1",
|
||||
"charset-normalizer",
|
||||
"defusedxml",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -1,6 +1,7 @@
 import os
 import zipfile
-import xml.dom.minidom as minidom
+from defusedxml import minidom
+from xml.dom.minidom import Document
 
 from typing import BinaryIO, Any, Dict, List
 
@@ -128,7 +129,7 @@ class EpubConverter(HtmlConverter):
|
||||
markdown="\n\n".join(markdown_content), title=metadata["title"]
|
||||
)
|
||||
|
||||
def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None:
|
||||
def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
|
||||
"""Convenience function to extract a single occurrence of a tag (e.g., title)."""
|
||||
texts = self._get_all_texts_from_nodes(dom, tag_name)
|
||||
if len(texts) > 0:
|
||||
@@ -136,9 +137,7 @@ class EpubConverter(HtmlConverter):
|
||||
else:
|
||||
return None
|
||||
|
||||
def _get_all_texts_from_nodes(
|
||||
self, dom: minidom.Document, tag_name: str
|
||||
) -> List[str]:
|
||||
def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
|
||||
"""Helper function to extract all occurrences of a tag (e.g., multiple authors)."""
|
||||
texts: List[str] = []
|
||||
for node in dom.getElementsByTagName(tag_name):
|
||||
|
||||
@@ -1,4 +1,5 @@
-from xml.dom import minidom
+from defusedxml import minidom
+from xml.dom.minidom import Document, Element
 from typing import BinaryIO, Any, Union
 from bs4 import BeautifulSoup
 
@@ -97,7 +98,7 @@ class RssConverter(DocumentConverter):
|
||||
else:
|
||||
raise ValueError("Unknown feed type")
|
||||
|
||||
def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult:
|
||||
def _parse_atom_type(self, doc: Document) -> DocumentConverterResult:
|
||||
"""Parse the type of an Atom feed.
|
||||
|
||||
Returns None if the feed type is not recognized or something goes wrong.
|
||||
@@ -129,7 +130,7 @@ class RssConverter(DocumentConverter):
|
||||
title=title,
|
||||
)
|
||||
|
||||
def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult:
|
||||
def _parse_rss_type(self, doc: Document) -> DocumentConverterResult:
|
||||
"""Parse the type of an RSS feed.
|
||||
|
||||
Returns None if the feed type is not recognized or something goes wrong.
|
||||
@@ -176,7 +177,7 @@ class RssConverter(DocumentConverter):
|
||||
return content
|
||||
|
||||
def _get_data_by_tag_name(
|
||||
self, element: minidom.Element, tag_name: str
|
||||
self, element: Element, tag_name: str
|
||||
) -> Union[str, None]:
|
||||
"""Get data from first child element with the given tag name.
|
||||
Returns None when no such element is found.
|
||||
|
||||
Reference in New Issue
Block a user