diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 79f67d2..0fe6b35 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "markdownify", "magika~=0.6.1", "charset-normalizer", + "defusedxml", ] [project.optional-dependencies] diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py index 17d6d29..73f2955 100644 --- a/packages/markitdown/src/markitdown/converters/_epub_converter.py +++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py @@ -1,6 +1,7 @@ import os import zipfile -import xml.dom.minidom as minidom +from defusedxml import minidom +from xml.dom.minidom import Document from typing import BinaryIO, Any, Dict, List @@ -128,7 +129,7 @@ class EpubConverter(HtmlConverter): markdown="\n\n".join(markdown_content), title=metadata["title"] ) - def _get_text_from_node(self, dom: minidom.Document, tag_name: str) -> str | None: + def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None: """Convenience function to extract a single occurrence of a tag (e.g., title).""" texts = self._get_all_texts_from_nodes(dom, tag_name) if len(texts) > 0: @@ -136,9 +137,7 @@ class EpubConverter(HtmlConverter): else: return None - def _get_all_texts_from_nodes( - self, dom: minidom.Document, tag_name: str - ) -> List[str]: + def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]: """Helper function to extract all occurrences of a tag (e.g., multiple authors).""" texts: List[str] = [] for node in dom.getElementsByTagName(tag_name): diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index 6a0e4c1..bec4248 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -1,4 +1,5 @@ -from xml.dom import minidom +from defusedxml import minidom +from xml.dom.minidom import Document, Element from typing import BinaryIO, Any, Union from bs4 import BeautifulSoup @@ -97,7 +98,7 @@ class RssConverter(DocumentConverter): else: raise ValueError("Unknown feed type") - def _parse_atom_type(self, doc: minidom.Document) -> DocumentConverterResult: + def _parse_atom_type(self, doc: Document) -> DocumentConverterResult: """Parse the type of an Atom feed. Returns None if the feed type is not recognized or something goes wrong. @@ -129,7 +130,7 @@ class RssConverter(DocumentConverter): title=title, ) - def _parse_rss_type(self, doc: minidom.Document) -> DocumentConverterResult: + def _parse_rss_type(self, doc: Document) -> DocumentConverterResult: """Parse the type of an RSS feed. Returns None if the feed type is not recognized or something goes wrong. @@ -176,7 +177,7 @@ class RssConverter(DocumentConverter): return content def _get_data_by_tag_name( - self, element: minidom.Element, tag_name: str + self, element: Element, tag_name: str ) -> Union[str, None]: """Get data from first child element with the given tag name. Returns None when no such element is found.