From 8d5f16ecd2e989d57a980e5445594505c4e525a0 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Tue, 17 Dec 2024 15:27:06 -0800 Subject: [PATCH] Fixed formatting. --- src/markitdown/_markitdown.py | 38 +++++++++++++++++++++-------------- tests/test_markitdown.py | 4 ++-- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 491c555..5839cf4 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -217,14 +217,15 @@ class HtmlConverter(DocumentConverter): assert isinstance(webpage_text, str) - return DocumentConverterResult( + return DocumentConverterResult( title=None if soup.title is None else soup.title.string, text_content=webpage_text, ) + class RSSConverter(DocumentConverter): """Convert RSS / Atom type to markdown""" - + def convert( self, local_path: str, **kwargs ) -> Union[None, DocumentConverterResult]: @@ -250,12 +251,14 @@ class RSSConverter(DocumentConverter): else: # not rss or atom return None - + return result - - def _parse_atom_type(self, doc: minidom.Document) -> Union[None, DocumentConverterResult]: + + def _parse_atom_type( + self, doc: minidom.Document + ) -> Union[None, DocumentConverterResult]: """Parse the type of an Atom feed. - + Returns None if the feed type is not recognized or something goes wrong. """ try: @@ -271,7 +274,7 @@ class RSSConverter(DocumentConverter): entry_summary = self._get_data_by_tag_name(entry, "summary") entry_updated = self._get_data_by_tag_name(entry, "updated") entry_content = self._get_data_by_tag_name(entry, "content") - + if entry_title: md_text += f"\n## {entry_title}\n" if entry_updated: @@ -287,10 +290,12 @@ class RSSConverter(DocumentConverter): ) except BaseException as _: return None - - def _parse_rss_type(self, doc: minidom.Document) -> Union[None, DocumentConverterResult]: + + def _parse_rss_type( + self, doc: minidom.Document + ) -> Union[None, DocumentConverterResult]: """Parse the type of an RSS feed. - + Returns None if the feed type is not recognized or something goes wrong. """ try: @@ -313,7 +318,7 @@ class RSSConverter(DocumentConverter): description = self._get_data_by_tag_name(item, "description") pubDate = self._get_data_by_tag_name(item, "pubDate") content = self._get_data_by_tag_name(item, "content:encoded") - + if title: md_text += f"\n## {title}\n" if pubDate: @@ -322,7 +327,7 @@ class RSSConverter(DocumentConverter): md_text += self._parse_content(description) if content: md_text += self._parse_content(content) - + return DocumentConverterResult( title=channel_title, text_content=md_text, @@ -330,7 +335,7 @@ class RSSConverter(DocumentConverter): except BaseException as _: print(traceback.format_exc()) return None - + def _parse_content(self, content: str) -> str: """Parse the content of an RSS feed item""" try: @@ -339,8 +344,10 @@ class RSSConverter(DocumentConverter): return _CustomMarkdownify().convert_soup(soup) except BaseException as _: return content - - def _get_data_by_tag_name(self, element: minidom.Element, tag_name: str) -> Union[str, None]: + + def _get_data_by_tag_name( + self, element: minidom.Element, tag_name: str + ) -> Union[str, None]: """Get data from first child element with the given tag name. Returns None when no such element is found. """ @@ -352,6 +359,7 @@ class RSSConverter(DocumentConverter): return fc.data return None + class WikipediaConverter(DocumentConverter): """Handle Wikipedia pages separately, focusing only on the main document content.""" diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 9ccb39e..316e670 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -95,7 +95,7 @@ RSS_TEST_STRINGS = [ "The Official Microsoft Blog", "In the case of AI, it is absolutely true that the industry is moving incredibly fast", ] - + WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft" WIKIPEDIA_TEST_STRINGS = [ @@ -230,7 +230,7 @@ def test_markitdown_local() -> None: assert test_string not in text_content for test_string in SERP_TEST_STRINGS: assert test_string in text_content - + # Test RSS processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml")) text_content = result.text_content.replace("\\", "")