feat: add tests for RSS converter

This commit is contained in:
Soulter
2024-12-17 22:45:27 +08:00
parent 7dc2695b96
commit 752fbd333c
3 changed files with 16 additions and 2 deletions

View File

@@ -271,7 +271,7 @@ class RSSConverter(DocumentConverter):
entry_content = self._get_data_by_tag_name(entry, "content") entry_content = self._get_data_by_tag_name(entry, "content")
if entry_title: if entry_title:
md_text += f"## {entry_title}\n" md_text += f"\n## {entry_title}\n"
if entry_updated: if entry_updated:
md_text += f"Updated on: {entry_updated}\n" md_text += f"Updated on: {entry_updated}\n"
if entry_summary: if entry_summary:
@@ -313,7 +313,7 @@ class RSSConverter(DocumentConverter):
content = self._get_data_by_tag_name(item, "content:encoded") content = self._get_data_by_tag_name(item, "content:encoded")
if title: if title:
md_text += f"## {title}\n" md_text += f"\n## {title}\n"
if pubDate: if pubDate:
md_text += f"Published on: {pubDate}\n" md_text += f"Published on: {pubDate}\n"
if description: if description:

1
tests/test_files/test_rss.xml vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -78,6 +78,13 @@ BLOG_TEST_STRINGS = [
"an example where high cost can easily prevent a generic complex", "an example where high cost can easily prevent a generic complex",
] ]
RSS_TEST_STRINGS = [
"The Official Microsoft Blog",
"In the case of AI, it is absolutely true that the industry is moving incredibly fast",
]
WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft" WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TEST_STRINGS = [ WIKIPEDIA_TEST_STRINGS = [
"Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
@@ -207,6 +214,12 @@ def test_markitdown_local() -> None:
assert test_string not in text_content assert test_string not in text_content
for test_string in SERP_TEST_STRINGS: for test_string in SERP_TEST_STRINGS:
assert test_string in text_content assert test_string in text_content
# Test RSS processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
text_content = result.text_content.replace("\\", "")
for test_string in RSS_TEST_STRINGS:
assert test_string in text_content
## Test non-UTF-8 encoding ## Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))