Merge branch 'main' into main

This commit is contained in:
afourney
2024-12-17 15:37:28 -08:00
committed by GitHub
10 changed files with 265 additions and 5 deletions

View File

@@ -39,6 +39,7 @@ dependencies = [
"SpeechRecognition", "SpeechRecognition",
"pathvalidate", "pathvalidate",
"charset-normalizer", "charset-normalizer",
"openai",
] ]
[project.urls] [project.urls]
@@ -77,3 +78,6 @@ exclude_lines = [
"if __name__ == .__main__.:", "if __name__ == .__main__.:",
"if TYPE_CHECKING:", "if TYPE_CHECKING:",
] ]
[tool.hatch.build.targets.sdist]
only-include = ["src/markitdown"]

View File

@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com> # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
# #
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
__version__ = "0.0.1a1" __version__ = "0.0.1a3"

View File

@@ -13,9 +13,10 @@ import sys
import tempfile import tempfile
import traceback import traceback
import zipfile import zipfile
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import catch_warnings from warnings import warn, resetwarnings, catch_warnings
import mammoth import mammoth
import markdownify import markdownify
@@ -44,6 +45,8 @@ try:
IS_AUDIO_TRANSCRIPTION_CAPABLE = True IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError: except ModuleNotFoundError:
pass pass
finally:
resetwarnings()
# Optional YouTube transcription support # Optional YouTube transcription support
try: try:
@@ -220,6 +223,143 @@ class HtmlConverter(DocumentConverter):
) )
class RSSConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not RSS type
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".xml", ".rss", ".atom"]:
return None
try:
doc = minidom.parse(local_path)
except BaseException as _:
return None
result = None
if doc.getElementsByTagName("rss"):
# A RSS feed must have a root element of <rss>
result = self._parse_rss_type(doc)
elif doc.getElementsByTagName("feed"):
root = doc.getElementsByTagName("feed")[0]
if root.getElementsByTagName("entry"):
# An Atom feed must have a root element of <feed> and at least one <entry>
result = self._parse_atom_type(doc)
else:
return None
else:
# not rss or atom
return None
return result
def _parse_atom_type(
self, doc: minidom.Document
) -> Union[None, DocumentConverterResult]:
"""Parse the type of an Atom feed.
Returns None if the feed type is not recognized or something goes wrong.
"""
try:
root = doc.getElementsByTagName("feed")[0]
title = self._get_data_by_tag_name(root, "title")
subtitle = self._get_data_by_tag_name(root, "subtitle")
entries = root.getElementsByTagName("entry")
md_text = f"# {title}\n"
if subtitle:
md_text += f"{subtitle}\n"
for entry in entries:
entry_title = self._get_data_by_tag_name(entry, "title")
entry_summary = self._get_data_by_tag_name(entry, "summary")
entry_updated = self._get_data_by_tag_name(entry, "updated")
entry_content = self._get_data_by_tag_name(entry, "content")
if entry_title:
md_text += f"\n## {entry_title}\n"
if entry_updated:
md_text += f"Updated on: {entry_updated}\n"
if entry_summary:
md_text += self._parse_content(entry_summary)
if entry_content:
md_text += self._parse_content(entry_content)
return DocumentConverterResult(
title=title,
text_content=md_text,
)
except BaseException as _:
return None
def _parse_rss_type(
self, doc: minidom.Document
) -> Union[None, DocumentConverterResult]:
"""Parse the type of an RSS feed.
Returns None if the feed type is not recognized or something goes wrong.
"""
try:
root = doc.getElementsByTagName("rss")[0]
channel = root.getElementsByTagName("channel")
if not channel:
return None
channel = channel[0]
channel_title = self._get_data_by_tag_name(channel, "title")
channel_description = self._get_data_by_tag_name(channel, "description")
items = channel.getElementsByTagName("item")
if channel_title:
md_text = f"# {channel_title}\n"
if channel_description:
md_text += f"{channel_description}\n"
if not items:
items = []
for item in items:
title = self._get_data_by_tag_name(item, "title")
description = self._get_data_by_tag_name(item, "description")
pubDate = self._get_data_by_tag_name(item, "pubDate")
content = self._get_data_by_tag_name(item, "content:encoded")
if title:
md_text += f"\n## {title}\n"
if pubDate:
md_text += f"Published on: {pubDate}\n"
if description:
md_text += self._parse_content(description)
if content:
md_text += self._parse_content(content)
return DocumentConverterResult(
title=channel_title,
text_content=md_text,
)
except BaseException as _:
print(traceback.format_exc())
return None
def _parse_content(self, content: str) -> str:
"""Parse the content of an RSS feed item"""
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
return _CustomMarkdownify().convert_soup(soup)
except BaseException as _:
return content
def _get_data_by_tag_name(
self, element: minidom.Element, tag_name: str
) -> Union[str, None]:
"""Get data from first child element with the given tag name.
Returns None when no such element is found.
"""
nodes = element.getElementsByTagName(tag_name)
if not nodes:
return None
fc = nodes[0].firstChild
if fc:
return fc.data
return None
class WikipediaConverter(DocumentConverter): class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content.""" """Handle Wikipedia pages separately, focusing only on the main document content."""
@@ -908,8 +1048,6 @@ class ImageConverter(MediaConverter):
if prompt is None or prompt.strip() == "": if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image." prompt = "Write a detailed caption for this image."
sys.stderr.write(f"llm Prompt:\n{prompt}\n")
data_uri = "" data_uri = ""
with open(local_path, "rb") as image_file: with open(local_path, "rb") as image_file:
content_type, encoding = mimetypes.guess_type("_dummy" + extension) content_type, encoding = mimetypes.guess_type("_dummy" + extension)
@@ -1071,14 +1209,46 @@ class MarkItDown:
self, self,
requests_session: Optional[requests.Session] = None, requests_session: Optional[requests.Session] = None,
llm_client: Optional[Any] = None, llm_client: Optional[Any] = None,
llm_model: Optional[Any] = None, llm_model: Optional[str] = None,
style_map: Optional[str] = None, style_map: Optional[str] = None,
# Deprecated
mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None,
): ):
if requests_session is None: if requests_session is None:
self._requests_session = requests.Session() self._requests_session = requests.Session()
else: else:
self._requests_session = requests_session self._requests_session = requests_session
# Handle deprecation notices
#############################
if mlm_client is not None:
if llm_client is None:
warn(
"'mlm_client' is deprecated, and was renamed 'llm_client'.",
DeprecationWarning,
)
llm_client = mlm_client
mlm_client = None
else:
raise ValueError(
"'mlm_client' is deprecated, and was renamed 'llm_client'. Do not use both at the same time. Just use 'llm_client' instead."
)
if mlm_model is not None:
if llm_model is None:
warn(
"'mlm_model' is deprecated, and was renamed 'llm_model'.",
DeprecationWarning,
)
llm_model = mlm_model
mlm_model = None
else:
raise ValueError(
"'mlm_model' is deprecated, and was renamed 'llm_model'. Do not use both at the same time. Just use 'llm_model' instead."
)
#############################
self._llm_client = llm_client self._llm_client = llm_client
self._llm_model = llm_model self._llm_model = llm_model
self._style_map = style_map self._style_map = style_map
@@ -1090,6 +1260,7 @@ class MarkItDown:
# To this end, the most specific converters should appear below the most generic converters # To this end, the most specific converters should appear below the most generic converters
self.register_page_converter(PlainTextConverter()) self.register_page_converter(PlainTextConverter())
self.register_page_converter(HtmlConverter()) self.register_page_converter(HtmlConverter())
self.register_page_converter(RSSConverter())
self.register_page_converter(WikipediaConverter()) self.register_page_converter(WikipediaConverter())
self.register_page_converter(YouTubeConverter()) self.register_page_converter(YouTubeConverter())
self.register_page_converter(BingSerpConverter()) self.register_page_converter(BingSerpConverter())

0
tests/test_files/test.docx vendored Executable file → Normal file
View File

0
tests/test_files/test.jpg vendored Executable file → Normal file
View File

Before

Width:  |  Height:  |  Size: 463 KiB

After

Width:  |  Height:  |  Size: 463 KiB

0
tests/test_files/test.xlsx vendored Executable file → Normal file
View File

BIN
tests/test_files/test_llm.jpg vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 145 KiB

1
tests/test_files/test_rss.xml vendored Normal file

File diff suppressed because one or more lines are too long

0
tests/test_files/test_with_comment.docx vendored Executable file → Normal file
View File

View File

@@ -6,11 +6,23 @@ import shutil
import pytest import pytest
import requests import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown from markitdown import MarkItDown
skip_remote = ( skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False True if os.environ.get("GITHUB_ACTIONS") else False
) # Don't run these tests in CI ) # Don't run these tests in CI
# Don't run the llm tests without a key and the client library
skip_llm = False if os.environ.get("OPENAI_API_KEY") else True
try:
import openai
except ModuleNotFoundError:
skip_llm = True
# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None skip_exiftool = shutil.which("exiftool") is None
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -78,6 +90,13 @@ BLOG_TEST_STRINGS = [
"an example where high cost can easily prevent a generic complex", "an example where high cost can easily prevent a generic complex",
] ]
RSS_TEST_STRINGS = [
"The Official Microsoft Blog",
"In the case of AI, it is absolutely true that the industry is moving incredibly fast",
]
WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft" WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TEST_STRINGS = [ WIKIPEDIA_TEST_STRINGS = [
"Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
@@ -107,6 +126,10 @@ CSV_CP932_TEST_STRINGS = [
"髙橋淳,35,名古屋", "髙橋淳,35,名古屋",
] ]
LLM_TEST_STRINGS = [
"5bda1dd6",
]
@pytest.mark.skipif( @pytest.mark.skipif(
skip_remote, skip_remote,
@@ -208,6 +231,12 @@ def test_markitdown_local() -> None:
for test_string in SERP_TEST_STRINGS: for test_string in SERP_TEST_STRINGS:
assert test_string in text_content assert test_string in text_content
# Test RSS processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
text_content = result.text_content.replace("\\", "")
for test_string in RSS_TEST_STRINGS:
assert test_string in text_content
## Test non-UTF-8 encoding ## Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv")) result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
text_content = result.text_content.replace("\\", "") text_content = result.text_content.replace("\\", "")
@@ -229,8 +258,63 @@ def test_markitdown_exiftool() -> None:
assert target in result.text_content assert target in result.text_content
def test_markitdown_deprecation() -> None:
try:
with catch_warnings(record=True) as w:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client)
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_client == test_client
finally:
resetwarnings()
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown(mlm_model="gpt-4o")
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert markitdown._llm_model == "gpt-4o"
finally:
resetwarnings()
try:
test_client = object()
markitdown = MarkItDown(mlm_client=test_client, llm_client=test_client)
assert False
except ValueError:
pass
try:
markitdown = MarkItDown(mlm_model="gpt-4o", llm_model="gpt-4o")
assert False
except ValueError:
pass
@pytest.mark.skipif(
skip_llm,
reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
client = openai.OpenAI()
markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
for test_string in LLM_TEST_STRINGS:
assert test_string in result.text_content
# This is not super precise. It would also accept "red square", "blue circle",
# "the square is not blue", etc. But it's sufficient for this test.
for test_string in ["red", "circle", "blue", "square"]:
assert test_string in result.text_content.lower()
if __name__ == "__main__": if __name__ == "__main__":
"""Runs this file's tests from the command line.""" """Runs this file's tests from the command line."""
test_markitdown_remote() test_markitdown_remote()
test_markitdown_local() test_markitdown_local()
test_markitdown_exiftool() test_markitdown_exiftool()
test_markitdown_deprecation()
test_markitdown_llm()