Compare commits
11 Commits
v0.0.1a2
...
gagb/add-g
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0c25a086e7 | ||
|
|
70ab149ff1 | ||
|
|
8a30fca732 | ||
|
|
0b6554738c | ||
|
|
f1274dca87 | ||
|
|
778fca3f70 | ||
|
|
7979eecfef | ||
|
|
3b88696777 | ||
|
|
8f16f32d53 | ||
|
|
28af7ad341 | ||
|
|
9d047103d5 |
@@ -16,11 +16,10 @@ authors = [
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
]
|
||||
@@ -39,6 +38,7 @@ dependencies = [
|
||||
"youtube-transcript-api",
|
||||
"SpeechRecognition",
|
||||
"pathvalidate",
|
||||
"pygithub"
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
|
||||
@@ -44,6 +44,14 @@ try:
|
||||
except ModuleNotFoundError:
|
||||
pass
|
||||
|
||||
# Optional GitHub issue support
|
||||
try:
|
||||
from github import Github
|
||||
|
||||
IS_GITHUB_ISSUE_CAPABLE = True
|
||||
except ModuleNotFoundError:
|
||||
IS_GITHUB_ISSUE_CAPABLE = False
|
||||
|
||||
|
||||
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
||||
"""
|
||||
@@ -837,6 +845,128 @@ class ImageConverter(MediaConverter):
|
||||
return response.choices[0].message.content
|
||||
|
||||
|
||||
class GitHubIssueConverter(DocumentConverter):
    """Converts GitHub issues and pull requests to Markdown.

    URLs are expected to have the path shape
    ``/<owner>/<repo>/issues/<number>`` or ``/<owner>/<repo>/pull/<number>``.
    Content is fetched through PyGithub using the supplied token.
    """

    def convert(self, github_url, github_token) -> Union[None, DocumentConverterResult]:
        """
        Convert a GitHub issue or pull request URL to a markdown document.

        Args:
            github_url (str): The URL of the issue or pull request.
            github_token (str): A GitHub token with access to the repository.

        Returns:
            Union[None, DocumentConverterResult]: The converted document, or
            None when the URL is empty or is not an issue / pull request URL.

        Raises:
            ValueError: If the URL is valid but no GitHub token was supplied.
            ImportError: If the PyGithub library is not installed.
        """
        # Bail if not a valid GitHub issue or pull request URL. An empty URL
        # must bail here too; otherwise path_parts would be read while unbound.
        if not github_url:
            return None

        path_parts = self._split_path(github_url)
        if (
            len(path_parts) < 4
            or path_parts[2] not in ("issues", "pull")
            # e.g. ".../issues/new" is not an item URL and would break int()
            or not path_parts[3].isdecimal()
        ):
            return None

        if not github_token:
            raise ValueError(
                "GitHub token is not set. Cannot convert GitHub issue or pull request."
            )

        if path_parts[2] == "issues":
            return self._convert_github_issue(github_url, github_token)
        return self._convert_github_pr(github_url, github_token)

    @staticmethod
    def _split_path(url: str) -> list:
        """Return the non-empty-trimmed path segments of a URL."""
        return urlparse(url).path.strip("/").split("/")

    @staticmethod
    def _require_pygithub() -> None:
        """Raise ImportError when the optional PyGithub import failed."""
        if not IS_GITHUB_ISSUE_CAPABLE:
            raise ImportError(
                "PyGithub is not installed. Please install it to use this feature."
            )

    def _locate(self, url: str, kind: str, label: str):
        """Parse a URL into (owner, repo_name, number), validating its kind.

        Args:
            url (str): The GitHub URL to parse.
            kind (str): Expected third path segment ("issues" or "pull").
            label (str): Human-readable item name used in the error message.

        Raises:
            ValueError: If the URL does not match the expected shape.
        """
        path_parts = self._split_path(url)
        if (
            len(path_parts) < 4
            or path_parts[2] != kind
            or not path_parts[3].isdecimal()
        ):
            raise ValueError(f"Invalid GitHub {label} URL")
        owner, repo_name, _, number = path_parts[:4]
        return owner, repo_name, int(number)

    @staticmethod
    def _render(item, comments) -> DocumentConverterResult:
        """Render an issue-like object and its comments as markdown.

        `item` must expose title, body, state, created_at and updated_at;
        each comment must expose user.login, created_at and body.
        """
        # The API reports an empty description as None; render it as empty
        # text rather than the literal string "None".
        pieces = [
            f"# {item.title}\n\n{item.body or ''}\n\n",
            f"**State:** {item.state}\n",
            f"**Created at:** {item.created_at}\n",
            f"**Updated at:** {item.updated_at}\n",
            "**Comments:**\n",
        ]
        pieces.extend(
            f"- {comment.user.login} ({comment.created_at}): {comment.body or ''}\n"
            for comment in comments
        )
        return DocumentConverterResult(
            title=item.title,
            text_content="".join(pieces),
        )

    def _convert_github_issue(
        self, issue_url: str, github_token: str
    ) -> DocumentConverterResult:
        """
        Convert a GitHub issue to a markdown document.

        Args:
            issue_url (str): The URL of the GitHub issue to convert.
            github_token (str): A GitHub token with access to the repository.

        Returns:
            DocumentConverterResult: The result containing the issue title and markdown content.

        Raises:
            ImportError: If the PyGithub library is not installed.
            ValueError: If the provided URL is not a valid GitHub issue URL.
        """
        self._require_pygithub()
        owner, repo_name, issue_number = self._locate(issue_url, "issues", "issue")

        # Authenticate with GitHub
        client = Github(github_token)
        repository = client.get_repo(f"{owner}/{repo_name}")
        issue = repository.get_issue(issue_number)

        return self._render(issue, issue.get_comments())

    def _convert_github_pr(
        self, pr_url: str, github_token: str
    ) -> DocumentConverterResult:
        """
        Convert a GitHub pull request to a markdown document.

        Args:
            pr_url (str): The URL of the GitHub pull request to convert.
            github_token (str): A GitHub token with access to the repository.

        Returns:
            DocumentConverterResult: The result containing the pull request title and markdown content.

        Raises:
            ImportError: If the PyGithub library is not installed.
            ValueError: If the provided URL is not a valid GitHub pull request URL.
        """
        self._require_pygithub()
        owner, repo_name, pr_number = self._locate(pr_url, "pull", "pull request")

        # Authenticate with GitHub
        client = Github(github_token)
        repository = client.get_repo(f"{owner}/{repo_name}")
        pr = repository.get_pull(pr_number)

        # Conversation comments on a PR are exposed as issue comments.
        return self._render(pr, pr.get_issue_comments())
|
||||
|
||||
|
||||
class FileConversionException(Exception):
    """Error type for file conversion failures.

    Derives from Exception rather than BaseException so that ordinary
    ``except Exception`` handlers see it and it is not grouped with
    interpreter-exit signals such as KeyboardInterrupt and SystemExit.
    """
|
||||
|
||||
@@ -889,7 +1019,6 @@ class MarkItDown:
|
||||
- source: can be a string representing a path or url, or a requests.response object
|
||||
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
||||
"""
|
||||
|
||||
# Local path or url
|
||||
if isinstance(source, str):
|
||||
if (
|
||||
@@ -904,6 +1033,28 @@ class MarkItDown:
|
||||
elif isinstance(source, requests.Response):
|
||||
return self.convert_response(source, **kwargs)
|
||||
|
||||
def convert_url(
    self, url: str, **kwargs: Any
) -> DocumentConverterResult:  # TODO: fix kwargs type
    """Convert the document at `url` to markdown.

    GitHub issue and pull request URLs
    (``https://github.com/<owner>/<repo>/(issues|pull)/<number>``) are
    handed to GitHubIssueConverter; every other URL is fetched over HTTP
    and passed to convert_response.

    Args:
        url: The URL to convert.
        **kwargs: Forwarded to convert_response. ``github_token`` is read
            for GitHub URLs, falling back to the GITHUB_TOKEN environment
            variable when it is missing or empty.

    Raises:
        ValueError: If the URL is a GitHub issue / pull request URL and no
            token is available.
    """
    # Handle GitHub issue and pull request URLs directly. Match on the
    # position of the path segment, not a substring, so that unrelated
    # GitHub URLs whose path merely contains "/issues/" or "/pull/" are
    # still fetched over HTTP.
    parsed_url = urlparse(url)
    path_parts = parsed_url.path.strip("/").split("/")
    if (
        parsed_url.hostname == "github.com"
        and len(path_parts) >= 4
        and path_parts[2] in ("issues", "pull")
    ):
        # `or` (not a .get default) so an explicit github_token=None still
        # falls back to the environment.
        github_token = kwargs.get("github_token") or os.getenv("GITHUB_TOKEN")
        if not github_token:
            raise ValueError(
                "GitHub token is required for GitHub issue or pull request conversion."
            )
        result = GitHubIssueConverter().convert(
            github_url=url, github_token=github_token
        )
        # The converter returns None when it declines the URL; fall through
        # to the generic HTTP path instead of returning None to the caller.
        if result is not None:
            return result

    # Send a HTTP request to the URL
    response = self._requests_session.get(url, stream=True)
    response.raise_for_status()
    return self.convert_response(response, **kwargs)
|
||||
|
||||
def convert_local(
|
||||
self, path: str, **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: deal with kwargs
|
||||
@@ -958,14 +1109,6 @@ class MarkItDown:
|
||||
|
||||
return result
|
||||
|
||||
def convert_url(
|
||||
self, url: str, **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO: fix kwargs type
|
||||
# Send a HTTP request to the URL
|
||||
response = self._requests_session.get(url, stream=True)
|
||||
response.raise_for_status()
|
||||
return self.convert_response(response, **kwargs)
|
||||
|
||||
def convert_response(
|
||||
self, response: requests.Response, **kwargs: Any
|
||||
) -> DocumentConverterResult: # TODO fix kwargs type
|
||||
|
||||
@@ -87,6 +87,10 @@ SERP_TEST_EXCLUDES = [
|
||||
"data:image/svg+xml,%3Csvg%20width%3D",
|
||||
]
|
||||
|
||||
GITHUB_ISSUE_URL = "https://github.com/microsoft/autogen/issues/1421"
|
||||
GITHUB_PR_URL = "https://github.com/microsoft/autogen/pull/194"
|
||||
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_remote,
|
||||
@@ -179,8 +183,34 @@ def test_markitdown_exiftool() -> None:
|
||||
assert target in result.text_content
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not GITHUB_TOKEN,
    reason="GitHub token not provided",
)
def test_markitdown_github_issue() -> None:
    """Convert a live GitHub issue and check fragments of the markdown."""
    converted = MarkItDown().convert(GITHUB_ISSUE_URL, github_token=GITHUB_TOKEN)
    text = converted.text_content
    print(text)
    for expected in ("User-Defined Functions", "closed", "Comments:"):
        assert expected in text
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not GITHUB_TOKEN,
    reason="GitHub token not provided",
)
def test_markitdown_github_pr() -> None:
    """Convert a live GitHub pull request and check the markdown."""
    converted = MarkItDown().convert(GITHUB_PR_URL, github_token=GITHUB_TOKEN)
    text = converted.text_content
    print(text)
    assert "faq" in text
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Runs this file's tests from the command line.
    test_markitdown_remote()
    test_markitdown_local()
    test_markitdown_exiftool()
    # Calling the functions directly bypasses pytest's skipif markers, so
    # apply the same token guard here; without a token the conversion
    # raises ValueError.
    if GITHUB_TOKEN:
        test_markitdown_github_issue()
        test_markitdown_github_pr()
|
||||
|
||||
Reference in New Issue
Block a user