From 8a30fca7328179144d7b8de4a66b6480a119a16e Mon Sep 17 00:00:00 2001 From: gagb Date: Fri, 13 Dec 2024 14:57:39 -0800 Subject: [PATCH] Add support for GH prs as well --- src/markitdown/_markitdown.py | 78 +++++++++++++++++++++++++++++------ tests/test_markitdown.py | 13 ++++++ 2 files changed, 79 insertions(+), 12 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 9602300..043d68d 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -846,22 +846,25 @@ class ImageConverter(MediaConverter): class GitHubIssueConverter(DocumentConverter): - """Converts GitHub issues to Markdown.""" + """Converts GitHub issues and pull requests to Markdown.""" - def convert(self, issue_url, github_token) -> Union[None, DocumentConverterResult]: - # Bail if not a valid GitHub issue URL - if issue_url: - parsed_url = urlparse(issue_url) + def convert(self, github_url, github_token) -> Union[None, DocumentConverterResult]: + # Bail if not a valid GitHub issue or pull request URL + if github_url: + parsed_url = urlparse(github_url) path_parts = parsed_url.path.strip("/").split("/") - if len(path_parts) < 4 or path_parts[2] != "issues": + if len(path_parts) < 4 or path_parts[2] not in ["issues", "pull"]: return None if not github_token: raise ValueError( - "GitHub token is not set. Cannot convert GitHub issue." + "GitHub token is not set. Cannot convert GitHub issue or pull request." ) - return self._convert_github_issue(issue_url, github_token) + if path_parts[2] == "issues": + return self._convert_github_issue(github_url, github_token) + elif path_parts[2] == "pull": + return self._convert_github_pr(github_url, github_token) return None @@ -914,6 +917,55 @@ class GitHubIssueConverter(DocumentConverter): text_content=markdown_content, ) + def _convert_github_pr( + self, pr_url: str, github_token: str + ) -> DocumentConverterResult: + """ + Convert a GitHub pull request to a markdown document. + Args: + pr_url (str): The URL of the GitHub pull request to convert. + github_token (str): A GitHub token with access to the repository. + Returns: + DocumentConverterResult: The result containing the pull request title and markdown content. + Raises: + ImportError: If the PyGithub library is not installed. + ValueError: If the provided URL is not a valid GitHub pull request URL. + """ + if not IS_GITHUB_ISSUE_CAPABLE: + raise ImportError( + "PyGithub is not installed. Please install it to use this feature." + ) + + # Parse the pull request URL + parsed_url = urlparse(pr_url) + path_parts = parsed_url.path.strip("/").split("/") + if len(path_parts) < 4 or path_parts[2] != "pull": + raise ValueError("Invalid GitHub pull request URL") + + owner, repo, _, pr_number = path_parts[:4] + + # Authenticate with GitHub + g = Github(github_token) + repo = g.get_repo(f"{owner}/{repo}") + pr = repo.get_pull(int(pr_number)) + + # Convert pull request details to markdown + markdown_content = f"# {pr.title}\n\n{pr.body}\n\n" + markdown_content += f"**State:** {pr.state}\n" + markdown_content += f"**Created at:** {pr.created_at}\n" + markdown_content += f"**Updated at:** {pr.updated_at}\n" + markdown_content += f"**Comments:**\n" + + for comment in pr.get_issue_comments(): + markdown_content += ( + f"- {comment.user.login} ({comment.created_at}): {comment.body}\n" + ) + + return DocumentConverterResult( + title=pr.title, + text_content=markdown_content, + ) + class FileConversionException(BaseException): pass @@ -984,16 +1036,18 @@ class MarkItDown: def convert_url( self, url: str, **kwargs: Any ) -> DocumentConverterResult: # TODO: fix kwargs type - # Handle GitHub issue URLs directly + # Handle GitHub issue and pull request URLs directly parsed_url = urlparse(url) - if parsed_url.hostname == "github.com" and "/issues/" in parsed_url.path: + if parsed_url.hostname == "github.com" and any( + x in parsed_url.path for x in ["/issues/", "/pull/"] + ): github_token = kwargs.get("github_token", os.getenv("GITHUB_TOKEN")) if not github_token: raise ValueError( - "GitHub token is required for GitHub issue conversion." + "GitHub token is required for GitHub issue or pull request conversion." ) return GitHubIssueConverter().convert( - issue_url=url, github_token=github_token + github_url=url, github_token=github_token ) # Send a HTTP request to the URL diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index fa64738..9a35e27 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -88,6 +88,7 @@ SERP_TEST_EXCLUDES = [ ] GITHUB_ISSUE_URL = "https://github.com/microsoft/autogen/issues/1421" +GITHUB_PR_URL = "https://github.com/microsoft/autogen/pull/194" GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "") @@ -195,9 +196,21 @@ def test_markitdown_github_issue() -> None: assert "Comments:" in result.text_content +@pytest.mark.skipif( + not GITHUB_TOKEN, + reason="GitHub token not provided", +) +def test_markitdown_github_pr() -> None: + markitdown = MarkItDown() + result = markitdown.convert(GITHUB_PR_URL, github_token=GITHUB_TOKEN) + print(result.text_content) + assert "faq" in result.text_content + + if __name__ == "__main__": """Runs this file's tests from the command line.""" test_markitdown_remote() test_markitdown_local() test_markitdown_exiftool() test_markitdown_github_issue() + test_markitdown_github_pr()