Add Github Polling and Issues (#424)

Yuhong Sun
2023-09-09 23:11:00 -07:00
committed by GitHub
parent 4a0c2bf866
commit f126dfdbd0
2 changed files with 113 additions and 30 deletions

backend/danswer/connectors/github/connector.py

@@ -1,8 +1,11 @@
 import itertools
-from collections.abc import Generator
+from collections.abc import Iterator
+from datetime import datetime
 from typing import Any
+from typing import cast

 from github import Github
+from github.Issue import Issue
 from github.PaginatedList import PaginatedList
 from github.PullRequest import PullRequest
@@ -10,6 +13,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
+from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
@@ -19,10 +23,10 @@ from danswer.utils.logger import setup_logger
 logger = setup_logger()


-def get_pr_batches(
-    pull_requests: PaginatedList, batch_size: int
-) -> Generator[list[PullRequest], None, None]:
-    it = iter(pull_requests)
+def _batch_github_objects(
+    git_objs: PaginatedList, batch_size: int
+) -> Iterator[list[PullRequest | Issue]]:
+    it = iter(git_objs)
     while True:
         batch = list(itertools.islice(it, batch_size))
         if not batch:
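A note on the batching pattern above: itertools.islice repeatedly drains up to batch_size items from one shared iterator, so PyGithub's PaginatedList is consumed lazily, a page at a time, rather than materialized up front. A minimal self-contained sketch of the same idea, with range standing in for the paginated API results:

import itertools
from collections.abc import Iterator

def batch_objects(objs, batch_size: int) -> Iterator[list]:
    it = iter(objs)  # one shared iterator; islice resumes where it left off
    while True:
        batch = list(itertools.islice(it, batch_size))
        if not batch:  # iterator exhausted
            break
        yield batch

print(list(batch_objects(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]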
@@ -30,6 +34,40 @@ def get_pr_batches(
         yield batch


+def _convert_pr_to_document(pull_request: PullRequest) -> Document:
+    full_context = f"Pull-Request {pull_request.title}\n{pull_request.body}"
+    return Document(
+        id=pull_request.html_url,
+        sections=[Section(link=pull_request.html_url, text=full_context)],
+        source=DocumentSource.GITHUB,
+        semantic_identifier=pull_request.title,
+        metadata={
+            "last_modified": pull_request.last_modified,
+            "merged": pull_request.merged,
+            "state": pull_request.state,
+        },
+    )
+
+
+def _fetch_issue_comments(issue: Issue) -> str:
+    comments = issue.get_comments()
+    return "\nComment: ".join(comment.body for comment in comments)
+
+
+def _convert_issue_to_document(issue: Issue) -> Document:
+    full_context = f"Issue {issue.title}\n{issue.body}"
+    return Document(
+        id=issue.html_url,
+        sections=[Section(link=issue.html_url, text=full_context)],
+        source=DocumentSource.GITHUB,
+        semantic_identifier=issue.title,
+        metadata={
+            "last_modified": issue.updated_at,
+            "state": issue.state,
+        },
+    )
+
+
 class GithubConnector(LoadConnector):
     def __init__(
         self,
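Two things worth flagging in the helpers above. First, _fetch_issue_comments is defined but not yet called by _convert_issue_to_document in this diff, so issue comments are not actually indexed yet. Second, str.join only places the separator between items, so the first comment body gets no "Comment:" label:

comments = ["First comment", "Second comment", "Third comment"]
print("\nComment: ".join(comments))
# First comment
# Comment: Second comment
# Comment: Third comment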
@@ -37,40 +75,85 @@ class GithubConnector(LoadConnector):
         repo_name: str,
         batch_size: int = INDEX_BATCH_SIZE,
         state_filter: str = "all",
+        include_prs: bool = True,
+        include_issues: bool = False,
     ) -> None:
         self.repo_owner = repo_owner
         self.repo_name = repo_name
         self.batch_size = batch_size
         self.state_filter = state_filter
+        self.include_prs = include_prs
+        self.include_issues = include_issues
         self.github_client: Github | None = None

     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
         self.github_client = Github(credentials["github_access_token"])
         return None

-    def load_from_state(self) -> GenerateDocumentsOutput:
+    def _fetch_from_github(
+        self, start: datetime | None = None, end: datetime | None = None
+    ) -> GenerateDocumentsOutput:
         if self.github_client is None:
             raise ConnectorMissingCredentialError("GitHub")

-        repo = self.github_client.get_repo(f"{self.repo_owner}/{self.repo_name}")
-        pull_requests = repo.get_pulls(state=self.state_filter)
-        for pr_batch in get_pr_batches(pull_requests, self.batch_size):
-            doc_batch = []
-            for pull_request in pr_batch:
-                full_context = f"Pull-Request {pull_request.title} {pull_request.body}"
-                doc_batch.append(
-                    Document(
-                        id=pull_request.html_url,
-                        sections=[
-                            Section(link=pull_request.html_url, text=full_context)
-                        ],
-                        source=DocumentSource.GITHUB,
-                        semantic_identifier=pull_request.title,
-                        metadata={
-                            "last_modified": pull_request.last_modified,
-                            "merged": pull_request.merged,
-                            "state": pull_request.state,
-                        },
-                    )
-                )
-            yield doc_batch
+        repo = self.github_client.get_repo(f"{self.repo_owner}/{self.repo_name}")
+
+        if self.include_prs:
+            pull_requests = repo.get_pulls(
+                state=self.state_filter, sort="updated", direction="desc"
+            )
+            for pr_batch in _batch_github_objects(pull_requests, self.batch_size):
+                doc_batch: list[Document] = []
+                for pr in pr_batch:
+                    if start is not None and pr.updated_at < start:
+                        yield doc_batch
+                        return
+                    if end is not None and pr.updated_at > end:
+                        continue
+                    doc_batch.append(_convert_pr_to_document(cast(PullRequest, pr)))
+                yield doc_batch
+
+        if self.include_issues:
+            issues = repo.get_issues(
+                state=self.state_filter, sort="updated", direction="desc"
+            )
+            for issue_batch in _batch_github_objects(issues, self.batch_size):
+                doc_batch = []
+                for issue in issue_batch:
+                    issue = cast(Issue, issue)
+                    if start is not None and issue.updated_at < start:
+                        yield doc_batch
+                        return
+                    if end is not None and issue.updated_at > end:
+                        continue
+                    if issue.pull_request is not None:
+                        # PRs are handled separately
+                        continue
+                    doc_batch.append(_convert_issue_to_document(issue))
+                yield doc_batch
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        return self._fetch_from_github()
+
+    def poll_source(
+        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
+    ) -> GenerateDocumentsOutput:
+        start_datetime = datetime.fromtimestamp(start)
+        end_datetime = datetime.fromtimestamp(end)
+        return self._fetch_from_github(start_datetime, end_datetime)
+
+
+if __name__ == "__main__":
+    import os
+
+    connector = GithubConnector(
+        repo_owner=os.environ["REPO_OWNER"],
+        repo_name=os.environ["REPO_NAME"],
+    )
+    connector.load_credentials(
+        {"github_access_token": os.environ["GITHUB_ACCESS_TOKEN"]}
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
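Since both queries are sorted by updated in descending order, _fetch_from_github can stop paging (yield the partial batch, then return) as soon as it meets an object older than start; that early exit is what makes polling cheap. A sketch of how a scheduler might drive poll_source over a one-day window, reusing the connector from the __main__ block above (the window arithmetic is illustrative, not part of this commit):

import time

end = time.time()
start = end - 24 * 60 * 60  # poll the last day

for doc_batch in connector.poll_source(start, end):
    for doc in doc_batch:
        print(doc.semantic_identifier, doc.id)

One caveat: datetime.fromtimestamp produces a naive local-time datetime, while PyGithub's updated_at values are naive UTC, so on a host not running in UTC the comparison window may be shifted by the local offset.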

backend/danswer/connectors/interfaces.py

@@ -1,5 +1,5 @@
 import abc
-from collections.abc import Generator
+from collections.abc import Iterator
 from typing import Any

 from danswer.connectors.models import Document
@@ -7,7 +7,7 @@ from danswer.connectors.models import Document

 SecondsSinceUnixEpoch = float

-GenerateDocumentsOutput = Generator[list[Document], None, None]
+GenerateDocumentsOutput = Iterator[list[Document]]


 class BaseConnector(abc.ABC):
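The widened alias is what lets the connector above delegate: Generator[list[Document], None, None] also pins down send and return types that no caller uses, while Iterator[list[Document]] promises iteration only, so a connector method may either yield batches itself or simply return an iterator built elsewhere, as load_from_state and poll_source do via _fetch_from_github. A small sketch of both styles satisfying one alias (names here are illustrative):

from collections.abc import Iterator

GenerateBatches = Iterator[list[str]]  # stand-in for GenerateDocumentsOutput

def yields_batches() -> GenerateBatches:
    # a generator function still satisfies the alias
    yield ["doc-a", "doc-b"]

def delegates() -> GenerateBatches:
    # so does a plain function that just returns an iterator
    return yields_batches()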