From 0b610502e0f362d15affc6fc022fc0af35598c47 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Thu, 11 May 2023 18:47:32 -0700
Subject: [PATCH] DAN-54 Github PR Connector (#29)

also fixed some mypy stuff as well
---
 backend/danswer/background/update.py          |  2 +-
 backend/danswer/configs/app_configs.py        |  1 +
 backend/danswer/configs/constants.py          |  1 +
 backend/danswer/connectors/github/__init__.py |  0
 backend/danswer/connectors/github/batch.py    | 61 +++++++++++++++++++
 .../semantic_search/semantic_search.py        |  2 +-
 backend/danswer/server/admin.py               |  5 --
 backend/requirements/default.txt              |  1 +
 backend/scripts/ingestion.py                  | 30 ++++++++-
 9 files changed, 93 insertions(+), 10 deletions(-)
 create mode 100644 backend/danswer/connectors/github/__init__.py
 create mode 100644 backend/danswer/connectors/github/batch.py

diff --git a/backend/danswer/background/update.py b/backend/danswer/background/update.py
index f275f9f86..3a0ecd9a7 100755
--- a/backend/danswer/background/update.py
+++ b/backend/danswer/background/update.py
@@ -23,7 +23,7 @@ def _check_should_run(current_time: int, last_pull: int, pull_frequency: int) ->
     return current_time - last_pull > pull_frequency * 60
 
 
-async def run_update():
+async def run_update() -> None:
     logger.info("Running update")
     # TODO (chris): implement a more generic way to run updates
     # so we don't need to edit this file for future connectors
diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index 29aa2df99..df2151a29 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -35,6 +35,7 @@ GOOGLE_DRIVE_CREDENTIAL_JSON = os.environ.get("GOOGLE_DRIVE_CREDENTIAL_JSON", ""
 GOOGLE_DRIVE_TOKENS_JSON = os.environ.get("GOOGLE_DRIVE_TOKENS_JSON", "")
 GOOGLE_DRIVE_INCLUDE_SHARED = False
 
+GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "")
 
 #####
 # Query Configs
diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index dbcb76fe6..61882be59 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -17,3 +17,4 @@ class DocumentSource(str, Enum):
     SLACK = "slack"
     WEB = "web"
     GOOGLE_DRIVE = "google_drive"
+    GITHUB = "github"
diff --git a/backend/danswer/connectors/github/__init__.py b/backend/danswer/connectors/github/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/danswer/connectors/github/batch.py b/backend/danswer/connectors/github/batch.py
new file mode 100644
index 000000000..77a6fd2c2
--- /dev/null
+++ b/backend/danswer/connectors/github/batch.py
@@ -0,0 +1,61 @@
+import itertools
+from collections.abc import Generator
+
+from danswer.configs.app_configs import GITHUB_ACCESS_TOKEN
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+from danswer.connectors.type_aliases import BatchLoader
+from danswer.utils.logging import setup_logger
+from github import Github
+
+logger = setup_logger()
+
+github_client = Github(GITHUB_ACCESS_TOKEN)
+
+
+def get_pr_batches(pull_requests, batch_size):
+    it = iter(pull_requests)
+    while True:
+        batch = list(itertools.islice(it, batch_size))
+        if not batch:
+            break
+        yield batch
+
+
+class BatchGithubLoader(BatchLoader):
+    def __init__(
+        self,
+        repo_owner: str,
+        repo_name: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+        state_filter: str = "all",
+    ) -> None:
+        self.repo_owner = repo_owner
+        self.repo_name = repo_name
+        self.batch_size = batch_size
+        self.state_filter = state_filter
+
+    def load(self) -> Generator[list[Document], None, None]:
+        repo = github_client.get_repo(f"{self.repo_owner}/{self.repo_name}")
+        pull_requests = repo.get_pulls(state=self.state_filter)
+        for pr_batch in get_pr_batches(pull_requests, self.batch_size):
+            doc_batch = []
+            for pull_request in pr_batch:
+                full_context = f"Pull-Request {pull_request.title} {pull_request.body}"
+                doc_batch.append(
+                    Document(
+                        id=pull_request.url,
+                        sections=[Section(link=pull_request.url, text=full_context)],
+                        source=DocumentSource.GITHUB,
+                        semantic_identifier=pull_request.title,
+                        metadata={
+                            "last_modified": pull_request.last_modified,
+                            "merged": pull_request.merged,
+                            "state": pull_request.state,
+                        },
+                    )
+                )
+
+            yield doc_batch
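The batching generator above is the core pattern of the new connector: PyGithub's `get_pulls` returns a lazily-paginated sequence, and `get_pr_batches` slices it into fixed-size lists with `itertools.islice` so only one batch's worth of pull requests is materialized at a time. A minimal standalone sketch of the same pattern (`batch_iterable` is an illustrative name, not part of the patch):

```python
import itertools
from collections.abc import Generator, Iterable
from typing import TypeVar

T = TypeVar("T")


def batch_iterable(items: Iterable[T], batch_size: int) -> Generator[list[T], None, None]:
    # Pull batch_size items at a time off a single shared iterator; with a
    # lazy source such as PyGithub's paginated results, only one batch is
    # ever held in memory.
    it = iter(items)
    while True:
        batch = list(itertools.islice(it, batch_size))
        if not batch:
            break
        yield batch


for batch in batch_iterable(range(7), 3):
    print(batch)  # [0, 1, 2] then [3, 4, 5] then [6]
```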
diff --git a/backend/danswer/semantic_search/semantic_search.py b/backend/danswer/semantic_search/semantic_search.py
index aa6cc90af..134986b3e 100644
--- a/backend/danswer/semantic_search/semantic_search.py
+++ b/backend/danswer/semantic_search/semantic_search.py
@@ -107,7 +107,7 @@ def retrieve_ranked_documents(
         return None
     ranked_chunks = semantic_reranking(query, top_chunks, filtered_result_set_size)
 
-    top_docs = [ranked_chunk.document_id for ranked_chunk in ranked_chunks]
+    top_docs = [ranked_chunk.source_links["0"] for ranked_chunk in ranked_chunks]
     files_log_msg = f"Top links from semantic search: {', '.join(top_docs)}"
     logger.info(files_log_msg)
 
diff --git a/backend/danswer/server/admin.py b/backend/danswer/server/admin.py
index 91f433220..e0eec4774 100644
--- a/backend/danswer/server/admin.py
+++ b/backend/danswer/server/admin.py
@@ -34,11 +34,6 @@ def modify_slack_config(slack_config: SlackConfig):
     update_slack_config(slack_config)
 
 
-@router.post("/connectors/slack/auth")
-def modify_slack_config(slack_config: SlackConfig):
-    update_slack_config(slack_config)
-
-
 class WebIndexAttemptRequest(BaseModel):
     url: str
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
index dd7adc91d..e3417e5db 100644
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -15,6 +15,7 @@ Mako==1.2.4
 openai==0.27.6
 playwright==1.32.1
 pydantic==1.10.7
+PyGithub==1.58.2
 PyPDF2==3.0.1
 pytest-playwright==0.3.2
 qdrant-client==1.1.0
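The new `PyGithub==1.58.2` pin is what provides the `Github` client used in `batch.py`. A minimal sketch of the PyGithub calls the connector depends on (the repository name below is a placeholder; `GITHUB_ACCESS_TOKEN` is read from the environment, mirroring `app_configs.py` above):

```python
import os

from github import Github

client = Github(os.environ.get("GITHUB_ACCESS_TOKEN", ""))
repo = client.get_repo("danswer-ai/danswer")

# get_pulls returns a lazily-paginated list, so iterating it triggers API
# calls page by page rather than loading every pull request up front.
for pull_request in repo.get_pulls(state="all"):
    print(pull_request.title, pull_request.state, pull_request.merged)
    break  # inspect just the first result
```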
diff --git a/backend/scripts/ingestion.py b/backend/scripts/ingestion.py
index 25b35dae9..a875a328c 100644
--- a/backend/scripts/ingestion.py
+++ b/backend/scripts/ingestion.py
@@ -5,6 +5,7 @@ from danswer.chunking.chunk import Chunker
 from danswer.chunking.chunk import DefaultChunker
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
+from danswer.connectors.github.batch import BatchGithubLoader
 from danswer.connectors.google_drive.batch import BatchGoogleDriveLoader
 from danswer.connectors.slack.batch import BatchSlackLoader
 from danswer.connectors.type_aliases import BatchLoader
@@ -78,9 +79,23 @@ def load_google_drive_batch(qdrant_collection: str):
     )
 
 
+def load_github_batch(owner: str, repo: str, qdrant_collection: str):
+    logger.info("Loading documents from Github.")
+    load_batch(
+        BatchGithubLoader(
+            repo_owner=owner, repo_name=repo, batch_size=INDEX_BATCH_SIZE
+        ),
+        DefaultChunker(),
+        DefaultEmbedder(),
+        QdrantDatastore(collection=qdrant_collection),
+    )
+
+
 class BatchLoadingArgs(argparse.Namespace):
-    slack_export_dir: str
     website_url: str
+    github_owner: str
+    github_repo: str
+    slack_export_dir: str
     qdrant_collection: str
     rebuild_index: bool
@@ -91,6 +106,14 @@ if __name__ == "__main__":
         "--website-url",
         default="https://docs.github.com/en/actions",
     )
+    parser.add_argument(
+        "--github-owner",
+        default="danswer-ai",
+    )
+    parser.add_argument(
+        "--github-repo",
+        default="danswer",
+    )
     parser.add_argument(
         "--slack-export-dir",
         default="/Users/chrisweaver/Downloads/test-slack-export",
@@ -109,6 +132,7 @@ if __name__ == "__main__":
     if args.rebuild_index:
         recreate_collection(args.qdrant_collection)
 
-    # load_slack_batch(args.slack_export_dir, args.qdrant_collection)
+    load_slack_batch(args.slack_export_dir, args.qdrant_collection)
     load_web_batch(args.website_url, args.qdrant_collection)
-    # load_google_drive_batch(args.qdrant_collection)
+    load_google_drive_batch(args.qdrant_collection)
+    load_github_batch(args.github_owner, args.github_repo, args.qdrant_collection)
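With the patch applied, a one-off GitHub ingestion can be driven through the script's new flags. A hypothetical driver for reference (the flag names and the danswer-ai/danswer defaults come from the argparse additions above; the `scripts.ingestion` import path is an assumption about how backend/ sits on sys.path):

```python
# Equivalent to running from the command line:
#   GITHUB_ACCESS_TOKEN=<token> python backend/scripts/ingestion.py \
#       --github-owner danswer-ai --github-repo danswer
# The token must be exported before import so app_configs.GITHUB_ACCESS_TOKEN
# picks it up at module load time.
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
from scripts.ingestion import load_github_batch

load_github_batch(
    owner="danswer-ai",
    repo="danswer",
    qdrant_collection=QDRANT_DEFAULT_COLLECTION,
)
```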