From 0b610502e0f362d15affc6fc022fc0af35598c47 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Thu, 11 May 2023 18:47:32 -0700
Subject: [PATCH] DAN-54 Github PR Connector (#29)

also fixed some mypy stuff as well
---
 backend/danswer/background/update.py          |  2 +-
 backend/danswer/configs/app_configs.py        |  1 +
 backend/danswer/configs/constants.py          |  1 +
 backend/danswer/connectors/github/__init__.py |  0
 backend/danswer/connectors/github/batch.py    | 61 +++++++++++++++++++
 .../semantic_search/semantic_search.py        |  2 +-
 backend/danswer/server/admin.py               |  5 --
 backend/requirements/default.txt              |  1 +
 backend/scripts/ingestion.py                  | 30 ++++++++-
 9 files changed, 93 insertions(+), 10 deletions(-)
 create mode 100644 backend/danswer/connectors/github/__init__.py
 create mode 100644 backend/danswer/connectors/github/batch.py

diff --git a/backend/danswer/background/update.py b/backend/danswer/background/update.py
index f275f9f86..3a0ecd9a7 100755
--- a/backend/danswer/background/update.py
+++ b/backend/danswer/background/update.py
@@ -23,7 +23,7 @@ def _check_should_run(current_time: int, last_pull: int, pull_frequency: int) ->
     return current_time - last_pull > pull_frequency * 60
 
 
-async def run_update():
+async def run_update() -> None:
     logger.info("Running update")
     # TODO (chris): implement a more generic way to run updates
     # so we don't need to edit this file for future connectors
diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index 29aa2df99..df2151a29 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -35,6 +35,7 @@ GOOGLE_DRIVE_CREDENTIAL_JSON = os.environ.get("GOOGLE_DRIVE_CREDENTIAL_JSON", ""
 GOOGLE_DRIVE_TOKENS_JSON = os.environ.get("GOOGLE_DRIVE_TOKENS_JSON", "")
 GOOGLE_DRIVE_INCLUDE_SHARED = False
 
+GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "")
 
 #####
 # Query Configs
diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index dbcb76fe6..61882be59 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -17,3 +17,4 @@ class DocumentSource(str, Enum):
     SLACK = "slack"
     WEB = "web"
     GOOGLE_DRIVE = "google_drive"
+    GITHUB = "github"
diff --git a/backend/danswer/connectors/github/__init__.py b/backend/danswer/connectors/github/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/danswer/connectors/github/batch.py b/backend/danswer/connectors/github/batch.py
new file mode 100644
index 000000000..77a6fd2c2
--- /dev/null
+++ b/backend/danswer/connectors/github/batch.py
@@ -0,0 +1,61 @@
+import itertools
+from collections.abc import Generator
+
+from danswer.configs.app_configs import GITHUB_ACCESS_TOKEN
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+from danswer.connectors.type_aliases import BatchLoader
+from danswer.utils.logging import setup_logger
+from github import Github
+
+logger = setup_logger()
+
+github_client = Github(GITHUB_ACCESS_TOKEN)
+
+
+def get_pr_batches(pull_requests, batch_size):
+    it = iter(pull_requests)
+    while True:
+        batch = list(itertools.islice(it, batch_size))
+        if not batch:
+            break
+        yield batch
+
+
+class BatchGithubLoader(BatchLoader):
+    def __init__(
+        self,
+        repo_owner: str,
+        repo_name: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+        state_filter: str = "all",
+    ) -> None:
+        self.repo_owner = repo_owner
+        self.repo_name = repo_name
+        self.batch_size = batch_size
+        self.state_filter = state_filter
+
+    def load(self) -> Generator[list[Document], None, None]:
+        repo = github_client.get_repo(f"{self.repo_owner}/{self.repo_name}")
+        pull_requests = repo.get_pulls(state=self.state_filter)
+        for pr_batch in get_pr_batches(pull_requests, self.batch_size):
+            doc_batch = []
+            for pull_request in pr_batch:
+                full_context = f"Pull-Request {pull_request.title} {pull_request.body}"
+                doc_batch.append(
+                    Document(
+                        id=pull_request.url,
+                        sections=[Section(link=pull_request.url, text=full_context)],
+                        source=DocumentSource.GITHUB,
+                        semantic_identifier=pull_request.title,
+                        metadata={
+                            "last_modified": pull_request.last_modified,
+                            "merged": pull_request.merged,
+                            "state": pull_request.state,
+                        },
+                    )
+                )
+
+            yield doc_batch
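The batching generator above is the core pattern of the new connector: PyGithub's `get_pulls` returns a lazily-paginated sequence, and `get_pr_batches` slices it into fixed-size lists with `itertools.islice` so only one batch's worth of pull requests is materialized at a time. A minimal standalone sketch of the same pattern (`batch_iterable` is an illustrative name, not part of the patch):

```python
import itertools
from collections.abc import Generator, Iterable
from typing import TypeVar

T = TypeVar("T")


def batch_iterable(items: Iterable[T], batch_size: int) -> Generator[list[T], None, None]:
    # Pull batch_size items at a time off a single shared iterator; with a
    # lazy source such as PyGithub's paginated results, only one batch is
    # ever held in memory.
    it = iter(items)
    while True:
        batch = list(itertools.islice(it, batch_size))
        if not batch:
            break
        yield batch


for batch in batch_iterable(range(7), 3):
    print(batch)  # [0, 1, 2] then [3, 4, 5] then [6]
```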
diff --git a/backend/danswer/semantic_search/semantic_search.py b/backend/danswer/semantic_search/semantic_search.py
index aa6cc90af..134986b3e 100644
--- a/backend/danswer/semantic_search/semantic_search.py
+++ b/backend/danswer/semantic_search/semantic_search.py
@@ -107,7 +107,7 @@ def retrieve_ranked_documents(
         return None
     ranked_chunks = semantic_reranking(query, top_chunks, filtered_result_set_size)
 
-    top_docs = [ranked_chunk.document_id for ranked_chunk in ranked_chunks]
+    top_docs = [ranked_chunk.source_links["0"] for ranked_chunk in ranked_chunks]
     files_log_msg = f"Top links from semantic search: {', '.join(top_docs)}"
     logger.info(files_log_msg)
 
diff --git a/backend/danswer/server/admin.py b/backend/danswer/server/admin.py
index 91f433220..e0eec4774 100644
--- a/backend/danswer/server/admin.py
+++ b/backend/danswer/server/admin.py
@@ -34,11 +34,6 @@ def modify_slack_config(slack_config: SlackConfig):
     update_slack_config(slack_config)
 
 
-@router.post("/connectors/slack/auth")
-def modify_slack_config(slack_config: SlackConfig):
-    update_slack_config(slack_config)
-
-
 class WebIndexAttemptRequest(BaseModel):
     url: str
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
index dd7adc91d..e3417e5db 100644
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -15,6 +15,7 @@ Mako==1.2.4
 openai==0.27.6
 playwright==1.32.1
 pydantic==1.10.7
+PyGithub==1.58.2
 PyPDF2==3.0.1
 pytest-playwright==0.3.2
 qdrant-client==1.1.0
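The new `PyGithub==1.58.2` pin is what provides the `Github` client used in `batch.py`. A minimal sketch of the PyGithub calls the connector depends on (the repository name below is a placeholder; `GITHUB_ACCESS_TOKEN` is read from the environment, mirroring `app_configs.py` above):

```python
import os

from github import Github

client = Github(os.environ.get("GITHUB_ACCESS_TOKEN", ""))
repo = client.get_repo("danswer-ai/danswer")

# get_pulls returns a lazily-paginated list, so iterating it triggers API
# calls page by page rather than loading every pull request up front.
for pull_request in repo.get_pulls(state="all"):
    print(pull_request.title, pull_request.state, pull_request.merged)
    break  # inspect just the first result
```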
diff --git a/backend/scripts/ingestion.py b/backend/scripts/ingestion.py
index 25b35dae9..a875a328c 100644
--- a/backend/scripts/ingestion.py
+++ b/backend/scripts/ingestion.py
@@ -5,6 +5,7 @@ from danswer.chunking.chunk import Chunker
 from danswer.chunking.chunk import DefaultChunker
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
+from danswer.connectors.github.batch import BatchGithubLoader
 from danswer.connectors.google_drive.batch import BatchGoogleDriveLoader
 from danswer.connectors.slack.batch import BatchSlackLoader
 from danswer.connectors.type_aliases import BatchLoader
@@ -78,9 +79,23 @@ def load_google_drive_batch(qdrant_collection: str):
     )
 
 
+def load_github_batch(owner: str, repo: str, qdrant_collection: str):
+    logger.info("Loading documents from Github.")
+    load_batch(
+        BatchGithubLoader(
+            repo_owner=owner, repo_name=repo, batch_size=INDEX_BATCH_SIZE
+        ),
+        DefaultChunker(),
+        DefaultEmbedder(),
+        QdrantDatastore(collection=qdrant_collection),
+    )
+
+
 class BatchLoadingArgs(argparse.Namespace):
-    slack_export_dir: str
     website_url: str
+    github_owner: str
+    github_repo: str
+    slack_export_dir: str
     qdrant_collection: str
     rebuild_index: bool
@@ -91,6 +106,14 @@ if __name__ == "__main__":
         "--website-url",
         default="https://docs.github.com/en/actions",
     )
+    parser.add_argument(
+        "--github-owner",
+        default="danswer-ai",
+    )
+    parser.add_argument(
+        "--github-repo",
+        default="danswer",
+    )
     parser.add_argument(
         "--slack-export-dir",
         default="/Users/chrisweaver/Downloads/test-slack-export",
@@ -109,6 +132,7 @@ if __name__ == "__main__":
     if args.rebuild_index:
         recreate_collection(args.qdrant_collection)
 
-    # load_slack_batch(args.slack_export_dir, args.qdrant_collection)
+    load_slack_batch(args.slack_export_dir, args.qdrant_collection)
     load_web_batch(args.website_url, args.qdrant_collection)
-    # load_google_drive_batch(args.qdrant_collection)
+    load_google_drive_batch(args.qdrant_collection)
+    load_github_batch(args.github_owner, args.github_repo, args.qdrant_collection)
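With the patch applied, a one-off GitHub ingestion can be driven through the script's new flags. A hypothetical driver for reference (the flag names and the danswer-ai/danswer defaults come from the argparse additions above; the `scripts.ingestion` import path is an assumption about how backend/ sits on sys.path):

```python
# Equivalent to running from the command line:
#   GITHUB_ACCESS_TOKEN=<token> python backend/scripts/ingestion.py \
#       --github-owner danswer-ai --github-repo danswer
# The token must be exported before import so app_configs.GITHUB_ACCESS_TOKEN
# picks it up at module load time.
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
from scripts.ingestion import load_github_batch

load_github_batch(
    owner="danswer-ai",
    repo="danswer",
    qdrant_collection=QDRANT_DEFAULT_COLLECTION,
)
```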