From 5d58a5e3ea7dbb3dc73f56e7b2734d0a0b162be7 Mon Sep 17 00:00:00 2001 From: Weves Date: Mon, 24 Feb 2025 18:12:01 -0800 Subject: [PATCH] Add ability to index all of Github --- backend/onyx/connectors/github/connector.py | 137 ++++++++++++------ .../admin/connectors/ConnectorTitle.tsx | 4 +- web/src/lib/connectors/connectors.tsx | 45 ++++-- 3 files changed, 129 insertions(+), 57 deletions(-) diff --git a/backend/onyx/connectors/github/connector.py b/backend/onyx/connectors/github/connector.py index 531fc36a1..e486eeb26 100644 --- a/backend/onyx/connectors/github/connector.py +++ b/backend/onyx/connectors/github/connector.py @@ -124,7 +124,7 @@ class GithubConnector(LoadConnector, PollConnector): def __init__( self, repo_owner: str, - repo_name: str, + repo_name: str | None = None, batch_size: int = INDEX_BATCH_SIZE, state_filter: str = "all", include_prs: bool = True, @@ -162,53 +162,81 @@ class GithubConnector(LoadConnector, PollConnector): _sleep_after_rate_limit_exception(github_client) return self._get_github_repo(github_client, attempt_num + 1) + def _get_all_repos( + self, github_client: Github, attempt_num: int = 0 + ) -> list[Repository.Repository]: + if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: + raise RuntimeError( + "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github" + ) + + try: + # Try to get organization first + try: + org = github_client.get_organization(self.repo_owner) + return list(org.get_repos()) + except GithubException: + # If not an org, try as a user + user = github_client.get_user(self.repo_owner) + return list(user.get_repos()) + except RateLimitExceededException: + _sleep_after_rate_limit_exception(github_client) + return self._get_all_repos(github_client, attempt_num + 1) + def _fetch_from_github( self, start: datetime | None = None, end: datetime | None = None ) -> GenerateDocumentsOutput: if self.github_client is None: raise ConnectorMissingCredentialError("GitHub") - repo = self._get_github_repo(self.github_client) + repos = ( + [self._get_github_repo(self.github_client)] + if self.repo_name + else self._get_all_repos(self.github_client) + ) - if self.include_prs: - pull_requests = repo.get_pulls( - state=self.state_filter, sort="updated", direction="desc" - ) + for repo in repos: + if self.include_prs: + logger.info(f"Fetching PRs for repo: {repo.name}") + pull_requests = repo.get_pulls( + state=self.state_filter, sort="updated", direction="desc" + ) - for pr_batch in _batch_github_objects( - pull_requests, self.github_client, self.batch_size - ): - doc_batch: list[Document] = [] - for pr in pr_batch: - if start is not None and pr.updated_at < start: - yield doc_batch - return - if end is not None and pr.updated_at > end: - continue - doc_batch.append(_convert_pr_to_document(cast(PullRequest, pr))) - yield doc_batch + for pr_batch in _batch_github_objects( + pull_requests, self.github_client, self.batch_size + ): + doc_batch: list[Document] = [] + for pr in pr_batch: + if start is not None and pr.updated_at < start: + yield doc_batch + break + if end is not None and pr.updated_at > end: + continue + doc_batch.append(_convert_pr_to_document(cast(PullRequest, pr))) + yield doc_batch - if self.include_issues: - issues = repo.get_issues( - state=self.state_filter, sort="updated", direction="desc" - ) + if self.include_issues: + logger.info(f"Fetching issues for repo: {repo.name}") + issues = repo.get_issues( + state=self.state_filter, sort="updated", direction="desc" + ) - for issue_batch in _batch_github_objects( - issues, self.github_client, self.batch_size - ): - doc_batch = [] - for issue in issue_batch: - issue = cast(Issue, issue) - if start is not None and issue.updated_at < start: - yield doc_batch - return - if end is not None and issue.updated_at > end: - continue - if issue.pull_request is not None: - # PRs are handled separately - continue - doc_batch.append(_convert_issue_to_document(issue)) - yield doc_batch + for issue_batch in _batch_github_objects( + issues, self.github_client, self.batch_size + ): + doc_batch = [] + for issue in issue_batch: + issue = cast(Issue, issue) + if start is not None and issue.updated_at < start: + yield doc_batch + break + if end is not None and issue.updated_at > end: + continue + if issue.pull_request is not None: + # PRs are handled separately + continue + doc_batch.append(_convert_issue_to_document(issue)) + yield doc_batch def load_from_state(self) -> GenerateDocumentsOutput: return self._fetch_from_github() @@ -234,16 +262,26 @@ class GithubConnector(LoadConnector, PollConnector): if self.github_client is None: raise ConnectorMissingCredentialError("GitHub credentials not loaded.") - if not self.repo_owner or not self.repo_name: + if not self.repo_owner: raise ConnectorValidationError( - "Invalid connector settings: 'repo_owner' and 'repo_name' must be provided." + "Invalid connector settings: 'repo_owner' must be provided." ) try: - test_repo = self.github_client.get_repo( - f"{self.repo_owner}/{self.repo_name}" - ) - test_repo.get_contents("") + if self.repo_name: + test_repo = self.github_client.get_repo( + f"{self.repo_owner}/{self.repo_name}" + ) + test_repo.get_contents("") + else: + # Try to get organization first + try: + org = self.github_client.get_organization(self.repo_owner) + org.get_repos().totalCount # Just check if we can access repos + except GithubException: + # If not an org, try as a user + user = self.github_client.get_user(self.repo_owner) + user.get_repos().totalCount # Just check if we can access repos except RateLimitExceededException: raise UnexpectedError( @@ -260,9 +298,14 @@ class GithubConnector(LoadConnector, PollConnector): "Your GitHub token does not have sufficient permissions for this repository (HTTP 403)." ) elif e.status == 404: - raise ConnectorValidationError( - f"GitHub repository not found with name: {self.repo_owner}/{self.repo_name}" - ) + if self.repo_name: + raise ConnectorValidationError( + f"GitHub repository not found with name: {self.repo_owner}/{self.repo_name}" + ) + else: + raise ConnectorValidationError( + f"GitHub user or organization not found: {self.repo_owner}" + ) else: raise ConnectorValidationError( f"Unexpected GitHub error (status={e.status}): {e.data}" diff --git a/web/src/components/admin/connectors/ConnectorTitle.tsx b/web/src/components/admin/connectors/ConnectorTitle.tsx index d75518722..2b947644a 100644 --- a/web/src/components/admin/connectors/ConnectorTitle.tsx +++ b/web/src/components/admin/connectors/ConnectorTitle.tsx @@ -38,7 +38,9 @@ export const ConnectorTitle = ({ const typedConnector = connector as Connector; additionalMetadata.set( "Repo", - `${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}` + typedConnector.connector_specific_config.repo_name + ? `${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}` + : `${typedConnector.connector_specific_config.repo_owner}/*` ); } else if (connector.source === "gitlab") { const typedConnector = connector as Connector; diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx index da4dd54a3..37cb373b9 100644 --- a/web/src/lib/connectors/connectors.tsx +++ b/web/src/lib/connectors/connectors.tsx @@ -170,32 +170,59 @@ export const connectorConfigs: Record< values: [ { type: "text", - query: "Enter the repository owner:", + query: "Enter the GitHub username or organization:", label: "Repository Owner", name: "repo_owner", optional: false, }, { - type: "text", - query: "Enter the repository name:", - label: "Repository Name", - name: "repo_name", - optional: false, + type: "tab", + name: "github_mode", + label: "What should we index from GitHub?", + optional: true, + tabs: [ + { + value: "repo", + label: "Specific Repository", + fields: [ + { + type: "text", + query: "Enter the repository name:", + label: "Repository Name", + name: "repo_name", + optional: false, + }, + ], + }, + { + value: "everything", + label: "Everything", + fields: [ + { + type: "string_tab", + label: "Everything", + name: "everything", + description: + "This connector will index all repositories the provided credentials have access to!", + }, + ], + }, + ], }, { type: "checkbox", query: "Include pull requests?", label: "Include pull requests?", - description: "Index pull requests from this repository", + description: "Index pull requests from repositories", name: "include_prs", optional: true, }, { type: "checkbox", query: "Include issues?", - label: "Include Issues", + label: "Include Issues?", name: "include_issues", - description: "Index issues from this repository", + description: "Index issues from repositories", optional: true, }, ],