Add ability to index all of Github

This commit is contained in:
Weves
2025-02-24 18:12:01 -08:00
committed by Chris Weaver
parent a99dd05533
commit 5d58a5e3ea
3 changed files with 129 additions and 57 deletions

View File

@@ -124,7 +124,7 @@ class GithubConnector(LoadConnector, PollConnector):
def __init__( def __init__(
self, self,
repo_owner: str, repo_owner: str,
repo_name: str, repo_name: str | None = None,
batch_size: int = INDEX_BATCH_SIZE, batch_size: int = INDEX_BATCH_SIZE,
state_filter: str = "all", state_filter: str = "all",
include_prs: bool = True, include_prs: bool = True,
@@ -162,15 +162,42 @@ class GithubConnector(LoadConnector, PollConnector):
_sleep_after_rate_limit_exception(github_client) _sleep_after_rate_limit_exception(github_client)
return self._get_github_repo(github_client, attempt_num + 1) return self._get_github_repo(github_client, attempt_num + 1)
def _get_all_repos(
    self, github_client: Github, attempt_num: int = 0
) -> list[Repository.Repository]:
    """Return every repository listed for ``self.repo_owner``.

    ``self.repo_owner`` is first looked up as an organization. If that
    lookup raises a ``GithubException`` other than a rate-limit error, it
    is looked up as a user instead.

    Args:
        github_client: Client used for all GitHub API calls.
        attempt_num: Number of rate-limit retries already performed.

    Returns:
        The fully materialized list of repositories for the owner.

    Raises:
        RuntimeError: If ``attempt_num`` exceeds
            ``_MAX_NUM_RATE_LIMIT_RETRIES``.
        GithubException: If the user lookup fails as well.
    """
    if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES:
        raise RuntimeError(
            "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github"
        )

    try:
        # Try to get organization first
        try:
            org = github_client.get_organization(self.repo_owner)
            return list(org.get_repos())
        except RateLimitExceededException:
            # RateLimitExceededException is a GithubException subclass, so
            # without this clause a rate limit would be misread as "not an
            # organization" and trigger the user fallback. Re-raise so the
            # outer handler performs the sleep-and-retry instead.
            raise
        except GithubException:
            # If not an org, try as a user
            user = github_client.get_user(self.repo_owner)
            return list(user.get_repos())
    except RateLimitExceededException:
        _sleep_after_rate_limit_exception(github_client)
        return self._get_all_repos(github_client, attempt_num + 1)
def _fetch_from_github( def _fetch_from_github(
self, start: datetime | None = None, end: datetime | None = None self, start: datetime | None = None, end: datetime | None = None
) -> GenerateDocumentsOutput: ) -> GenerateDocumentsOutput:
if self.github_client is None: if self.github_client is None:
raise ConnectorMissingCredentialError("GitHub") raise ConnectorMissingCredentialError("GitHub")
repo = self._get_github_repo(self.github_client) repos = (
[self._get_github_repo(self.github_client)]
if self.repo_name
else self._get_all_repos(self.github_client)
)
for repo in repos:
if self.include_prs: if self.include_prs:
logger.info(f"Fetching PRs for repo: {repo.name}")
pull_requests = repo.get_pulls( pull_requests = repo.get_pulls(
state=self.state_filter, sort="updated", direction="desc" state=self.state_filter, sort="updated", direction="desc"
) )
@@ -182,13 +209,14 @@ class GithubConnector(LoadConnector, PollConnector):
for pr in pr_batch: for pr in pr_batch:
if start is not None and pr.updated_at < start: if start is not None and pr.updated_at < start:
yield doc_batch yield doc_batch
return break
if end is not None and pr.updated_at > end: if end is not None and pr.updated_at > end:
continue continue
doc_batch.append(_convert_pr_to_document(cast(PullRequest, pr))) doc_batch.append(_convert_pr_to_document(cast(PullRequest, pr)))
yield doc_batch yield doc_batch
if self.include_issues: if self.include_issues:
logger.info(f"Fetching issues for repo: {repo.name}")
issues = repo.get_issues( issues = repo.get_issues(
state=self.state_filter, sort="updated", direction="desc" state=self.state_filter, sort="updated", direction="desc"
) )
@@ -201,7 +229,7 @@ class GithubConnector(LoadConnector, PollConnector):
issue = cast(Issue, issue) issue = cast(Issue, issue)
if start is not None and issue.updated_at < start: if start is not None and issue.updated_at < start:
yield doc_batch yield doc_batch
return break
if end is not None and issue.updated_at > end: if end is not None and issue.updated_at > end:
continue continue
if issue.pull_request is not None: if issue.pull_request is not None:
@@ -234,16 +262,26 @@ class GithubConnector(LoadConnector, PollConnector):
if self.github_client is None: if self.github_client is None:
raise ConnectorMissingCredentialError("GitHub credentials not loaded.") raise ConnectorMissingCredentialError("GitHub credentials not loaded.")
if not self.repo_owner or not self.repo_name: if not self.repo_owner:
raise ConnectorValidationError( raise ConnectorValidationError(
"Invalid connector settings: 'repo_owner' and 'repo_name' must be provided." "Invalid connector settings: 'repo_owner' must be provided."
) )
try: try:
if self.repo_name:
test_repo = self.github_client.get_repo( test_repo = self.github_client.get_repo(
f"{self.repo_owner}/{self.repo_name}" f"{self.repo_owner}/{self.repo_name}"
) )
test_repo.get_contents("") test_repo.get_contents("")
else:
# Try to get organization first
try:
org = self.github_client.get_organization(self.repo_owner)
org.get_repos().totalCount # Just check if we can access repos
except GithubException:
# If not an org, try as a user
user = self.github_client.get_user(self.repo_owner)
user.get_repos().totalCount # Just check if we can access repos
except RateLimitExceededException: except RateLimitExceededException:
raise UnexpectedError( raise UnexpectedError(
@@ -260,9 +298,14 @@ class GithubConnector(LoadConnector, PollConnector):
"Your GitHub token does not have sufficient permissions for this repository (HTTP 403)." "Your GitHub token does not have sufficient permissions for this repository (HTTP 403)."
) )
elif e.status == 404: elif e.status == 404:
if self.repo_name:
raise ConnectorValidationError( raise ConnectorValidationError(
f"GitHub repository not found with name: {self.repo_owner}/{self.repo_name}" f"GitHub repository not found with name: {self.repo_owner}/{self.repo_name}"
) )
else:
raise ConnectorValidationError(
f"GitHub user or organization not found: {self.repo_owner}"
)
else: else:
raise ConnectorValidationError( raise ConnectorValidationError(
f"Unexpected GitHub error (status={e.status}): {e.data}" f"Unexpected GitHub error (status={e.status}): {e.data}"

View File

@@ -38,7 +38,9 @@ export const ConnectorTitle = ({
const typedConnector = connector as Connector<GithubConfig>; const typedConnector = connector as Connector<GithubConfig>;
additionalMetadata.set( additionalMetadata.set(
"Repo", "Repo",
`${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}` typedConnector.connector_specific_config.repo_name
? `${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}`
: `${typedConnector.connector_specific_config.repo_owner}/*`
); );
} else if (connector.source === "gitlab") { } else if (connector.source === "gitlab") {
const typedConnector = connector as Connector<GitlabConfig>; const typedConnector = connector as Connector<GitlabConfig>;

View File

@@ -170,11 +170,21 @@ export const connectorConfigs: Record<
values: [ values: [
{ {
type: "text", type: "text",
query: "Enter the repository owner:", query: "Enter the GitHub username or organization:",
label: "Repository Owner", label: "Repository Owner",
name: "repo_owner", name: "repo_owner",
optional: false, optional: false,
}, },
{
type: "tab",
name: "github_mode",
label: "What should we index from GitHub?",
optional: true,
tabs: [
{
value: "repo",
label: "Specific Repository",
fields: [
{ {
type: "text", type: "text",
query: "Enter the repository name:", query: "Enter the repository name:",
@@ -182,20 +192,37 @@ export const connectorConfigs: Record<
name: "repo_name", name: "repo_name",
optional: false, optional: false,
}, },
],
},
{
value: "everything",
label: "Everything",
fields: [
{
type: "string_tab",
label: "Everything",
name: "everything",
description:
"This connector will index all repositories the provided credentials have access to!",
},
],
},
],
},
{ {
type: "checkbox", type: "checkbox",
query: "Include pull requests?", query: "Include pull requests?",
label: "Include pull requests?", label: "Include pull requests?",
description: "Index pull requests from this repository", description: "Index pull requests from repositories",
name: "include_prs", name: "include_prs",
optional: true, optional: true,
}, },
{ {
type: "checkbox", type: "checkbox",
query: "Include issues?", query: "Include issues?",
label: "Include Issues", label: "Include Issues?",
name: "include_issues", name: "include_issues",
description: "Index issues from this repository", description: "Index issues from repositories",
optional: true, optional: true,
}, },
], ],