mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-25 04:13:25 +02:00
Add ability to index all of Github
This commit is contained in:
@@ -124,7 +124,7 @@ class GithubConnector(LoadConnector, PollConnector):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
repo_owner: str,
|
repo_owner: str,
|
||||||
repo_name: str,
|
repo_name: str | None = None,
|
||||||
batch_size: int = INDEX_BATCH_SIZE,
|
batch_size: int = INDEX_BATCH_SIZE,
|
||||||
state_filter: str = "all",
|
state_filter: str = "all",
|
||||||
include_prs: bool = True,
|
include_prs: bool = True,
|
||||||
@@ -162,15 +162,42 @@ class GithubConnector(LoadConnector, PollConnector):
|
|||||||
_sleep_after_rate_limit_exception(github_client)
|
_sleep_after_rate_limit_exception(github_client)
|
||||||
return self._get_github_repo(github_client, attempt_num + 1)
|
return self._get_github_repo(github_client, attempt_num + 1)
|
||||||
|
|
||||||
|
def _get_all_repos(
    self, github_client: Github, attempt_num: int = 0
) -> list[Repository.Repository]:
    """Return every repository listed for ``self.repo_owner``.

    ``self.repo_owner`` is first looked up as an organization; if that
    lookup fails with a (non rate-limit) GitHub error, it is looked up as
    a user instead.

    Args:
        github_client: Client used for all GitHub API calls.
        attempt_num: Number of rate-limit retries already performed. Used
            by the recursive retry; callers should leave it at the default.

    Returns:
        The fully materialized list of repositories for the owner.

    Raises:
        RuntimeError: If more than ``_MAX_NUM_RATE_LIMIT_RETRIES`` retries
            were needed because of rate limiting.
        GithubException: If the user fallback lookup itself fails for a
            reason other than rate limiting.
    """
    if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES:
        raise RuntimeError(
            "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github"
        )

    try:
        # Try to get organization first
        try:
            org = github_client.get_organization(self.repo_owner)
            # list() walks the whole paginated result, so any page fetch
            # can raise here, not just the initial lookup.
            return list(org.get_repos())
        except RateLimitExceededException:
            # RateLimitExceededException is itself a GithubException. Let
            # it reach the outer handler so we sleep and retry from the
            # top, instead of treating throttling as "not an organization"
            # and issuing the user lookup while still rate limited.
            raise
        except GithubException:
            # If not an org, try as a user
            # NOTE(review): this fallback also fires for non-404 errors
            # (e.g. 403); confirm whether it should be limited to 404.
            user = github_client.get_user(self.repo_owner)
            return list(user.get_repos())
    except RateLimitExceededException:
        _sleep_after_rate_limit_exception(github_client)
        return self._get_all_repos(github_client, attempt_num + 1)
|
||||||
|
|
||||||
def _fetch_from_github(
|
def _fetch_from_github(
|
||||||
self, start: datetime | None = None, end: datetime | None = None
|
self, start: datetime | None = None, end: datetime | None = None
|
||||||
) -> GenerateDocumentsOutput:
|
) -> GenerateDocumentsOutput:
|
||||||
if self.github_client is None:
|
if self.github_client is None:
|
||||||
raise ConnectorMissingCredentialError("GitHub")
|
raise ConnectorMissingCredentialError("GitHub")
|
||||||
|
|
||||||
repo = self._get_github_repo(self.github_client)
|
repos = (
|
||||||
|
[self._get_github_repo(self.github_client)]
|
||||||
|
if self.repo_name
|
||||||
|
else self._get_all_repos(self.github_client)
|
||||||
|
)
|
||||||
|
|
||||||
|
for repo in repos:
|
||||||
if self.include_prs:
|
if self.include_prs:
|
||||||
|
logger.info(f"Fetching PRs for repo: {repo.name}")
|
||||||
pull_requests = repo.get_pulls(
|
pull_requests = repo.get_pulls(
|
||||||
state=self.state_filter, sort="updated", direction="desc"
|
state=self.state_filter, sort="updated", direction="desc"
|
||||||
)
|
)
|
||||||
@@ -182,13 +209,14 @@ class GithubConnector(LoadConnector, PollConnector):
|
|||||||
for pr in pr_batch:
|
for pr in pr_batch:
|
||||||
if start is not None and pr.updated_at < start:
|
if start is not None and pr.updated_at < start:
|
||||||
yield doc_batch
|
yield doc_batch
|
||||||
return
|
break
|
||||||
if end is not None and pr.updated_at > end:
|
if end is not None and pr.updated_at > end:
|
||||||
continue
|
continue
|
||||||
doc_batch.append(_convert_pr_to_document(cast(PullRequest, pr)))
|
doc_batch.append(_convert_pr_to_document(cast(PullRequest, pr)))
|
||||||
yield doc_batch
|
yield doc_batch
|
||||||
|
|
||||||
if self.include_issues:
|
if self.include_issues:
|
||||||
|
logger.info(f"Fetching issues for repo: {repo.name}")
|
||||||
issues = repo.get_issues(
|
issues = repo.get_issues(
|
||||||
state=self.state_filter, sort="updated", direction="desc"
|
state=self.state_filter, sort="updated", direction="desc"
|
||||||
)
|
)
|
||||||
@@ -201,7 +229,7 @@ class GithubConnector(LoadConnector, PollConnector):
|
|||||||
issue = cast(Issue, issue)
|
issue = cast(Issue, issue)
|
||||||
if start is not None and issue.updated_at < start:
|
if start is not None and issue.updated_at < start:
|
||||||
yield doc_batch
|
yield doc_batch
|
||||||
return
|
break
|
||||||
if end is not None and issue.updated_at > end:
|
if end is not None and issue.updated_at > end:
|
||||||
continue
|
continue
|
||||||
if issue.pull_request is not None:
|
if issue.pull_request is not None:
|
||||||
@@ -234,16 +262,26 @@ class GithubConnector(LoadConnector, PollConnector):
|
|||||||
if self.github_client is None:
|
if self.github_client is None:
|
||||||
raise ConnectorMissingCredentialError("GitHub credentials not loaded.")
|
raise ConnectorMissingCredentialError("GitHub credentials not loaded.")
|
||||||
|
|
||||||
if not self.repo_owner or not self.repo_name:
|
if not self.repo_owner:
|
||||||
raise ConnectorValidationError(
|
raise ConnectorValidationError(
|
||||||
"Invalid connector settings: 'repo_owner' and 'repo_name' must be provided."
|
"Invalid connector settings: 'repo_owner' must be provided."
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
if self.repo_name:
|
||||||
test_repo = self.github_client.get_repo(
|
test_repo = self.github_client.get_repo(
|
||||||
f"{self.repo_owner}/{self.repo_name}"
|
f"{self.repo_owner}/{self.repo_name}"
|
||||||
)
|
)
|
||||||
test_repo.get_contents("")
|
test_repo.get_contents("")
|
||||||
|
else:
|
||||||
|
# Try to get organization first
|
||||||
|
try:
|
||||||
|
org = self.github_client.get_organization(self.repo_owner)
|
||||||
|
org.get_repos().totalCount # Just check if we can access repos
|
||||||
|
except GithubException:
|
||||||
|
# If not an org, try as a user
|
||||||
|
user = self.github_client.get_user(self.repo_owner)
|
||||||
|
user.get_repos().totalCount # Just check if we can access repos
|
||||||
|
|
||||||
except RateLimitExceededException:
|
except RateLimitExceededException:
|
||||||
raise UnexpectedError(
|
raise UnexpectedError(
|
||||||
@@ -260,9 +298,14 @@ class GithubConnector(LoadConnector, PollConnector):
|
|||||||
"Your GitHub token does not have sufficient permissions for this repository (HTTP 403)."
|
"Your GitHub token does not have sufficient permissions for this repository (HTTP 403)."
|
||||||
)
|
)
|
||||||
elif e.status == 404:
|
elif e.status == 404:
|
||||||
|
if self.repo_name:
|
||||||
raise ConnectorValidationError(
|
raise ConnectorValidationError(
|
||||||
f"GitHub repository not found with name: {self.repo_owner}/{self.repo_name}"
|
f"GitHub repository not found with name: {self.repo_owner}/{self.repo_name}"
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
raise ConnectorValidationError(
|
||||||
|
f"GitHub user or organization not found: {self.repo_owner}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ConnectorValidationError(
|
raise ConnectorValidationError(
|
||||||
f"Unexpected GitHub error (status={e.status}): {e.data}"
|
f"Unexpected GitHub error (status={e.status}): {e.data}"
|
||||||
|
@@ -38,7 +38,9 @@ export const ConnectorTitle = ({
|
|||||||
const typedConnector = connector as Connector<GithubConfig>;
|
const typedConnector = connector as Connector<GithubConfig>;
|
||||||
additionalMetadata.set(
|
additionalMetadata.set(
|
||||||
"Repo",
|
"Repo",
|
||||||
`${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}`
|
typedConnector.connector_specific_config.repo_name
|
||||||
|
? `${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}`
|
||||||
|
: `${typedConnector.connector_specific_config.repo_owner}/*`
|
||||||
);
|
);
|
||||||
} else if (connector.source === "gitlab") {
|
} else if (connector.source === "gitlab") {
|
||||||
const typedConnector = connector as Connector<GitlabConfig>;
|
const typedConnector = connector as Connector<GitlabConfig>;
|
||||||
|
@@ -170,11 +170,21 @@ export const connectorConfigs: Record<
|
|||||||
values: [
|
values: [
|
||||||
{
|
{
|
||||||
type: "text",
|
type: "text",
|
||||||
query: "Enter the repository owner:",
|
query: "Enter the GitHub username or organization:",
|
||||||
label: "Repository Owner",
|
label: "Repository Owner",
|
||||||
name: "repo_owner",
|
name: "repo_owner",
|
||||||
optional: false,
|
optional: false,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
type: "tab",
|
||||||
|
name: "github_mode",
|
||||||
|
label: "What should we index from GitHub?",
|
||||||
|
optional: true,
|
||||||
|
tabs: [
|
||||||
|
{
|
||||||
|
value: "repo",
|
||||||
|
label: "Specific Repository",
|
||||||
|
fields: [
|
||||||
{
|
{
|
||||||
type: "text",
|
type: "text",
|
||||||
query: "Enter the repository name:",
|
query: "Enter the repository name:",
|
||||||
@@ -182,20 +192,37 @@ export const connectorConfigs: Record<
|
|||||||
name: "repo_name",
|
name: "repo_name",
|
||||||
optional: false,
|
optional: false,
|
||||||
},
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
value: "everything",
|
||||||
|
label: "Everything",
|
||||||
|
fields: [
|
||||||
|
{
|
||||||
|
type: "string_tab",
|
||||||
|
label: "Everything",
|
||||||
|
name: "everything",
|
||||||
|
description:
|
||||||
|
"This connector will index all repositories the provided credentials have access to!",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
{
|
{
|
||||||
type: "checkbox",
|
type: "checkbox",
|
||||||
query: "Include pull requests?",
|
query: "Include pull requests?",
|
||||||
label: "Include pull requests?",
|
label: "Include pull requests?",
|
||||||
description: "Index pull requests from this repository",
|
description: "Index pull requests from repositories",
|
||||||
name: "include_prs",
|
name: "include_prs",
|
||||||
optional: true,
|
optional: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
type: "checkbox",
|
type: "checkbox",
|
||||||
query: "Include issues?",
|
query: "Include issues?",
|
||||||
label: "Include Issues",
|
label: "Include Issues?",
|
||||||
name: "include_issues",
|
name: "include_issues",
|
||||||
description: "Index issues from this repository",
|
description: "Index issues from repositories",
|
||||||
optional: true,
|
optional: true,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
Reference in New Issue
Block a user