Fix GitLabs CI (#965)

This commit is contained in:
Yuhong Sun
2024-01-18 16:12:46 -08:00
committed by GitHub
parent 1670d923aa
commit 5edc464c9a
6 changed files with 62 additions and 67 deletions

View File

@@ -1,18 +1,20 @@
import itertools import itertools
from collections.abc import Iterable
from collections.abc import Iterator from collections.abc import Iterator
from datetime import datetime from datetime import datetime
from datetime import timezone from datetime import timezone
from typing import Any from typing import Any
import gitlab import gitlab
from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo, ConnectorMissingCredentialError from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document from danswer.connectors.models import Document
from danswer.connectors.models import Section from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger from danswer.utils.logger import setup_logger
@@ -22,22 +24,21 @@ logger = setup_logger()
def _batch_gitlab_objects(
    git_objs: Iterable[Any], batch_size: int
) -> Iterator[list[Any]]:
    """Yield successive lists of at most ``batch_size`` items from ``git_objs``.

    The input is consumed lazily through a single iterator, so any iterable
    (list, generator, paginated result) is accepted and never materialized
    as a whole.

    Args:
        git_objs: Iterable of objects to group.
        batch_size: Maximum number of items per yielded list. It is passed
            to ``itertools.islice`` as an integer stop value; a value of 0
            produces no batches at all.

    Yields:
        Non-empty lists of items, in input order. Only the final list may be
        shorter than ``batch_size``.
    """
    it = iter(git_objs)
    # An empty slice means the iterator is exhausted, which ends the loop.
    while batch := list(itertools.islice(it, batch_size)):
        yield batch
def get_author(author: Any) -> BasicExpertInfo:
    """Build a ``BasicExpertInfo`` from a GitLab author mapping.

    Args:
        author: Mapping-like object exposing ``.get("name")``.

    Returns:
        A ``BasicExpertInfo`` whose ``display_name`` is the raw name, whose
        ``first_name`` is the first whitespace-separated token and whose
        ``last_name`` is the final token. ``last_name`` is ``None`` for a
        single-word name; both are ``None`` when the name is missing or empty.
    """
    name = author.get("name")
    # A missing name or a single-word name (e.g. a bot account) must not
    # raise AttributeError / IndexError while splitting.
    parts = name.split() if name else []
    # NOTE(review): assumes BasicExpertInfo accepts None for the name fields,
    # as display_name could already receive None here - confirm on the model.
    return BasicExpertInfo(
        display_name=name,
        first_name=parts[0] if parts else None,
        last_name=parts[-1] if len(parts) > 1 else None,
    )
@@ -52,10 +53,7 @@ def _convert_merge_request_to_document(mr: Any) -> Document:
# due to local time discrepancies with UTC # due to local time discrepancies with UTC
doc_updated_at=mr.updated_at.replace(tzinfo=timezone.utc), doc_updated_at=mr.updated_at.replace(tzinfo=timezone.utc),
primary_owners=[get_author(mr.author)], primary_owners=[get_author(mr.author)],
metadata={ metadata={"state": mr.state, "type": "MergeRequest"},
"state": mr.state,
"type": "MergeRequest"
},
) )
@@ -70,14 +68,13 @@ def _convert_issue_to_document(issue: Any) -> Document:
# due to local time discrepancies with UTC # due to local time discrepancies with UTC
doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc), doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
primary_owners=[get_author(issue.author)], primary_owners=[get_author(issue.author)],
metadata={ metadata={"state": issue.state, "type": issue.type | "Issue"},
"state": issue.state,
"type": issue.type | "Issue"
},
) )
class GitlabConnector(LoadConnector, PollConnector): class GitlabConnector(LoadConnector, PollConnector):
def __init__(self, def __init__(
self,
project_owner: str, project_owner: str,
project_name: str, project_name: str,
batch_size: int = INDEX_BATCH_SIZE, batch_size: int = INDEX_BATCH_SIZE,
@@ -85,26 +82,28 @@ class GitlabConnector(LoadConnector, PollConnector):
include_mrs: bool = True, include_mrs: bool = True,
include_issues: bool = True, include_issues: bool = True,
) -> None: ) -> None:
self.project_owner=project_owner, self.project_owner = project_owner
self.project_name=project_name, self.project_name = project_name
self.batch_size=batch_size, self.batch_size = batch_size
self.state_filter=state_filter, self.state_filter = state_filter
self.include_mrs=include_mrs, self.include_mrs = include_mrs
self.include_issues=include_issues, self.include_issues = include_issues
self.gitlab_client: gitlab.Gitlab | None = None self.gitlab_client: gitlab.Gitlab | None = None
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
    """Create the GitLab client from the supplied credentials.

    Args:
        credentials: Must contain ``"gitlab_url"`` and
            ``"gitlab_access_token"``; a missing key raises ``KeyError``.

    Returns:
        Always ``None``.
    """
    url = credentials["gitlab_url"]
    token = credentials["gitlab_access_token"]
    self.gitlab_client = gitlab.Gitlab(url, private_token=token)
    return None
def _fetch_from_gitlab(
self, start: datetime | None = None, end: datetime | None = None
def _fetch_from_gitlab(self, start: datetime | None = None, end: datetime | None = None) -> GenerateDocumentsOutput: ) -> GenerateDocumentsOutput:
if self.gitlab_client is None: if self.gitlab_client is None:
raise ConnectorMissingCredentialError("Gitlab") raise ConnectorMissingCredentialError("Gitlab")
project = self.gitlab_client.projects.get(f"{self.project_owner[0]}/{self.project_name[0]}") project = self.gitlab_client.projects.get(
f"{self.project_owner[0]}/{self.project_name[0]}"
)
if self.include_mrs: if self.include_mrs:
merge_requests = project.mergerequests.list( merge_requests = project.mergerequests.list(
@@ -112,9 +111,11 @@ class GitlabConnector(LoadConnector, PollConnector):
) )
for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size): for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size):
doc_batch =[] doc_batch: list[Document] = []
for mr in mr_batch: for mr in mr_batch:
mr.updated_at = datetime.strptime(mr.updated_at, "%Y-%m-%dT%H:%M:%S.%fZ") mr.updated_at = datetime.strptime(
mr.updated_at, "%Y-%m-%dT%H:%M:%S.%fZ"
)
if start is not None and mr.updated_at < start: if start is not None and mr.updated_at < start:
yield doc_batch yield doc_batch
return return
@@ -124,14 +125,14 @@ class GitlabConnector(LoadConnector, PollConnector):
yield doc_batch yield doc_batch
if self.include_issues: if self.include_issues:
issues = project.issues.list( issues = project.issues.list(state=self.state_filter)
state=self.state_filter
)
for issue_batch in _batch_gitlab_objects(issues, self.batch_size): for issue_batch in _batch_gitlab_objects(issues, self.batch_size):
doc_batch = [] doc_batch = []
for issue in issue_batch: for issue in issue_batch:
issue.updated_at = datetime.strptime(issue.updated_at, "%Y-%m-%dT%H:%M:%S.%fZ") issue.updated_at = datetime.strptime(
issue.updated_at, "%Y-%m-%dT%H:%M:%S.%fZ"
)
if start is not None and issue.updated_at < start: if start is not None and issue.updated_at < start:
yield doc_batch yield doc_batch
return return
@@ -146,19 +147,17 @@ class GitlabConnector(LoadConnector, PollConnector):
def load_from_state(self) -> GenerateDocumentsOutput:
    """Return the document batches from an unbounded fetch (no time window)."""
    document_batches = self._fetch_from_gitlab()
    return document_batches
def poll_source(
    self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
    """Fetch documents for the window given as Unix-epoch seconds.

    Args:
        start: Window start, in seconds since the Unix epoch.
        end: Window end, in seconds since the Unix epoch.

    Returns:
        The document batches produced by ``_fetch_from_gitlab`` for the
        converted bounds.
    """
    # datetime.utcfromtimestamp is deprecated since Python 3.12. The tzinfo is
    # stripped so the bounds stay naive UTC: _fetch_from_gitlab compares them
    # with strptime-parsed (naive) ``updated_at`` values, and comparing naive
    # with aware datetimes raises TypeError.
    start_datetime = datetime.fromtimestamp(start, tz=timezone.utc).replace(
        tzinfo=None
    )
    end_datetime = datetime.fromtimestamp(end, tz=timezone.utc).replace(tzinfo=None)
    return self._fetch_from_gitlab(start_datetime, end_datetime)
if __name__ == "__main__": if __name__ == "__main__":
import os import os
connector = GitlabConnector( connector = GitlabConnector(
# gitlab_url="https://gitlab.com/api/v4", # gitlab_url="https://gitlab.com/api/v4",
project_owner=os.environ["PROJECT_OWNER"], project_owner=os.environ["PROJECT_OWNER"],
@@ -172,11 +171,8 @@ if __name__ == "__main__":
connector.load_credentials( connector.load_credentials(
{ {
"github_access_token": os.environ["GITLAB_ACCESS_TOKEN"], "github_access_token": os.environ["GITLAB_ACCESS_TOKEN"],
"gitlab_url":os.environ["GITLAB_URL"] "gitlab_url": os.environ["GITLAB_URL"],
} }
) )
document_batches = connector.load_from_state() document_batches = connector.load_from_state()
print(next(document_batches)) print(next(document_batches))

View File

@@ -108,7 +108,8 @@ const Main = () => {
formBody={ formBody={
<> <>
<Text> <Text>
If you are using GitLab Cloud, keep the default value below</Text> If you are using GitLab Cloud, keep the default value below
</Text>
<TextFormField <TextFormField
name="gitlab_url" name="gitlab_url"
label="GitLab URL:" label="GitLab URL:"
@@ -131,7 +132,7 @@ const Main = () => {
})} })}
initialValues={{ initialValues={{
gitlab_access_token: "", gitlab_access_token: "",
gitlab_url: "https://gitlab.com" gitlab_url: "https://gitlab.com",
}} }}
onSubmit={(isSuccess) => { onSubmit={(isSuccess) => {
if (isSuccess) { if (isSuccess) {

View File

@@ -356,7 +356,6 @@ export const GithubIcon = ({
); );
}; };
export const GoogleDriveIcon = ({ export const GoogleDriveIcon = ({
size = 16, size = 16,
className = defaultTailwindCSS, className = defaultTailwindCSS,

View File

@@ -85,7 +85,6 @@ export interface GitlabConfig {
include_issues: boolean; include_issues: boolean;
} }
export interface GoogleDriveConfig { export interface GoogleDriveConfig {
folder_paths?: string[]; folder_paths?: string[];
include_shared?: boolean; include_shared?: boolean;
@@ -199,7 +198,7 @@ export interface GithubCredentialJson {
} }
export interface GitlabCredentialJson { export interface GitlabCredentialJson {
gitlab_url:string, gitlab_url: string;
gitlab_access_token: string; gitlab_access_token: string;
} }