Added logic to index an entire GitLab project (#1586)

* Changes for Gitlab connector

* Changes to Rebase from Main

* Changes to Rebase from Main

* Changes to Rebase from Main

* Changes to Rebase from Main

* made indexing code files a config setting

* Update app_configs.py

created env variable

* Update app_configs.py

added false

---------

Co-authored-by: Varun Gaur <vgaur@roku.com>
Co-authored-by: hagen-danswer <hagen@danswer.ai>
This commit is contained in:
Varun Gaur
2024-06-11 20:18:14 -05:00
committed by GitHub
parent e6d5b95b4a
commit 5a577f9a00
2 changed files with 88 additions and 11 deletions

View File

@@ -188,6 +188,10 @@ GONG_CONNECTOR_START_TIME = os.environ.get("GONG_CONNECTOR_START_TIME")
GITHUB_CONNECTOR_BASE_URL = os.environ.get("GITHUB_CONNECTOR_BASE_URL") or None GITHUB_CONNECTOR_BASE_URL = os.environ.get("GITHUB_CONNECTOR_BASE_URL") or None
# Opt-in switch for indexing repository code files with the GitLab connector.
# True only when the environment variable equals "true" (case-insensitive);
# when it is unset or holds any other value the flag is False.
GITLAB_CONNECTOR_INCLUDE_CODE_FILES = (
    os.environ.get("GITLAB_CONNECTOR_INCLUDE_CODE_FILES", "").lower() == "true"
)
DASK_JOB_CLIENT_ENABLED = ( DASK_JOB_CLIENT_ENABLED = (
os.environ.get("DASK_JOB_CLIENT_ENABLED", "").lower() == "true" os.environ.get("DASK_JOB_CLIENT_ENABLED", "").lower() == "true"
) )

View File

@@ -1,4 +1,6 @@
import fnmatch
import itertools import itertools
from collections import deque
from collections.abc import Iterable from collections.abc import Iterable
from collections.abc import Iterator from collections.abc import Iterator
from datetime import datetime from datetime import datetime
@@ -7,7 +9,9 @@ from typing import Any
import gitlab import gitlab
import pytz import pytz
from gitlab.v4.objects import Project
from danswer.configs.app_configs import GITLAB_CONNECTOR_INCLUDE_CODE_FILES
from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -20,7 +24,13 @@ from danswer.connectors.models import Document
from danswer.connectors.models import Section from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger from danswer.utils.logger import setup_logger
# Directories/files to leave out when indexing repository code.
# Each entry is an fnmatch-style pattern that _should_exclude tests against the
# "path" value of a repository-tree entry.
# NOTE(review): how the entries ending in "/" behave depends on _should_exclude;
# confirm they match the directory paths the tree listing actually returns.
exclude_patterns = [
    "logs",
    ".github/",
    ".gitlab/",
    ".pre-commit-config.yaml",
]
logger = setup_logger() logger = setup_logger()
@@ -73,6 +83,37 @@ def _convert_issue_to_document(issue: Any) -> Document:
return doc return doc
def _convert_code_to_document(
    project: Project, file: Any, url: str, projectName: str, projectOwner: str
) -> Document:
    """Download one repository file and wrap its text in a ``Document``.

    Args:
        project: GitLab project the file belongs to; used to fetch the blob.
        file: Repository-tree entry; its ``"path"``, ``"id"`` and ``"name"``
            keys are read.
        url: Base URL of the GitLab instance, used to build the file link.
        projectName: Project name, used to build the file link.
        projectOwner: Owner/namespace of the project, used to build the link.

    Returns:
        A ``Document`` with a single section holding the file text and a link
        to the file on the branch it was read from.
    """
    # Read from the project's default branch rather than assuming "master";
    # fall back to "master" when the project does not report one.
    branch = getattr(project, "default_branch", None) or "master"
    file_path = file["path"]

    # ProjectFile.decode() undoes the API's base64 encoding; do it only once.
    raw_bytes = project.files.get(file_path=file_path, ref=branch).decode()
    try:
        file_content = raw_bytes.decode("utf-8")
    except UnicodeDecodeError:
        # latin-1 maps every byte value, so this fallback cannot fail.
        file_content = raw_bytes.decode("latin-1")

    file_url = f"{url}/{projectOwner}/{projectName}/-/blob/{branch}/{file_path}"

    # NOTE(review): file["id"] looks like the blob hash, so files with identical
    # content would share a document id — confirm whether that is intended.
    return Document(
        id=file["id"],
        sections=[Section(link=file_url, text=file_content)],
        source=DocumentSource.GITLAB,
        semantic_identifier=file["name"],
        # The tree entry carries no modification time, so stamp the document
        # with the current instant as a timezone-aware UTC value.
        doc_updated_at=datetime.now(timezone.utc),
        primary_owners=[],
        metadata={"type": "CodeFile"},
    )
def _should_exclude(path: str) -> bool:
"""Check if a path matches any of the exclude patterns."""
return any(fnmatch.fnmatch(path, pattern) for pattern in exclude_patterns)
class GitlabConnector(LoadConnector, PollConnector): class GitlabConnector(LoadConnector, PollConnector):
def __init__( def __init__(
self, self,
@@ -82,6 +123,7 @@ class GitlabConnector(LoadConnector, PollConnector):
state_filter: str = "all", state_filter: str = "all",
include_mrs: bool = True, include_mrs: bool = True,
include_issues: bool = True, include_issues: bool = True,
include_code_files: bool = GITLAB_CONNECTOR_INCLUDE_CODE_FILES,
) -> None: ) -> None:
self.project_owner = project_owner self.project_owner = project_owner
self.project_name = project_name self.project_name = project_name
@@ -89,6 +131,7 @@ class GitlabConnector(LoadConnector, PollConnector):
self.state_filter = state_filter self.state_filter = state_filter
self.include_mrs = include_mrs self.include_mrs = include_mrs
self.include_issues = include_issues self.include_issues = include_issues
self.include_code_files = include_code_files
self.gitlab_client: gitlab.Gitlab | None = None self.gitlab_client: gitlab.Gitlab | None = None
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@@ -102,17 +145,46 @@ class GitlabConnector(LoadConnector, PollConnector):
) -> GenerateDocumentsOutput: ) -> GenerateDocumentsOutput:
if self.gitlab_client is None: if self.gitlab_client is None:
raise ConnectorMissingCredentialError("Gitlab") raise ConnectorMissingCredentialError("Gitlab")
project = self.gitlab_client.projects.get( project: gitlab.Project = self.gitlab_client.projects.get(
f"{self.project_owner}/{self.project_name}" f"{self.project_owner}/{self.project_name}"
) )
# Fetch code files
if self.include_code_files:
# Walk the tree breadth-first, one directory per request, because project.repository_tree with recursion loads slowly
queue = deque([""]) # Start with the root directory
while queue:
current_path = queue.popleft()
files = project.repository_tree(path=current_path, all=True)
for file_batch in _batch_gitlab_objects(files, self.batch_size):
code_doc_batch: list[Document] = []
for file in file_batch:
if _should_exclude(file["path"]):
continue
if file["type"] == "blob":
code_doc_batch.append(
_convert_code_to_document(
project,
file,
self.gitlab_client.url,
self.project_name,
self.project_owner,
)
)
elif file["type"] == "tree":
queue.append(file["path"])
if code_doc_batch:
yield code_doc_batch
if self.include_mrs: if self.include_mrs:
merge_requests = project.mergerequests.list( merge_requests = project.mergerequests.list(
state=self.state_filter, order_by="updated_at", sort="desc" state=self.state_filter, order_by="updated_at", sort="desc"
) )
for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size): for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size):
doc_batch: list[Document] = [] mr_doc_batch: list[Document] = []
for mr in mr_batch: for mr in mr_batch:
mr.updated_at = datetime.strptime( mr.updated_at = datetime.strptime(
mr.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z" mr.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z"
@@ -120,18 +192,18 @@ class GitlabConnector(LoadConnector, PollConnector):
if start is not None and mr.updated_at < start.replace( if start is not None and mr.updated_at < start.replace(
tzinfo=pytz.UTC tzinfo=pytz.UTC
): ):
yield doc_batch yield mr_doc_batch
return return
if end is not None and mr.updated_at > end.replace(tzinfo=pytz.UTC): if end is not None and mr.updated_at > end.replace(tzinfo=pytz.UTC):
continue continue
doc_batch.append(_convert_merge_request_to_document(mr)) mr_doc_batch.append(_convert_merge_request_to_document(mr))
yield doc_batch yield mr_doc_batch
if self.include_issues: if self.include_issues:
issues = project.issues.list(state=self.state_filter) issues = project.issues.list(state=self.state_filter)
for issue_batch in _batch_gitlab_objects(issues, self.batch_size): for issue_batch in _batch_gitlab_objects(issues, self.batch_size):
doc_batch = [] issue_doc_batch: list[Document] = []
for issue in issue_batch: for issue in issue_batch:
issue.updated_at = datetime.strptime( issue.updated_at = datetime.strptime(
issue.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z" issue.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z"
@@ -139,14 +211,14 @@ class GitlabConnector(LoadConnector, PollConnector):
if start is not None: if start is not None:
start = start.replace(tzinfo=pytz.UTC) start = start.replace(tzinfo=pytz.UTC)
if issue.updated_at < start: if issue.updated_at < start:
yield doc_batch yield issue_doc_batch
return return
if end is not None: if end is not None:
end = end.replace(tzinfo=pytz.UTC) end = end.replace(tzinfo=pytz.UTC)
if issue.updated_at > end: if issue.updated_at > end:
continue continue
doc_batch.append(_convert_issue_to_document(issue)) issue_doc_batch.append(_convert_issue_to_document(issue))
yield doc_batch yield issue_doc_batch
def load_from_state(self) -> GenerateDocumentsOutput: def load_from_state(self) -> GenerateDocumentsOutput:
return self._fetch_from_gitlab() return self._fetch_from_gitlab()
@@ -170,11 +242,12 @@ if __name__ == "__main__":
state_filter="all", state_filter="all",
include_mrs=True, include_mrs=True,
include_issues=True, include_issues=True,
include_code_files=GITLAB_CONNECTOR_INCLUDE_CODE_FILES,
) )
connector.load_credentials( connector.load_credentials(
{ {
"github_access_token": os.environ["GITLAB_ACCESS_TOKEN"], "gitlab_access_token": os.environ["GITLAB_ACCESS_TOKEN"],
"gitlab_url": os.environ["GITLAB_URL"], "gitlab_url": os.environ["GITLAB_URL"],
} }
) )