From 5a577f9a00f2cc1cb1a50add022953e8ee1bbd65 Mon Sep 17 00:00:00 2001 From: Varun Gaur Date: Tue, 11 Jun 2024 20:18:14 -0500 Subject: [PATCH] Added a Logic to Index entire Gitlab Project (#1586) * Changes for Gitlab connector * Changes to Rebase from Main * Changes to Rebase from Main * Changes to Rebase from Main * Changes to Rebase from Main * made indexing code files a config setting * Update app_configs.py created env variable * Update app_configs.py added false --------- Co-authored-by: Varun Gaur Co-authored-by: hagen-danswer --- backend/danswer/configs/app_configs.py | 4 + .../danswer/connectors/gitlab/connector.py | 95 ++++++++++++++++--- 2 files changed, 88 insertions(+), 11 deletions(-) diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 577ef2f0a..77aa4c176 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -188,6 +188,10 @@ GONG_CONNECTOR_START_TIME = os.environ.get("GONG_CONNECTOR_START_TIME") GITHUB_CONNECTOR_BASE_URL = os.environ.get("GITHUB_CONNECTOR_BASE_URL") or None +GITLAB_CONNECTOR_INCLUDE_CODE_FILES = ( + os.environ.get("GITLAB_CONNECTOR_INCLUDE_CODE_FILES", "").lower() == "true" +) + DASK_JOB_CLIENT_ENABLED = ( os.environ.get("DASK_JOB_CLIENT_ENABLED", "").lower() == "true" ) diff --git a/backend/danswer/connectors/gitlab/connector.py b/backend/danswer/connectors/gitlab/connector.py index 55e358bbe..f07baf3e1 100644 --- a/backend/danswer/connectors/gitlab/connector.py +++ b/backend/danswer/connectors/gitlab/connector.py @@ -1,4 +1,6 @@ +import fnmatch import itertools +from collections import deque from collections.abc import Iterable from collections.abc import Iterator from datetime import datetime @@ -7,7 +9,9 @@ from typing import Any import gitlab import pytz +from gitlab.v4.objects import Project +from danswer.configs.app_configs import GITLAB_CONNECTOR_INCLUDE_CODE_FILES from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.connectors.interfaces import GenerateDocumentsOutput @@ -20,7 +24,13 @@ from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.utils.logger import setup_logger - +# List of directories/Files to exclude +exclude_patterns = [ + "logs", + ".github/", + ".gitlab/", + ".pre-commit-config.yaml", +] logger = setup_logger() @@ -73,6 +83,37 @@ def _convert_issue_to_document(issue: Any) -> Document: return doc +def _convert_code_to_document( + project: Project, file: Any, url: str, projectName: str, projectOwner: str +) -> Document: + file_content_obj = project.files.get( + file_path=file["path"], ref="master" + ) # Replace 'master' with your branch name if needed + try: + file_content = file_content_obj.decode().decode("utf-8") + except UnicodeDecodeError: + file_content = file_content_obj.decode().decode("latin-1") + + file_url = f"{url}/{projectOwner}/{projectName}/-/blob/master/{file['path']}" # Construct the file URL + doc = Document( + id=file["id"], + sections=[Section(link=file_url, text=file_content)], + source=DocumentSource.GITLAB, + semantic_identifier=file["name"], + doc_updated_at=datetime.now().replace( + tzinfo=timezone.utc + ), # Use current time as updated_at + primary_owners=[], # Fill this as needed + metadata={"type": "CodeFile"}, + ) + return doc + + +def _should_exclude(path: str) -> bool: + """Check if a path matches any of the exclude patterns.""" + return any(fnmatch.fnmatch(path, pattern) for pattern in exclude_patterns) + + class GitlabConnector(LoadConnector, PollConnector): def __init__( self, @@ -82,6 +123,7 @@ class GitlabConnector(LoadConnector, PollConnector): state_filter: str = "all", include_mrs: bool = True, include_issues: bool = True, + include_code_files: bool = GITLAB_CONNECTOR_INCLUDE_CODE_FILES, ) -> None: self.project_owner = project_owner self.project_name = project_name @@ -89,6 +131,7 @@ class GitlabConnector(LoadConnector, PollConnector): self.state_filter = state_filter self.include_mrs = include_mrs self.include_issues = include_issues + self.include_code_files = include_code_files self.gitlab_client: gitlab.Gitlab | None = None def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: @@ -102,17 +145,46 @@ class GitlabConnector(LoadConnector, PollConnector): ) -> GenerateDocumentsOutput: if self.gitlab_client is None: raise ConnectorMissingCredentialError("Gitlab") - project = self.gitlab_client.projects.get( + project: gitlab.Project = self.gitlab_client.projects.get( f"{self.project_owner}/{self.project_name}" ) + # Fetch code files + if self.include_code_files: + # Fetching using BFS as project.report_tree with recursion causing slow load + queue = deque([""]) # Start with the root directory + while queue: + current_path = queue.popleft() + files = project.repository_tree(path=current_path, all=True) + for file_batch in _batch_gitlab_objects(files, self.batch_size): + code_doc_batch: list[Document] = [] + for file in file_batch: + if _should_exclude(file["path"]): + continue + + if file["type"] == "blob": + code_doc_batch.append( + _convert_code_to_document( + project, + file, + self.gitlab_client.url, + self.project_name, + self.project_owner, + ) + ) + elif file["type"] == "tree": + queue.append(file["path"]) + + if code_doc_batch: + yield code_doc_batch + if self.include_mrs: merge_requests = project.mergerequests.list( state=self.state_filter, order_by="updated_at", sort="desc" ) for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size): - doc_batch: list[Document] = [] + mr_doc_batch: list[Document] = [] for mr in mr_batch: mr.updated_at = datetime.strptime( mr.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z" @@ -120,18 +192,18 @@ class GitlabConnector(LoadConnector, PollConnector): if start is not None and mr.updated_at < start.replace( tzinfo=pytz.UTC ): - yield doc_batch + yield mr_doc_batch return if end is not None and mr.updated_at > end.replace(tzinfo=pytz.UTC): continue - doc_batch.append(_convert_merge_request_to_document(mr)) - yield doc_batch + mr_doc_batch.append(_convert_merge_request_to_document(mr)) + yield mr_doc_batch if self.include_issues: issues = project.issues.list(state=self.state_filter) for issue_batch in _batch_gitlab_objects(issues, self.batch_size): - doc_batch = [] + issue_doc_batch: list[Document] = [] for issue in issue_batch: issue.updated_at = datetime.strptime( issue.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z" @@ -139,14 +211,14 @@ class GitlabConnector(LoadConnector, PollConnector): if start is not None: start = start.replace(tzinfo=pytz.UTC) if issue.updated_at < start: - yield doc_batch + yield issue_doc_batch return if end is not None: end = end.replace(tzinfo=pytz.UTC) if issue.updated_at > end: continue - doc_batch.append(_convert_issue_to_document(issue)) - yield doc_batch + issue_doc_batch.append(_convert_issue_to_document(issue)) + yield issue_doc_batch def load_from_state(self) -> GenerateDocumentsOutput: return self._fetch_from_gitlab() @@ -170,11 +242,12 @@ if __name__ == "__main__": state_filter="all", include_mrs=True, include_issues=True, + include_code_files=GITLAB_CONNECTOR_INCLUDE_CODE_FILES, ) connector.load_credentials( { - "github_access_token": os.environ["GITLAB_ACCESS_TOKEN"], + "gitlab_access_token": os.environ["GITLAB_ACCESS_TOKEN"], "gitlab_url": os.environ["GITLAB_URL"], } )