mirror of https://github.com/danswer-ai/danswer.git (synced 2025-03-18 05:41:58 +01:00)
Added a Logic to Index entire Gitlab Project (#1586)
* Changes for Gitlab connector
* Changes to Rebase from Main
* Changes to Rebase from Main
* Changes to Rebase from Main
* Changes to Rebase from Main
* made indexing code files a config setting
* Update app_configs.py created env variable
* Update app_configs.py added false

---------

Co-authored-by: Varun Gaur <vgaur@roku.com>
Co-authored-by: hagen-danswer <hagen@danswer.ai>
parent e6d5b95b4a
commit 5a577f9a00
backend/danswer
@@ -188,6 +188,10 @@ GONG_CONNECTOR_START_TIME = os.environ.get("GONG_CONNECTOR_START_TIME")
 
 GITHUB_CONNECTOR_BASE_URL = os.environ.get("GITHUB_CONNECTOR_BASE_URL") or None
 
+GITLAB_CONNECTOR_INCLUDE_CODE_FILES = (
+    os.environ.get("GITLAB_CONNECTOR_INCLUDE_CODE_FILES", "").lower() == "true"
+)
+
 DASK_JOB_CLIENT_ENABLED = (
     os.environ.get("DASK_JOB_CLIENT_ENABLED", "").lower() == "true"
 )
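The new GITLAB_CONNECTOR_INCLUDE_CODE_FILES setting follows the same truthy-string convention as DASK_JOB_CLIENT_ENABLED just below it: only the literal string "true" (any casing) enables the flag. A quick standalone check of that parsing rule (illustrative only, not part of the commit):

import os

# Hypothetical values, just to show which strings count as "enabled"
for value in ["true", "TRUE", "false", "1", ""]:
    os.environ["GITLAB_CONNECTOR_INCLUDE_CODE_FILES"] = value
    enabled = os.environ.get("GITLAB_CONNECTOR_INCLUDE_CODE_FILES", "").lower() == "true"
    print(f"{value!r:8} -> {enabled}")  # True, True, False, False, False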
@@ -1,4 +1,6 @@
+import fnmatch
 import itertools
+from collections import deque
 from collections.abc import Iterable
 from collections.abc import Iterator
 from datetime import datetime
@@ -7,7 +9,9 @@ from typing import Any
 
 import gitlab
 import pytz
+from gitlab.v4.objects import Project
 
+from danswer.configs.app_configs import GITLAB_CONNECTOR_INCLUDE_CODE_FILES
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -20,7 +24,13 @@ from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
 
-
+# List of directories/Files to exclude
+exclude_patterns = [
+    "logs",
+    ".github/",
+    ".gitlab/",
+    ".pre-commit-config.yaml",
+]
 logger = setup_logger()
 
 
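These patterns are applied by the _should_exclude helper added further down, using fnmatch, which does shell-style matching against the whole path rather than prefix matching. A standalone sketch (the paths are made up, not from the commit) showing the practical effect:

import fnmatch

exclude_patterns = ["logs", ".github/", ".gitlab/", ".pre-commit-config.yaml"]


def _should_exclude(path: str) -> bool:
    return any(fnmatch.fnmatch(path, pattern) for pattern in exclude_patterns)


print(_should_exclude("logs"))                      # True  - exact match
print(_should_exclude(".pre-commit-config.yaml"))   # True  - exact match
print(_should_exclude(".github/workflows/ci.yml"))  # False - ".github/" has no wildcard, so it only matches itself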
@@ -73,6 +83,37 @@ def _convert_issue_to_document(issue: Any) -> Document:
     return doc
 
 
+def _convert_code_to_document(
+    project: Project, file: Any, url: str, projectName: str, projectOwner: str
+) -> Document:
+    file_content_obj = project.files.get(
+        file_path=file["path"], ref="master"
+    )  # Replace 'master' with your branch name if needed
+    try:
+        file_content = file_content_obj.decode().decode("utf-8")
+    except UnicodeDecodeError:
+        file_content = file_content_obj.decode().decode("latin-1")
+
+    file_url = f"{url}/{projectOwner}/{projectName}/-/blob/master/{file['path']}"  # Construct the file URL
+    doc = Document(
+        id=file["id"],
+        sections=[Section(link=file_url, text=file_content)],
+        source=DocumentSource.GITLAB,
+        semantic_identifier=file["name"],
+        doc_updated_at=datetime.now().replace(
+            tzinfo=timezone.utc
+        ),  # Use current time as updated_at
+        primary_owners=[],  # Fill this as needed
+        metadata={"type": "CodeFile"},
+    )
+    return doc
+
+
+def _should_exclude(path: str) -> bool:
+    """Check if a path matches any of the exclude patterns."""
+    return any(fnmatch.fnmatch(path, pattern) for pattern in exclude_patterns)
+
+
 class GitlabConnector(LoadConnector, PollConnector):
     def __init__(
         self,
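On the double decode in _convert_code_to_document: python-gitlab's ProjectFile.decode() returns the base64-decoded file content as bytes, so the second .decode("utf-8") (or "latin-1" as a fallback) turns those bytes into a string. A minimal sketch of that call shape, assuming a reachable GitLab instance; the URL, token, project path, and file path below are placeholders:

import gitlab

gl = gitlab.Gitlab("https://gitlab.example.com", private_token="YOUR_TOKEN")  # placeholders
project = gl.projects.get("owner/repo")  # placeholder project path

file_obj = project.files.get(file_path="README.md", ref="master")  # ProjectFile object
raw_bytes = file_obj.decode()      # python-gitlab: base64-decodes the stored content -> bytes
text = raw_bytes.decode("utf-8")   # bytes -> str, hence the two decode() calls in the connector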
@@ -82,6 +123,7 @@ class GitlabConnector(LoadConnector, PollConnector):
         state_filter: str = "all",
         include_mrs: bool = True,
         include_issues: bool = True,
+        include_code_files: bool = GITLAB_CONNECTOR_INCLUDE_CODE_FILES,
     ) -> None:
         self.project_owner = project_owner
         self.project_name = project_name
@@ -89,6 +131,7 @@ class GitlabConnector(LoadConnector, PollConnector):
         self.state_filter = state_filter
         self.include_mrs = include_mrs
         self.include_issues = include_issues
+        self.include_code_files = include_code_files
         self.gitlab_client: gitlab.Gitlab | None = None
 
     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@@ -102,17 +145,46 @@ class GitlabConnector(LoadConnector, PollConnector):
     ) -> GenerateDocumentsOutput:
         if self.gitlab_client is None:
             raise ConnectorMissingCredentialError("Gitlab")
-        project = self.gitlab_client.projects.get(
+        project: gitlab.Project = self.gitlab_client.projects.get(
             f"{self.project_owner}/{self.project_name}"
         )
 
+        # Fetch code files
+        if self.include_code_files:
+            # Fetching using BFS as project.report_tree with recursion causing slow load
+            queue = deque([""])  # Start with the root directory
+            while queue:
+                current_path = queue.popleft()
+                files = project.repository_tree(path=current_path, all=True)
+                for file_batch in _batch_gitlab_objects(files, self.batch_size):
+                    code_doc_batch: list[Document] = []
+                    for file in file_batch:
+                        if _should_exclude(file["path"]):
+                            continue
+
+                        if file["type"] == "blob":
+                            code_doc_batch.append(
+                                _convert_code_to_document(
+                                    project,
+                                    file,
+                                    self.gitlab_client.url,
+                                    self.project_name,
+                                    self.project_owner,
+                                )
+                            )
+                        elif file["type"] == "tree":
+                            queue.append(file["path"])
+
+                    if code_doc_batch:
+                        yield code_doc_batch
+
         if self.include_mrs:
             merge_requests = project.mergerequests.list(
                 state=self.state_filter, order_by="updated_at", sort="desc"
             )
 
             for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size):
-                doc_batch: list[Document] = []
+                mr_doc_batch: list[Document] = []
                 for mr in mr_batch:
                     mr.updated_at = datetime.strptime(
                         mr.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z"
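The traversal above is a plain iterative breadth-first search over repository_tree: "tree" entries (directories) go onto the queue, "blob" entries (files) get converted into Documents. A stripped-down sketch of just the walk, assuming project is a python-gitlab Project object (not part of the commit):

from collections import deque
from collections.abc import Iterator
from typing import Any


def iter_blobs(project: Any) -> Iterator[dict]:
    # Breadth-first walk of the repository tree, one directory at a time
    queue = deque([""])  # "" = repository root
    while queue:
        path = queue.popleft()
        for entry in project.repository_tree(path=path, all=True):
            if entry["type"] == "blob":
                yield entry  # a file entry: {"id": ..., "name": ..., "path": ..., "type": "blob", ...}
            elif entry["type"] == "tree":
                queue.append(entry["path"])  # a directory: visit its contents later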
@@ -120,18 +192,18 @@ class GitlabConnector(LoadConnector, PollConnector):
                     if start is not None and mr.updated_at < start.replace(
                         tzinfo=pytz.UTC
                     ):
-                        yield doc_batch
+                        yield mr_doc_batch
                         return
                     if end is not None and mr.updated_at > end.replace(tzinfo=pytz.UTC):
                         continue
-                    doc_batch.append(_convert_merge_request_to_document(mr))
-                yield doc_batch
+                    mr_doc_batch.append(_convert_merge_request_to_document(mr))
+                yield mr_doc_batch
 
         if self.include_issues:
             issues = project.issues.list(state=self.state_filter)
 
             for issue_batch in _batch_gitlab_objects(issues, self.batch_size):
-                doc_batch = []
+                issue_doc_batch: list[Document] = []
                 for issue in issue_batch:
                     issue.updated_at = datetime.strptime(
                         issue.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z"
@@ -139,14 +211,14 @@ class GitlabConnector(LoadConnector, PollConnector):
                     if start is not None:
                         start = start.replace(tzinfo=pytz.UTC)
                         if issue.updated_at < start:
-                            yield doc_batch
+                            yield issue_doc_batch
                             return
                     if end is not None:
                         end = end.replace(tzinfo=pytz.UTC)
                         if issue.updated_at > end:
                             continue
-                    doc_batch.append(_convert_issue_to_document(issue))
-                yield doc_batch
+                    issue_doc_batch.append(_convert_issue_to_document(issue))
+                yield issue_doc_batch
 
     def load_from_state(self) -> GenerateDocumentsOutput:
         return self._fetch_from_gitlab()
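Both the MR and issue loops depend on parsing GitLab's ISO-8601 updated_at strings into timezone-aware datetimes so they can be compared against the UTC-normalized start/end bounds. A small standalone check of that format string, using a made-up timestamp:

from datetime import datetime

import pytz

updated_at = datetime.strptime("2024-05-01T12:34:56.789+00:00", "%Y-%m-%dT%H:%M:%S.%f%z")
start = datetime(2024, 1, 1).replace(tzinfo=pytz.UTC)
print(updated_at < start)  # False - both datetimes are aware, so the comparison is well-defined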
@@ -170,11 +242,12 @@ if __name__ == "__main__":
         state_filter="all",
         include_mrs=True,
         include_issues=True,
+        include_code_files=GITLAB_CONNECTOR_INCLUDE_CODE_FILES,
     )
 
     connector.load_credentials(
         {
-            "github_access_token": os.environ["GITLAB_ACCESS_TOKEN"],
+            "gitlab_access_token": os.environ["GITLAB_ACCESS_TOKEN"],
            "gitlab_url": os.environ["GITLAB_URL"],
         }
     )
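For a quick local smoke test of the new behavior, something along these lines should work; the module path is assumed from the repository layout, and the owner/repo values are placeholders, not taken from the commit:

import os

from danswer.connectors.gitlab.connector import GitlabConnector  # assumed module path

connector = GitlabConnector(
    project_owner="some-owner",  # placeholder
    project_name="some-repo",    # placeholder
    include_code_files=True,     # or rely on GITLAB_CONNECTOR_INCLUDE_CODE_FILES=true
)
connector.load_credentials(
    {
        "gitlab_access_token": os.environ["GITLAB_ACCESS_TOKEN"],
        "gitlab_url": os.environ["GITLAB_URL"],
    }
)
for batch in connector.load_from_state():
    print(len(batch), "documents")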