From 9be3da23579e2899036f5773c0142e858bb33b2d Mon Sep 17 00:00:00 2001 From: Chris Weaver <25087905+Weves@users.noreply.github.com> Date: Mon, 28 Apr 2025 17:42:49 -0700 Subject: [PATCH] Fix gitlab (#4629) * Fix gitlab * Add back assert --- .../workflows/pr-python-connector-tests.yml | 2 + backend/onyx/connectors/gitlab/connector.py | 20 +-- backend/requirements/default.txt | 2 +- backend/requirements/dev.txt | 1 + .../connectors/gitlab/test_gitlab_basic.py | 135 ++++++++++++++++++ web/src/lib/connectors/connectors.tsx | 9 +- 6 files changed, 156 insertions(+), 13 deletions(-) create mode 100644 backend/tests/daily/connectors/gitlab/test_gitlab_basic.py diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml index fe0dbcf566e..e4a0a5204a1 100644 --- a/.github/workflows/pr-python-connector-tests.yml +++ b/.github/workflows/pr-python-connector-tests.yml @@ -55,6 +55,8 @@ env: SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }} # Github ACCESS_TOKEN_GITHUB: ${{ secrets.ACCESS_TOKEN_GITHUB }} + # Gitlab + GITLAB_ACCESS_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }} # Gitbook GITBOOK_SPACE_ID: ${{ secrets.GITBOOK_SPACE_ID }} GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }} diff --git a/backend/onyx/connectors/gitlab/connector.py b/backend/onyx/connectors/gitlab/connector.py index f0c4d266ee3..35dc4f705ce 100644 --- a/backend/onyx/connectors/gitlab/connector.py +++ b/backend/onyx/connectors/gitlab/connector.py @@ -6,6 +6,7 @@ from collections.abc import Iterator from datetime import datetime from datetime import timezone from typing import Any +from typing import TypeVar import gitlab import pytz @@ -24,6 +25,8 @@ from onyx.connectors.models import Document from onyx.connectors.models import TextSection from onyx.utils.logger import setup_logger +T = TypeVar("T") + logger = setup_logger() @@ -36,9 +39,7 @@ exclude_patterns = [ ] -def _batch_gitlab_objects( - git_objs: Iterable[Any], batch_size: int -) -> Iterator[list[Any]]: +def _batch_gitlab_objects(git_objs: Iterable[T], batch_size: int) -> Iterator[list[T]]: it = iter(git_objs) while True: batch = list(itertools.islice(it, batch_size)) @@ -154,7 +155,7 @@ class GitlabConnector(LoadConnector, PollConnector): ) -> GenerateDocumentsOutput: if self.gitlab_client is None: raise ConnectorMissingCredentialError("Gitlab") - project: gitlab.Project = self.gitlab_client.projects.get( + project: Project = self.gitlab_client.projects.get( f"{self.project_owner}/{self.project_name}" ) @@ -189,7 +190,10 @@ class GitlabConnector(LoadConnector, PollConnector): if self.include_mrs: merge_requests = project.mergerequests.list( - state=self.state_filter, order_by="updated_at", sort="desc" + state=self.state_filter, + order_by="updated_at", + sort="desc", + iterator=True, ) for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size): @@ -209,7 +213,7 @@ class GitlabConnector(LoadConnector, PollConnector): yield mr_doc_batch if self.include_issues: - issues = project.issues.list(state=self.state_filter) + issues = project.issues.list(state=self.state_filter, iterator=True) for issue_batch in _batch_gitlab_objects(issues, self.batch_size): issue_doc_batch: list[Document] = [] @@ -235,8 +239,8 @@ class GitlabConnector(LoadConnector, PollConnector): def poll_source( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch ) -> GenerateDocumentsOutput: - start_datetime = datetime.utcfromtimestamp(start) - end_datetime = datetime.utcfromtimestamp(end) + start_datetime = datetime.fromtimestamp(start, tz=timezone.utc) + end_datetime = datetime.fromtimestamp(end, tz=timezone.utc) return self._fetch_from_gitlab(start_datetime, end_datetime) diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index 1e1ccc678d8..e45042f9e33 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -60,7 +60,7 @@ pycryptodome==3.19.1 pydantic==2.8.2 PyGithub==2.5.0 python-dateutil==2.8.2 -python-gitlab==3.9.0 +python-gitlab==5.6.0 python-pptx==0.6.23 pypdf==5.4.0 pytest-mock==3.12.0 diff --git a/backend/requirements/dev.txt b/backend/requirements/dev.txt index ec9557502d7..3355a8bcbc8 100644 --- a/backend/requirements/dev.txt +++ b/backend/requirements/dev.txt @@ -2,6 +2,7 @@ black==25.1.0 boto3-stubs[s3]==1.34.133 celery-types==0.19.0 cohere==5.6.1 +faker==37.1.0 lxml==5.3.0 lxml_html_clean==0.2.2 mypy-extensions==1.0.0 diff --git a/backend/tests/daily/connectors/gitlab/test_gitlab_basic.py b/backend/tests/daily/connectors/gitlab/test_gitlab_basic.py new file mode 100644 index 00000000000..6458aca12af --- /dev/null +++ b/backend/tests/daily/connectors/gitlab/test_gitlab_basic.py @@ -0,0 +1,135 @@ +import itertools +import os + +import pytest + +from onyx.configs.constants import DocumentSource +from onyx.connectors.gitlab.connector import GitlabConnector + + +@pytest.fixture +def gitlab_connector() -> GitlabConnector: + connector = GitlabConnector( + project_owner="onyx2895818", + project_name="onyx", + include_mrs=True, + include_issues=True, + include_code_files=True, # Include code files in the test + ) + # Ensure GITLAB_ACCESS_TOKEN and optionally GITLAB_URL are set in the environment + gitlab_url = os.environ.get("GITLAB_URL", "https://gitlab.com") + gitlab_token = os.environ.get("GITLAB_ACCESS_TOKEN") + + if not gitlab_token: + pytest.skip("GITLAB_ACCESS_TOKEN environment variable not set.") + + connector.load_credentials( + { + "gitlab_access_token": gitlab_token, + "gitlab_url": gitlab_url, + } + ) + return connector + + +def test_gitlab_connector_basic(gitlab_connector: GitlabConnector) -> None: + doc_batches = gitlab_connector.load_from_state() + docs = list(itertools.chain(*doc_batches)) + # Assert right number of docs - Adjust if necessary based on test repo state + assert len(docs) == 79 + + # Find one of each type to validate + validated_mr = False + validated_issue = False + validated_code_file = False + gitlab_base_url = os.environ.get("GITLAB_URL", "https://gitlab.com").split("//")[-1] + project_path = f"{gitlab_connector.project_owner}/{gitlab_connector.project_name}" + + # --- Specific Document Details to Validate --- + target_mr_id = f"https://{gitlab_base_url}/{project_path}/-/merge_requests/1" + target_issue_id = f"https://{gitlab_base_url}/{project_path}/-/issues/2" + target_code_file_semantic_id = "README.md" + # --- + + for doc in docs: + # Verify basic document properties (common to all types) + assert doc.source == DocumentSource.GITLAB + assert doc.secondary_owners is None + assert doc.from_ingestion_api is False + assert doc.additional_info is None + assert isinstance(doc.id, str) + assert doc.metadata is not None + assert "type" in doc.metadata + doc_type = doc.metadata["type"] + + # Verify sections (common structure) + assert len(doc.sections) >= 1 + section = doc.sections[0] + assert isinstance(section.link, str) + assert gitlab_base_url in section.link + assert isinstance(section.text, str) + + # --- Type-specific and Content Validation --- + if doc.id == target_mr_id and doc_type == "MergeRequest": + assert doc.metadata["state"] == "opened" + assert doc.semantic_identifier == "Add awesome feature" + assert section.text == "This MR implements the awesome feature" + assert doc.primary_owners is not None + assert len(doc.primary_owners) == 1 + assert ( + doc.primary_owners[0].display_name == "Test" + ) # Adjust if author changes + assert doc.id == section.link + validated_mr = True + elif doc.id == target_issue_id and doc_type == "ISSUE": + assert doc.metadata["state"] == "opened" + assert doc.semantic_identifier == "Investigate performance issue" + assert ( + section.text + == "Investigate and resolve the performance degradation on endpoint X" + ) + assert doc.primary_owners is not None + assert len(doc.primary_owners) == 1 + assert ( + doc.primary_owners[0].display_name == "Test" + ) # Adjust if author changes + assert doc.id == section.link + validated_issue = True + elif ( + doc.semantic_identifier == target_code_file_semantic_id + and doc_type == "CodeFile" + ): + # ID is a git hash (e.g., 'd177...'), Link is the blob URL + assert doc.id != section.link + assert section.link.endswith("/README.md") + assert "# onyx" in section.text # Check for a known part of the content + # Code files might not have primary owners assigned this way + # assert len(doc.primary_owners) == 0 + validated_code_file = True + + # Generic validation for *any* document of the type if specific one not found yet + elif doc_type == "MergeRequest" and not validated_mr: + assert "state" in doc.metadata + assert gitlab_base_url in doc.id # MR ID should be a URL + assert doc.id == section.link # Link and ID are the same URL + elif doc_type == "ISSUE" and not validated_issue: + assert "state" in doc.metadata + assert gitlab_base_url in doc.id # Issue ID should be a URL + assert doc.id == section.link # Link and ID are the same URL + elif doc_type == "CodeFile" and not validated_code_file: + assert doc.id != section.link # ID is GID/hash, link is blob URL + + # Early exit optimization (optional) + # if validated_mr and validated_issue and validated_code_file: + # break + + # Assert that we found and validated the specific documents + assert ( + validated_mr + ), f"Failed to find and validate the specific MergeRequest ({target_mr_id})." + assert ( + validated_issue + ), f"Failed to find and validate the specific Issue ({target_issue_id})." + assert ( + validated_code_file + ), f"Failed to find and validate the specific CodeFile ({target_code_file_semantic_id})." diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx index 81c12c4d27f..032de96db64 100644 --- a/web/src/lib/connectors/connectors.tsx +++ b/web/src/lib/connectors/connectors.tsx @@ -253,24 +253,25 @@ export const connectorConfigs: Record< name: "project_name", optional: false, }, + ], + advanced_values: [ { type: "checkbox", query: "Include merge requests?", label: "Include MRs", name: "include_mrs", + description: "Index merge requests from repositories", default: true, - hidden: true, }, { type: "checkbox", query: "Include issues?", label: "Include Issues", name: "include_issues", - optional: true, - hidden: true, + description: "Index issues from repositories", + default: true, }, ], - advanced_values: [], }, gitbook: { description: "Configure GitBook connector",