Fix gitlab (#4629)

* Fix gitlab

* Add back assert
This commit is contained in:
Chris Weaver
2025-04-28 17:42:49 -07:00
committed by GitHub
parent 8961a3cc72
commit 9be3da2357
6 changed files with 156 additions and 13 deletions

View File

@@ -55,6 +55,8 @@ env:
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }} SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
# Github # Github
ACCESS_TOKEN_GITHUB: ${{ secrets.ACCESS_TOKEN_GITHUB }} ACCESS_TOKEN_GITHUB: ${{ secrets.ACCESS_TOKEN_GITHUB }}
# Gitlab
GITLAB_ACCESS_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }}
# Gitbook # Gitbook
GITBOOK_SPACE_ID: ${{ secrets.GITBOOK_SPACE_ID }} GITBOOK_SPACE_ID: ${{ secrets.GITBOOK_SPACE_ID }}
GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }} GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}

View File

@@ -6,6 +6,7 @@ from collections.abc import Iterator
from datetime import datetime from datetime import datetime
from datetime import timezone from datetime import timezone
from typing import Any from typing import Any
from typing import TypeVar
import gitlab import gitlab
import pytz import pytz
@@ -24,6 +25,8 @@ from onyx.connectors.models import Document
from onyx.connectors.models import TextSection from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger from onyx.utils.logger import setup_logger
T = TypeVar("T")
logger = setup_logger() logger = setup_logger()
@@ -36,9 +39,7 @@ exclude_patterns = [
] ]
def _batch_gitlab_objects( def _batch_gitlab_objects(git_objs: Iterable[T], batch_size: int) -> Iterator[list[T]]:
git_objs: Iterable[Any], batch_size: int
) -> Iterator[list[Any]]:
it = iter(git_objs) it = iter(git_objs)
while True: while True:
batch = list(itertools.islice(it, batch_size)) batch = list(itertools.islice(it, batch_size))
@@ -154,7 +155,7 @@ class GitlabConnector(LoadConnector, PollConnector):
) -> GenerateDocumentsOutput: ) -> GenerateDocumentsOutput:
if self.gitlab_client is None: if self.gitlab_client is None:
raise ConnectorMissingCredentialError("Gitlab") raise ConnectorMissingCredentialError("Gitlab")
project: gitlab.Project = self.gitlab_client.projects.get( project: Project = self.gitlab_client.projects.get(
f"{self.project_owner}/{self.project_name}" f"{self.project_owner}/{self.project_name}"
) )
@@ -189,7 +190,10 @@ class GitlabConnector(LoadConnector, PollConnector):
if self.include_mrs: if self.include_mrs:
merge_requests = project.mergerequests.list( merge_requests = project.mergerequests.list(
state=self.state_filter, order_by="updated_at", sort="desc" state=self.state_filter,
order_by="updated_at",
sort="desc",
iterator=True,
) )
for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size): for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size):
@@ -209,7 +213,7 @@ class GitlabConnector(LoadConnector, PollConnector):
yield mr_doc_batch yield mr_doc_batch
if self.include_issues: if self.include_issues:
issues = project.issues.list(state=self.state_filter) issues = project.issues.list(state=self.state_filter, iterator=True)
for issue_batch in _batch_gitlab_objects(issues, self.batch_size): for issue_batch in _batch_gitlab_objects(issues, self.batch_size):
issue_doc_batch: list[Document] = [] issue_doc_batch: list[Document] = []
@@ -235,8 +239,8 @@ class GitlabConnector(LoadConnector, PollConnector):
def poll_source( def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput: ) -> GenerateDocumentsOutput:
start_datetime = datetime.utcfromtimestamp(start) start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
end_datetime = datetime.utcfromtimestamp(end) end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
return self._fetch_from_gitlab(start_datetime, end_datetime) return self._fetch_from_gitlab(start_datetime, end_datetime)

View File

@@ -60,7 +60,7 @@ pycryptodome==3.19.1
pydantic==2.8.2 pydantic==2.8.2
PyGithub==2.5.0 PyGithub==2.5.0
python-dateutil==2.8.2 python-dateutil==2.8.2
python-gitlab==3.9.0 python-gitlab==5.6.0
python-pptx==0.6.23 python-pptx==0.6.23
pypdf==5.4.0 pypdf==5.4.0
pytest-mock==3.12.0 pytest-mock==3.12.0

View File

@@ -2,6 +2,7 @@ black==25.1.0
boto3-stubs[s3]==1.34.133 boto3-stubs[s3]==1.34.133
celery-types==0.19.0 celery-types==0.19.0
cohere==5.6.1 cohere==5.6.1
faker==37.1.0
lxml==5.3.0 lxml==5.3.0
lxml_html_clean==0.2.2 lxml_html_clean==0.2.2
mypy-extensions==1.0.0 mypy-extensions==1.0.0

View File

@@ -0,0 +1,135 @@
import itertools
import os
import pytest
from onyx.configs.constants import DocumentSource
from onyx.connectors.gitlab.connector import GitlabConnector
@pytest.fixture
def gitlab_connector() -> GitlabConnector:
connector = GitlabConnector(
project_owner="onyx2895818",
project_name="onyx",
include_mrs=True,
include_issues=True,
include_code_files=True, # Include code files in the test
)
# Ensure GITLAB_ACCESS_TOKEN and optionally GITLAB_URL are set in the environment
gitlab_url = os.environ.get("GITLAB_URL", "https://gitlab.com")
gitlab_token = os.environ.get("GITLAB_ACCESS_TOKEN")
if not gitlab_token:
pytest.skip("GITLAB_ACCESS_TOKEN environment variable not set.")
connector.load_credentials(
{
"gitlab_access_token": gitlab_token,
"gitlab_url": gitlab_url,
}
)
return connector
def test_gitlab_connector_basic(gitlab_connector: GitlabConnector) -> None:
doc_batches = gitlab_connector.load_from_state()
docs = list(itertools.chain(*doc_batches))
# Assert right number of docs - Adjust if necessary based on test repo state
assert len(docs) == 79
# Find one of each type to validate
validated_mr = False
validated_issue = False
validated_code_file = False
gitlab_base_url = os.environ.get("GITLAB_URL", "https://gitlab.com").split("//")[-1]
project_path = f"{gitlab_connector.project_owner}/{gitlab_connector.project_name}"
# --- Specific Document Details to Validate ---
target_mr_id = f"https://{gitlab_base_url}/{project_path}/-/merge_requests/1"
target_issue_id = f"https://{gitlab_base_url}/{project_path}/-/issues/2"
target_code_file_semantic_id = "README.md"
# ---
for doc in docs:
# Verify basic document properties (common to all types)
assert doc.source == DocumentSource.GITLAB
assert doc.secondary_owners is None
assert doc.from_ingestion_api is False
assert doc.additional_info is None
assert isinstance(doc.id, str)
assert doc.metadata is not None
assert "type" in doc.metadata
doc_type = doc.metadata["type"]
# Verify sections (common structure)
assert len(doc.sections) >= 1
section = doc.sections[0]
assert isinstance(section.link, str)
assert gitlab_base_url in section.link
assert isinstance(section.text, str)
# --- Type-specific and Content Validation ---
if doc.id == target_mr_id and doc_type == "MergeRequest":
assert doc.metadata["state"] == "opened"
assert doc.semantic_identifier == "Add awesome feature"
assert section.text == "This MR implements the awesome feature"
assert doc.primary_owners is not None
assert len(doc.primary_owners) == 1
assert (
doc.primary_owners[0].display_name == "Test"
) # Adjust if author changes
assert doc.id == section.link
validated_mr = True
elif doc.id == target_issue_id and doc_type == "ISSUE":
assert doc.metadata["state"] == "opened"
assert doc.semantic_identifier == "Investigate performance issue"
assert (
section.text
== "Investigate and resolve the performance degradation on endpoint X"
)
assert doc.primary_owners is not None
assert len(doc.primary_owners) == 1
assert (
doc.primary_owners[0].display_name == "Test"
) # Adjust if author changes
assert doc.id == section.link
validated_issue = True
elif (
doc.semantic_identifier == target_code_file_semantic_id
and doc_type == "CodeFile"
):
# ID is a git hash (e.g., 'd177...'), Link is the blob URL
assert doc.id != section.link
assert section.link.endswith("/README.md")
assert "# onyx" in section.text # Check for a known part of the content
# Code files might not have primary owners assigned this way
# assert len(doc.primary_owners) == 0
validated_code_file = True
# Generic validation for *any* document of the type if specific one not found yet
elif doc_type == "MergeRequest" and not validated_mr:
assert "state" in doc.metadata
assert gitlab_base_url in doc.id # MR ID should be a URL
assert doc.id == section.link # Link and ID are the same URL
elif doc_type == "ISSUE" and not validated_issue:
assert "state" in doc.metadata
assert gitlab_base_url in doc.id # Issue ID should be a URL
assert doc.id == section.link # Link and ID are the same URL
elif doc_type == "CodeFile" and not validated_code_file:
assert doc.id != section.link # ID is GID/hash, link is blob URL
# Early exit optimization (optional)
# if validated_mr and validated_issue and validated_code_file:
# break
# Assert that we found and validated the specific documents
assert (
validated_mr
), f"Failed to find and validate the specific MergeRequest ({target_mr_id})."
assert (
validated_issue
), f"Failed to find and validate the specific Issue ({target_issue_id})."
assert (
validated_code_file
), f"Failed to find and validate the specific CodeFile ({target_code_file_semantic_id})."

View File

@@ -253,24 +253,25 @@ export const connectorConfigs: Record<
name: "project_name", name: "project_name",
optional: false, optional: false,
}, },
],
advanced_values: [
{ {
type: "checkbox", type: "checkbox",
query: "Include merge requests?", query: "Include merge requests?",
label: "Include MRs", label: "Include MRs",
name: "include_mrs", name: "include_mrs",
description: "Index merge requests from repositories",
default: true, default: true,
hidden: true,
}, },
{ {
type: "checkbox", type: "checkbox",
query: "Include issues?", query: "Include issues?",
label: "Include Issues", label: "Include Issues",
name: "include_issues", name: "include_issues",
optional: true, description: "Index issues from repositories",
hidden: true, default: true,
}, },
], ],
advanced_values: [],
}, },
gitbook: { gitbook: {
description: "Configure GitBook connector", description: "Configure GitBook connector",