Fix gitlab (#4629)

* Fix gitlab

* Add back assert
This commit is contained in:
Chris Weaver
2025-04-28 17:42:49 -07:00
committed by GitHub
parent 8961a3cc72
commit 9be3da2357
6 changed files with 156 additions and 13 deletions

View File

@@ -55,6 +55,8 @@ env:
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
# Github
ACCESS_TOKEN_GITHUB: ${{ secrets.ACCESS_TOKEN_GITHUB }}
# Gitlab
GITLAB_ACCESS_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }}
# Gitbook
GITBOOK_SPACE_ID: ${{ secrets.GITBOOK_SPACE_ID }}
GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}

View File

@@ -6,6 +6,7 @@ from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import TypeVar
import gitlab
import pytz
@@ -24,6 +25,8 @@ from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
T = TypeVar("T")
logger = setup_logger()
@@ -36,9 +39,7 @@ exclude_patterns = [
]
def _batch_gitlab_objects(
git_objs: Iterable[Any], batch_size: int
) -> Iterator[list[Any]]:
def _batch_gitlab_objects(git_objs: Iterable[T], batch_size: int) -> Iterator[list[T]]:
it = iter(git_objs)
while True:
batch = list(itertools.islice(it, batch_size))
@@ -154,7 +155,7 @@ class GitlabConnector(LoadConnector, PollConnector):
) -> GenerateDocumentsOutput:
if self.gitlab_client is None:
raise ConnectorMissingCredentialError("Gitlab")
project: gitlab.Project = self.gitlab_client.projects.get(
project: Project = self.gitlab_client.projects.get(
f"{self.project_owner}/{self.project_name}"
)
@@ -189,7 +190,10 @@ class GitlabConnector(LoadConnector, PollConnector):
if self.include_mrs:
merge_requests = project.mergerequests.list(
state=self.state_filter, order_by="updated_at", sort="desc"
state=self.state_filter,
order_by="updated_at",
sort="desc",
iterator=True,
)
for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size):
@@ -209,7 +213,7 @@ class GitlabConnector(LoadConnector, PollConnector):
yield mr_doc_batch
if self.include_issues:
issues = project.issues.list(state=self.state_filter)
issues = project.issues.list(state=self.state_filter, iterator=True)
for issue_batch in _batch_gitlab_objects(issues, self.batch_size):
issue_doc_batch: list[Document] = []
@@ -235,8 +239,8 @@ class GitlabConnector(LoadConnector, PollConnector):
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
start_datetime = datetime.utcfromtimestamp(start)
end_datetime = datetime.utcfromtimestamp(end)
start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
return self._fetch_from_gitlab(start_datetime, end_datetime)

View File

@@ -60,7 +60,7 @@ pycryptodome==3.19.1
pydantic==2.8.2
PyGithub==2.5.0
python-dateutil==2.8.2
python-gitlab==3.9.0
python-gitlab==5.6.0
python-pptx==0.6.23
pypdf==5.4.0
pytest-mock==3.12.0

View File

@@ -2,6 +2,7 @@ black==25.1.0
boto3-stubs[s3]==1.34.133
celery-types==0.19.0
cohere==5.6.1
faker==37.1.0
lxml==5.3.0
lxml_html_clean==0.2.2
mypy-extensions==1.0.0

View File

@@ -0,0 +1,135 @@
import itertools
import os
import pytest
from onyx.configs.constants import DocumentSource
from onyx.connectors.gitlab.connector import GitlabConnector
@pytest.fixture
def gitlab_connector() -> GitlabConnector:
connector = GitlabConnector(
project_owner="onyx2895818",
project_name="onyx",
include_mrs=True,
include_issues=True,
include_code_files=True, # Include code files in the test
)
# Ensure GITLAB_ACCESS_TOKEN and optionally GITLAB_URL are set in the environment
gitlab_url = os.environ.get("GITLAB_URL", "https://gitlab.com")
gitlab_token = os.environ.get("GITLAB_ACCESS_TOKEN")
if not gitlab_token:
pytest.skip("GITLAB_ACCESS_TOKEN environment variable not set.")
connector.load_credentials(
{
"gitlab_access_token": gitlab_token,
"gitlab_url": gitlab_url,
}
)
return connector
def test_gitlab_connector_basic(gitlab_connector: GitlabConnector) -> None:
doc_batches = gitlab_connector.load_from_state()
docs = list(itertools.chain(*doc_batches))
# Assert right number of docs - Adjust if necessary based on test repo state
assert len(docs) == 79
# Find one of each type to validate
validated_mr = False
validated_issue = False
validated_code_file = False
gitlab_base_url = os.environ.get("GITLAB_URL", "https://gitlab.com").split("//")[-1]
project_path = f"{gitlab_connector.project_owner}/{gitlab_connector.project_name}"
# --- Specific Document Details to Validate ---
target_mr_id = f"https://{gitlab_base_url}/{project_path}/-/merge_requests/1"
target_issue_id = f"https://{gitlab_base_url}/{project_path}/-/issues/2"
target_code_file_semantic_id = "README.md"
# ---
for doc in docs:
# Verify basic document properties (common to all types)
assert doc.source == DocumentSource.GITLAB
assert doc.secondary_owners is None
assert doc.from_ingestion_api is False
assert doc.additional_info is None
assert isinstance(doc.id, str)
assert doc.metadata is not None
assert "type" in doc.metadata
doc_type = doc.metadata["type"]
# Verify sections (common structure)
assert len(doc.sections) >= 1
section = doc.sections[0]
assert isinstance(section.link, str)
assert gitlab_base_url in section.link
assert isinstance(section.text, str)
# --- Type-specific and Content Validation ---
if doc.id == target_mr_id and doc_type == "MergeRequest":
assert doc.metadata["state"] == "opened"
assert doc.semantic_identifier == "Add awesome feature"
assert section.text == "This MR implements the awesome feature"
assert doc.primary_owners is not None
assert len(doc.primary_owners) == 1
assert (
doc.primary_owners[0].display_name == "Test"
) # Adjust if author changes
assert doc.id == section.link
validated_mr = True
elif doc.id == target_issue_id and doc_type == "ISSUE":
assert doc.metadata["state"] == "opened"
assert doc.semantic_identifier == "Investigate performance issue"
assert (
section.text
== "Investigate and resolve the performance degradation on endpoint X"
)
assert doc.primary_owners is not None
assert len(doc.primary_owners) == 1
assert (
doc.primary_owners[0].display_name == "Test"
) # Adjust if author changes
assert doc.id == section.link
validated_issue = True
elif (
doc.semantic_identifier == target_code_file_semantic_id
and doc_type == "CodeFile"
):
# ID is a git hash (e.g., 'd177...'), Link is the blob URL
assert doc.id != section.link
assert section.link.endswith("/README.md")
assert "# onyx" in section.text # Check for a known part of the content
# Code files might not have primary owners assigned this way
# assert len(doc.primary_owners) == 0
validated_code_file = True
# Generic validation for *any* document of the type if specific one not found yet
elif doc_type == "MergeRequest" and not validated_mr:
assert "state" in doc.metadata
assert gitlab_base_url in doc.id # MR ID should be a URL
assert doc.id == section.link # Link and ID are the same URL
elif doc_type == "ISSUE" and not validated_issue:
assert "state" in doc.metadata
assert gitlab_base_url in doc.id # Issue ID should be a URL
assert doc.id == section.link # Link and ID are the same URL
elif doc_type == "CodeFile" and not validated_code_file:
assert doc.id != section.link # ID is GID/hash, link is blob URL
# Early exit optimization (optional)
# if validated_mr and validated_issue and validated_code_file:
# break
# Assert that we found and validated the specific documents
assert (
validated_mr
), f"Failed to find and validate the specific MergeRequest ({target_mr_id})."
assert (
validated_issue
), f"Failed to find and validate the specific Issue ({target_issue_id})."
assert (
validated_code_file
), f"Failed to find and validate the specific CodeFile ({target_code_file_semantic_id})."

View File

@@ -253,24 +253,25 @@ export const connectorConfigs: Record<
name: "project_name",
optional: false,
},
],
advanced_values: [
{
type: "checkbox",
query: "Include merge requests?",
label: "Include MRs",
name: "include_mrs",
description: "Index merge requests from repositories",
default: true,
hidden: true,
},
{
type: "checkbox",
query: "Include issues?",
label: "Include Issues",
name: "include_issues",
optional: true,
hidden: true,
description: "Index issues from repositories",
default: true,
},
],
advanced_values: [],
},
gitbook: {
description: "Configure GitBook connector",