mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-27 13:23:28 +02:00
@@ -55,6 +55,8 @@ env:
|
|||||||
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
|
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
|
||||||
# Github
|
# Github
|
||||||
ACCESS_TOKEN_GITHUB: ${{ secrets.ACCESS_TOKEN_GITHUB }}
|
ACCESS_TOKEN_GITHUB: ${{ secrets.ACCESS_TOKEN_GITHUB }}
|
||||||
|
# Gitlab
|
||||||
|
GITLAB_ACCESS_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }}
|
||||||
# Gitbook
|
# Gitbook
|
||||||
GITBOOK_SPACE_ID: ${{ secrets.GITBOOK_SPACE_ID }}
|
GITBOOK_SPACE_ID: ${{ secrets.GITBOOK_SPACE_ID }}
|
||||||
GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}
|
GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}
|
||||||
|
@@ -6,6 +6,7 @@ from collections.abc import Iterator
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
from typing import TypeVar
|
||||||
|
|
||||||
import gitlab
|
import gitlab
|
||||||
import pytz
|
import pytz
|
||||||
@@ -24,6 +25,8 @@ from onyx.connectors.models import Document
|
|||||||
from onyx.connectors.models import TextSection
|
from onyx.connectors.models import TextSection
|
||||||
from onyx.utils.logger import setup_logger
|
from onyx.utils.logger import setup_logger
|
||||||
|
|
||||||
|
T = TypeVar("T")
|
||||||
|
|
||||||
|
|
||||||
logger = setup_logger()
|
logger = setup_logger()
|
||||||
|
|
||||||
@@ -36,9 +39,7 @@ exclude_patterns = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def _batch_gitlab_objects(
|
def _batch_gitlab_objects(git_objs: Iterable[T], batch_size: int) -> Iterator[list[T]]:
|
||||||
git_objs: Iterable[Any], batch_size: int
|
|
||||||
) -> Iterator[list[Any]]:
|
|
||||||
it = iter(git_objs)
|
it = iter(git_objs)
|
||||||
while True:
|
while True:
|
||||||
batch = list(itertools.islice(it, batch_size))
|
batch = list(itertools.islice(it, batch_size))
|
||||||
@@ -154,7 +155,7 @@ class GitlabConnector(LoadConnector, PollConnector):
|
|||||||
) -> GenerateDocumentsOutput:
|
) -> GenerateDocumentsOutput:
|
||||||
if self.gitlab_client is None:
|
if self.gitlab_client is None:
|
||||||
raise ConnectorMissingCredentialError("Gitlab")
|
raise ConnectorMissingCredentialError("Gitlab")
|
||||||
project: gitlab.Project = self.gitlab_client.projects.get(
|
project: Project = self.gitlab_client.projects.get(
|
||||||
f"{self.project_owner}/{self.project_name}"
|
f"{self.project_owner}/{self.project_name}"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -189,7 +190,10 @@ class GitlabConnector(LoadConnector, PollConnector):
|
|||||||
|
|
||||||
if self.include_mrs:
|
if self.include_mrs:
|
||||||
merge_requests = project.mergerequests.list(
|
merge_requests = project.mergerequests.list(
|
||||||
state=self.state_filter, order_by="updated_at", sort="desc"
|
state=self.state_filter,
|
||||||
|
order_by="updated_at",
|
||||||
|
sort="desc",
|
||||||
|
iterator=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size):
|
for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size):
|
||||||
@@ -209,7 +213,7 @@ class GitlabConnector(LoadConnector, PollConnector):
|
|||||||
yield mr_doc_batch
|
yield mr_doc_batch
|
||||||
|
|
||||||
if self.include_issues:
|
if self.include_issues:
|
||||||
issues = project.issues.list(state=self.state_filter)
|
issues = project.issues.list(state=self.state_filter, iterator=True)
|
||||||
|
|
||||||
for issue_batch in _batch_gitlab_objects(issues, self.batch_size):
|
for issue_batch in _batch_gitlab_objects(issues, self.batch_size):
|
||||||
issue_doc_batch: list[Document] = []
|
issue_doc_batch: list[Document] = []
|
||||||
@@ -235,8 +239,8 @@ class GitlabConnector(LoadConnector, PollConnector):
|
|||||||
def poll_source(
|
def poll_source(
|
||||||
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
||||||
) -> GenerateDocumentsOutput:
|
) -> GenerateDocumentsOutput:
|
||||||
start_datetime = datetime.utcfromtimestamp(start)
|
start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
|
||||||
end_datetime = datetime.utcfromtimestamp(end)
|
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
|
||||||
return self._fetch_from_gitlab(start_datetime, end_datetime)
|
return self._fetch_from_gitlab(start_datetime, end_datetime)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -60,7 +60,7 @@ pycryptodome==3.19.1
|
|||||||
pydantic==2.8.2
|
pydantic==2.8.2
|
||||||
PyGithub==2.5.0
|
PyGithub==2.5.0
|
||||||
python-dateutil==2.8.2
|
python-dateutil==2.8.2
|
||||||
python-gitlab==3.9.0
|
python-gitlab==5.6.0
|
||||||
python-pptx==0.6.23
|
python-pptx==0.6.23
|
||||||
pypdf==5.4.0
|
pypdf==5.4.0
|
||||||
pytest-mock==3.12.0
|
pytest-mock==3.12.0
|
||||||
|
@@ -2,6 +2,7 @@ black==25.1.0
|
|||||||
boto3-stubs[s3]==1.34.133
|
boto3-stubs[s3]==1.34.133
|
||||||
celery-types==0.19.0
|
celery-types==0.19.0
|
||||||
cohere==5.6.1
|
cohere==5.6.1
|
||||||
|
faker==37.1.0
|
||||||
lxml==5.3.0
|
lxml==5.3.0
|
||||||
lxml_html_clean==0.2.2
|
lxml_html_clean==0.2.2
|
||||||
mypy-extensions==1.0.0
|
mypy-extensions==1.0.0
|
||||||
|
135
backend/tests/daily/connectors/gitlab/test_gitlab_basic.py
Normal file
135
backend/tests/daily/connectors/gitlab/test_gitlab_basic.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
import itertools
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from onyx.configs.constants import DocumentSource
|
||||||
|
from onyx.connectors.gitlab.connector import GitlabConnector
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def gitlab_connector() -> GitlabConnector:
|
||||||
|
connector = GitlabConnector(
|
||||||
|
project_owner="onyx2895818",
|
||||||
|
project_name="onyx",
|
||||||
|
include_mrs=True,
|
||||||
|
include_issues=True,
|
||||||
|
include_code_files=True, # Include code files in the test
|
||||||
|
)
|
||||||
|
# Ensure GITLAB_ACCESS_TOKEN and optionally GITLAB_URL are set in the environment
|
||||||
|
gitlab_url = os.environ.get("GITLAB_URL", "https://gitlab.com")
|
||||||
|
gitlab_token = os.environ.get("GITLAB_ACCESS_TOKEN")
|
||||||
|
|
||||||
|
if not gitlab_token:
|
||||||
|
pytest.skip("GITLAB_ACCESS_TOKEN environment variable not set.")
|
||||||
|
|
||||||
|
connector.load_credentials(
|
||||||
|
{
|
||||||
|
"gitlab_access_token": gitlab_token,
|
||||||
|
"gitlab_url": gitlab_url,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return connector
|
||||||
|
|
||||||
|
|
||||||
|
def test_gitlab_connector_basic(gitlab_connector: GitlabConnector) -> None:
|
||||||
|
doc_batches = gitlab_connector.load_from_state()
|
||||||
|
docs = list(itertools.chain(*doc_batches))
|
||||||
|
# Assert right number of docs - Adjust if necessary based on test repo state
|
||||||
|
assert len(docs) == 79
|
||||||
|
|
||||||
|
# Find one of each type to validate
|
||||||
|
validated_mr = False
|
||||||
|
validated_issue = False
|
||||||
|
validated_code_file = False
|
||||||
|
gitlab_base_url = os.environ.get("GITLAB_URL", "https://gitlab.com").split("//")[-1]
|
||||||
|
project_path = f"{gitlab_connector.project_owner}/{gitlab_connector.project_name}"
|
||||||
|
|
||||||
|
# --- Specific Document Details to Validate ---
|
||||||
|
target_mr_id = f"https://{gitlab_base_url}/{project_path}/-/merge_requests/1"
|
||||||
|
target_issue_id = f"https://{gitlab_base_url}/{project_path}/-/issues/2"
|
||||||
|
target_code_file_semantic_id = "README.md"
|
||||||
|
# ---
|
||||||
|
|
||||||
|
for doc in docs:
|
||||||
|
# Verify basic document properties (common to all types)
|
||||||
|
assert doc.source == DocumentSource.GITLAB
|
||||||
|
assert doc.secondary_owners is None
|
||||||
|
assert doc.from_ingestion_api is False
|
||||||
|
assert doc.additional_info is None
|
||||||
|
assert isinstance(doc.id, str)
|
||||||
|
assert doc.metadata is not None
|
||||||
|
assert "type" in doc.metadata
|
||||||
|
doc_type = doc.metadata["type"]
|
||||||
|
|
||||||
|
# Verify sections (common structure)
|
||||||
|
assert len(doc.sections) >= 1
|
||||||
|
section = doc.sections[0]
|
||||||
|
assert isinstance(section.link, str)
|
||||||
|
assert gitlab_base_url in section.link
|
||||||
|
assert isinstance(section.text, str)
|
||||||
|
|
||||||
|
# --- Type-specific and Content Validation ---
|
||||||
|
if doc.id == target_mr_id and doc_type == "MergeRequest":
|
||||||
|
assert doc.metadata["state"] == "opened"
|
||||||
|
assert doc.semantic_identifier == "Add awesome feature"
|
||||||
|
assert section.text == "This MR implements the awesome feature"
|
||||||
|
assert doc.primary_owners is not None
|
||||||
|
assert len(doc.primary_owners) == 1
|
||||||
|
assert (
|
||||||
|
doc.primary_owners[0].display_name == "Test"
|
||||||
|
) # Adjust if author changes
|
||||||
|
assert doc.id == section.link
|
||||||
|
validated_mr = True
|
||||||
|
elif doc.id == target_issue_id and doc_type == "ISSUE":
|
||||||
|
assert doc.metadata["state"] == "opened"
|
||||||
|
assert doc.semantic_identifier == "Investigate performance issue"
|
||||||
|
assert (
|
||||||
|
section.text
|
||||||
|
== "Investigate and resolve the performance degradation on endpoint X"
|
||||||
|
)
|
||||||
|
assert doc.primary_owners is not None
|
||||||
|
assert len(doc.primary_owners) == 1
|
||||||
|
assert (
|
||||||
|
doc.primary_owners[0].display_name == "Test"
|
||||||
|
) # Adjust if author changes
|
||||||
|
assert doc.id == section.link
|
||||||
|
validated_issue = True
|
||||||
|
elif (
|
||||||
|
doc.semantic_identifier == target_code_file_semantic_id
|
||||||
|
and doc_type == "CodeFile"
|
||||||
|
):
|
||||||
|
# ID is a git hash (e.g., 'd177...'), Link is the blob URL
|
||||||
|
assert doc.id != section.link
|
||||||
|
assert section.link.endswith("/README.md")
|
||||||
|
assert "# onyx" in section.text # Check for a known part of the content
|
||||||
|
# Code files might not have primary owners assigned this way
|
||||||
|
# assert len(doc.primary_owners) == 0
|
||||||
|
validated_code_file = True
|
||||||
|
|
||||||
|
# Generic validation for *any* document of the type if specific one not found yet
|
||||||
|
elif doc_type == "MergeRequest" and not validated_mr:
|
||||||
|
assert "state" in doc.metadata
|
||||||
|
assert gitlab_base_url in doc.id # MR ID should be a URL
|
||||||
|
assert doc.id == section.link # Link and ID are the same URL
|
||||||
|
elif doc_type == "ISSUE" and not validated_issue:
|
||||||
|
assert "state" in doc.metadata
|
||||||
|
assert gitlab_base_url in doc.id # Issue ID should be a URL
|
||||||
|
assert doc.id == section.link # Link and ID are the same URL
|
||||||
|
elif doc_type == "CodeFile" and not validated_code_file:
|
||||||
|
assert doc.id != section.link # ID is GID/hash, link is blob URL
|
||||||
|
|
||||||
|
# Early exit optimization (optional)
|
||||||
|
# if validated_mr and validated_issue and validated_code_file:
|
||||||
|
# break
|
||||||
|
|
||||||
|
# Assert that we found and validated the specific documents
|
||||||
|
assert (
|
||||||
|
validated_mr
|
||||||
|
), f"Failed to find and validate the specific MergeRequest ({target_mr_id})."
|
||||||
|
assert (
|
||||||
|
validated_issue
|
||||||
|
), f"Failed to find and validate the specific Issue ({target_issue_id})."
|
||||||
|
assert (
|
||||||
|
validated_code_file
|
||||||
|
), f"Failed to find and validate the specific CodeFile ({target_code_file_semantic_id})."
|
@@ -253,24 +253,25 @@ export const connectorConfigs: Record<
|
|||||||
name: "project_name",
|
name: "project_name",
|
||||||
optional: false,
|
optional: false,
|
||||||
},
|
},
|
||||||
|
],
|
||||||
|
advanced_values: [
|
||||||
{
|
{
|
||||||
type: "checkbox",
|
type: "checkbox",
|
||||||
query: "Include merge requests?",
|
query: "Include merge requests?",
|
||||||
label: "Include MRs",
|
label: "Include MRs",
|
||||||
name: "include_mrs",
|
name: "include_mrs",
|
||||||
|
description: "Index merge requests from repositories",
|
||||||
default: true,
|
default: true,
|
||||||
hidden: true,
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
type: "checkbox",
|
type: "checkbox",
|
||||||
query: "Include issues?",
|
query: "Include issues?",
|
||||||
label: "Include Issues",
|
label: "Include Issues",
|
||||||
name: "include_issues",
|
name: "include_issues",
|
||||||
optional: true,
|
description: "Index issues from repositories",
|
||||||
hidden: true,
|
default: true,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
advanced_values: [],
|
|
||||||
},
|
},
|
||||||
gitbook: {
|
gitbook: {
|
||||||
description: "Configure GitBook connector",
|
description: "Configure GitBook connector",
|
||||||
|
Reference in New Issue
Block a user