mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-12 09:00:53 +02:00
checkpointed confluence (#4473)
* checkpointed confluence * confluence checkpointing tested * fixed integration tests * attempt to fix connector test flakiness * fix rebase
This commit is contained in:
parent
742041d97a
commit
ae9f8c3071
@ -7,7 +7,7 @@ from sqlalchemy.orm import Session
|
|||||||
|
|
||||||
from onyx.configs.constants import FileOrigin
|
from onyx.configs.constants import FileOrigin
|
||||||
from onyx.connectors.interfaces import BaseConnector
|
from onyx.connectors.interfaces import BaseConnector
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.models import ConnectorCheckpoint
|
from onyx.connectors.models import ConnectorCheckpoint
|
||||||
from onyx.db.engine import get_db_current_time
|
from onyx.db.engine import get_db_current_time
|
||||||
from onyx.db.index_attempt import get_index_attempt
|
from onyx.db.index_attempt import get_index_attempt
|
||||||
@ -61,7 +61,7 @@ def load_checkpoint(
|
|||||||
try:
|
try:
|
||||||
checkpoint_io = file_store.read_file(checkpoint_pointer, mode="rb")
|
checkpoint_io = file_store.read_file(checkpoint_pointer, mode="rb")
|
||||||
checkpoint_data = checkpoint_io.read().decode("utf-8")
|
checkpoint_data = checkpoint_io.read().decode("utf-8")
|
||||||
if isinstance(connector, CheckpointConnector):
|
if isinstance(connector, CheckpointedConnector):
|
||||||
return connector.validate_checkpoint_json(checkpoint_data)
|
return connector.validate_checkpoint_json(checkpoint_data)
|
||||||
return ConnectorCheckpoint.model_validate_json(checkpoint_data)
|
return ConnectorCheckpoint.model_validate_json(checkpoint_data)
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import copy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
@ -5,6 +6,7 @@ from typing import Any
|
|||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
from requests.exceptions import HTTPError
|
from requests.exceptions import HTTPError
|
||||||
|
from typing_extensions import override
|
||||||
|
|
||||||
from onyx.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
|
from onyx.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
|
||||||
from onyx.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
|
from onyx.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
|
||||||
@ -22,17 +24,19 @@ from onyx.connectors.exceptions import ConnectorValidationError
|
|||||||
from onyx.connectors.exceptions import CredentialExpiredError
|
from onyx.connectors.exceptions import CredentialExpiredError
|
||||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||||
from onyx.connectors.exceptions import UnexpectedValidationError
|
from onyx.connectors.exceptions import UnexpectedValidationError
|
||||||
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
|
from onyx.connectors.interfaces import CheckpointOutput
|
||||||
|
from onyx.connectors.interfaces import ConnectorCheckpoint
|
||||||
|
from onyx.connectors.interfaces import ConnectorFailure
|
||||||
from onyx.connectors.interfaces import CredentialsConnector
|
from onyx.connectors.interfaces import CredentialsConnector
|
||||||
from onyx.connectors.interfaces import CredentialsProviderInterface
|
from onyx.connectors.interfaces import CredentialsProviderInterface
|
||||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
|
||||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||||
from onyx.connectors.interfaces import LoadConnector
|
|
||||||
from onyx.connectors.interfaces import PollConnector
|
|
||||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||||
from onyx.connectors.interfaces import SlimConnector
|
from onyx.connectors.interfaces import SlimConnector
|
||||||
from onyx.connectors.models import BasicExpertInfo
|
from onyx.connectors.models import BasicExpertInfo
|
||||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||||
from onyx.connectors.models import Document
|
from onyx.connectors.models import Document
|
||||||
|
from onyx.connectors.models import DocumentFailure
|
||||||
from onyx.connectors.models import ImageSection
|
from onyx.connectors.models import ImageSection
|
||||||
from onyx.connectors.models import SlimDocument
|
from onyx.connectors.models import SlimDocument
|
||||||
from onyx.connectors.models import TextSection
|
from onyx.connectors.models import TextSection
|
||||||
@ -68,9 +72,16 @@ _SLIM_DOC_BATCH_SIZE = 5000
|
|||||||
ONE_HOUR = 3600
|
ONE_HOUR = 3600
|
||||||
|
|
||||||
|
|
||||||
|
def _should_propagate_error(e: Exception) -> bool:
|
||||||
|
return "field 'updated' is invalid" in str(e)
|
||||||
|
|
||||||
|
|
||||||
|
class ConfluenceCheckpoint(ConnectorCheckpoint):
|
||||||
|
last_updated: SecondsSinceUnixEpoch
|
||||||
|
|
||||||
|
|
||||||
class ConfluenceConnector(
|
class ConfluenceConnector(
|
||||||
LoadConnector,
|
CheckpointedConnector[ConfluenceCheckpoint],
|
||||||
PollConnector,
|
|
||||||
SlimConnector,
|
SlimConnector,
|
||||||
CredentialsConnector,
|
CredentialsConnector,
|
||||||
):
|
):
|
||||||
@ -211,6 +222,8 @@ class ConfluenceConnector(
|
|||||||
"%Y-%m-%d %H:%M"
|
"%Y-%m-%d %H:%M"
|
||||||
)
|
)
|
||||||
page_query += f" and lastmodified <= '{formatted_end_time}'"
|
page_query += f" and lastmodified <= '{formatted_end_time}'"
|
||||||
|
|
||||||
|
page_query += " order by lastmodified asc"
|
||||||
return page_query
|
return page_query
|
||||||
|
|
||||||
def _construct_attachment_query(self, confluence_page_id: str) -> str:
|
def _construct_attachment_query(self, confluence_page_id: str) -> str:
|
||||||
@ -236,11 +249,14 @@ class ConfluenceConnector(
|
|||||||
)
|
)
|
||||||
return comment_string
|
return comment_string
|
||||||
|
|
||||||
def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
|
def _convert_page_to_document(
|
||||||
|
self, page: dict[str, Any]
|
||||||
|
) -> Document | ConnectorFailure:
|
||||||
"""
|
"""
|
||||||
Converts a Confluence page to a Document object.
|
Converts a Confluence page to a Document object.
|
||||||
Includes the page content, comments, and attachments.
|
Includes the page content, comments, and attachments.
|
||||||
"""
|
"""
|
||||||
|
page_id = page_url = ""
|
||||||
try:
|
try:
|
||||||
# Extract basic page information
|
# Extract basic page information
|
||||||
page_id = page["id"]
|
page_id = page["id"]
|
||||||
@ -336,15 +352,90 @@ class ConfluenceConnector(
|
|||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error converting page {page.get('id', 'unknown')}: {e}")
|
logger.error(f"Error converting page {page.get('id', 'unknown')}: {e}")
|
||||||
if not self.continue_on_failure:
|
if _should_propagate_error(e):
|
||||||
raise
|
raise
|
||||||
return None
|
return ConnectorFailure(
|
||||||
|
failed_document=DocumentFailure(
|
||||||
|
document_id=page_id,
|
||||||
|
document_link=page_url,
|
||||||
|
),
|
||||||
|
failure_message=f"Error converting page {page.get('id', 'unknown')}: {e}",
|
||||||
|
exception=e,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _fetch_page_attachments(
|
||||||
|
self, page: dict[str, Any], doc: Document
|
||||||
|
) -> Document | ConnectorFailure:
|
||||||
|
attachment_query = self._construct_attachment_query(page["id"])
|
||||||
|
|
||||||
|
for attachment in self.confluence_client.paginated_cql_retrieval(
|
||||||
|
cql=attachment_query,
|
||||||
|
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
|
||||||
|
):
|
||||||
|
attachment["metadata"].get("mediaType", "")
|
||||||
|
if not validate_attachment_filetype(
|
||||||
|
attachment,
|
||||||
|
):
|
||||||
|
logger.info(f"Skipping attachment: {attachment['title']}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.info(f"Processing attachment: {attachment['title']}")
|
||||||
|
|
||||||
|
# Attempt to get textual content or image summarization:
|
||||||
|
object_url = build_confluence_document_id(
|
||||||
|
self.wiki_base, attachment["_links"]["webui"], self.is_cloud
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
response = convert_attachment_to_content(
|
||||||
|
confluence_client=self.confluence_client,
|
||||||
|
attachment=attachment,
|
||||||
|
page_id=page["id"],
|
||||||
|
allow_images=self.allow_images,
|
||||||
|
)
|
||||||
|
if response is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
content_text, file_storage_name = response
|
||||||
|
|
||||||
|
if content_text:
|
||||||
|
doc.sections.append(
|
||||||
|
TextSection(
|
||||||
|
text=content_text,
|
||||||
|
link=object_url,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif file_storage_name:
|
||||||
|
doc.sections.append(
|
||||||
|
ImageSection(
|
||||||
|
link=object_url,
|
||||||
|
image_file_name=file_storage_name,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Failed to extract/summarize attachment {attachment['title']}",
|
||||||
|
exc_info=e,
|
||||||
|
)
|
||||||
|
if not self.continue_on_failure:
|
||||||
|
if _should_propagate_error(e):
|
||||||
|
raise
|
||||||
|
# TODO: should we remove continue_on_failure entirely now that we have checkpointing?
|
||||||
|
return ConnectorFailure(
|
||||||
|
failed_document=DocumentFailure(
|
||||||
|
document_id=doc.id,
|
||||||
|
document_link=object_url,
|
||||||
|
),
|
||||||
|
failure_message=f"Failed to extract/summarize attachment {attachment['title']} for doc {doc.id}",
|
||||||
|
exception=e,
|
||||||
|
)
|
||||||
|
return doc
|
||||||
|
|
||||||
def _fetch_document_batches(
|
def _fetch_document_batches(
|
||||||
self,
|
self,
|
||||||
|
checkpoint: ConfluenceCheckpoint,
|
||||||
start: SecondsSinceUnixEpoch | None = None,
|
start: SecondsSinceUnixEpoch | None = None,
|
||||||
end: SecondsSinceUnixEpoch | None = None,
|
end: SecondsSinceUnixEpoch | None = None,
|
||||||
) -> GenerateDocumentsOutput:
|
) -> CheckpointOutput[ConfluenceCheckpoint]:
|
||||||
"""
|
"""
|
||||||
Yields batches of Documents. For each page:
|
Yields batches of Documents. For each page:
|
||||||
- Create a Document with 1 Section for the page text/comments
|
- Create a Document with 1 Section for the page text/comments
|
||||||
@ -352,9 +443,12 @@ class ConfluenceConnector(
|
|||||||
- Attempt to convert it with convert_attachment_to_content(...)
|
- Attempt to convert it with convert_attachment_to_content(...)
|
||||||
- If successful, create a new Section with the extracted text or summary.
|
- If successful, create a new Section with the extracted text or summary.
|
||||||
"""
|
"""
|
||||||
doc_batch: list[Document] = []
|
doc_count = 0
|
||||||
|
|
||||||
page_query = self._construct_page_query(start, end)
|
checkpoint = copy.deepcopy(checkpoint)
|
||||||
|
|
||||||
|
# use "start" when last_updated is 0
|
||||||
|
page_query = self._construct_page_query(checkpoint.last_updated or start, end)
|
||||||
logger.debug(f"page_query: {page_query}")
|
logger.debug(f"page_query: {page_query}")
|
||||||
|
|
||||||
for page in self.confluence_client.paginated_cql_retrieval(
|
for page in self.confluence_client.paginated_cql_retrieval(
|
||||||
@ -363,94 +457,61 @@ class ConfluenceConnector(
|
|||||||
limit=self.batch_size,
|
limit=self.batch_size,
|
||||||
):
|
):
|
||||||
# Build doc from page
|
# Build doc from page
|
||||||
doc = self._convert_page_to_document(page)
|
doc_or_failure = self._convert_page_to_document(page)
|
||||||
if not doc:
|
|
||||||
|
if isinstance(doc_or_failure, ConnectorFailure):
|
||||||
|
yield doc_or_failure
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
checkpoint.last_updated = datetime_from_string(
|
||||||
|
page["version"]["when"]
|
||||||
|
).timestamp()
|
||||||
|
|
||||||
# Now get attachments for that page:
|
# Now get attachments for that page:
|
||||||
attachment_query = self._construct_attachment_query(page["id"])
|
doc_or_failure = self._fetch_page_attachments(page, doc_or_failure)
|
||||||
# We'll use the page's XML to provide context if we summarize an image
|
|
||||||
page.get("body", {}).get("storage", {}).get("value", "")
|
|
||||||
|
|
||||||
for attachment in self.confluence_client.paginated_cql_retrieval(
|
if isinstance(doc_or_failure, ConnectorFailure):
|
||||||
cql=attachment_query,
|
yield doc_or_failure
|
||||||
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
|
continue
|
||||||
):
|
|
||||||
attachment["metadata"].get("mediaType", "")
|
|
||||||
if not validate_attachment_filetype(
|
|
||||||
attachment,
|
|
||||||
):
|
|
||||||
logger.info(f"Skipping attachment: {attachment['title']}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
logger.info(f"Processing attachment: {attachment['title']}")
|
# yield completed document
|
||||||
|
doc_count += 1
|
||||||
|
yield doc_or_failure
|
||||||
|
|
||||||
# Attempt to get textual content or image summarization:
|
# create checkpoint after enough documents have been processed
|
||||||
try:
|
if doc_count >= self.batch_size:
|
||||||
response = convert_attachment_to_content(
|
return checkpoint
|
||||||
confluence_client=self.confluence_client,
|
|
||||||
attachment=attachment,
|
|
||||||
page_id=page["id"],
|
|
||||||
allow_images=self.allow_images,
|
|
||||||
)
|
|
||||||
if response is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
content_text, file_storage_name = response
|
checkpoint.has_more = False
|
||||||
object_url = build_confluence_document_id(
|
return checkpoint
|
||||||
self.wiki_base, attachment["_links"]["webui"], self.is_cloud
|
|
||||||
)
|
|
||||||
if content_text:
|
|
||||||
doc.sections.append(
|
|
||||||
TextSection(
|
|
||||||
text=content_text,
|
|
||||||
link=object_url,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
elif file_storage_name:
|
|
||||||
doc.sections.append(
|
|
||||||
ImageSection(
|
|
||||||
link=object_url,
|
|
||||||
image_file_name=file_storage_name,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(
|
|
||||||
f"Failed to extract/summarize attachment {attachment['title']}",
|
|
||||||
exc_info=e,
|
|
||||||
)
|
|
||||||
if not self.continue_on_failure:
|
|
||||||
raise
|
|
||||||
|
|
||||||
doc_batch.append(doc)
|
@override
|
||||||
|
def load_from_checkpoint(
|
||||||
if len(doc_batch) >= self.batch_size:
|
|
||||||
yield doc_batch
|
|
||||||
doc_batch = []
|
|
||||||
|
|
||||||
if doc_batch:
|
|
||||||
yield doc_batch
|
|
||||||
|
|
||||||
def load_from_state(self) -> GenerateDocumentsOutput:
|
|
||||||
return self._fetch_document_batches()
|
|
||||||
|
|
||||||
def poll_source(
|
|
||||||
self,
|
self,
|
||||||
start: SecondsSinceUnixEpoch | None = None,
|
start: SecondsSinceUnixEpoch,
|
||||||
end: SecondsSinceUnixEpoch | None = None,
|
end: SecondsSinceUnixEpoch,
|
||||||
) -> GenerateDocumentsOutput:
|
checkpoint: ConfluenceCheckpoint,
|
||||||
|
) -> CheckpointOutput[ConfluenceCheckpoint]:
|
||||||
try:
|
try:
|
||||||
return self._fetch_document_batches(start, end)
|
return self._fetch_document_batches(checkpoint, start, end)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if "field 'updated' is invalid" in str(e) and start is not None:
|
if _should_propagate_error(e) and start is not None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Confluence says we provided an invalid 'updated' field. This may indicate"
|
"Confluence says we provided an invalid 'updated' field. This may indicate"
|
||||||
"a real issue, but can also appear during edge cases like daylight"
|
"a real issue, but can also appear during edge cases like daylight"
|
||||||
f"savings time changes. Retrying with a 1 hour offset. Error: {e}"
|
f"savings time changes. Retrying with a 1 hour offset. Error: {e}"
|
||||||
)
|
)
|
||||||
return self._fetch_document_batches(start - ONE_HOUR, end)
|
return self._fetch_document_batches(checkpoint, start - ONE_HOUR, end)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
@override
|
||||||
|
def build_dummy_checkpoint(self) -> ConfluenceCheckpoint:
|
||||||
|
return ConfluenceCheckpoint(last_updated=0, has_more=True)
|
||||||
|
|
||||||
|
@override
|
||||||
|
def validate_checkpoint_json(self, checkpoint_json: str) -> ConfluenceCheckpoint:
|
||||||
|
return ConfluenceCheckpoint.model_validate_json(checkpoint_json)
|
||||||
|
|
||||||
def retrieve_all_slim_documents(
|
def retrieve_all_slim_documents(
|
||||||
self,
|
self,
|
||||||
start: SecondsSinceUnixEpoch | None = None,
|
start: SecondsSinceUnixEpoch | None = None,
|
||||||
|
@ -6,7 +6,7 @@ from typing import Generic
|
|||||||
from typing import TypeVar
|
from typing import TypeVar
|
||||||
|
|
||||||
from onyx.connectors.interfaces import BaseConnector
|
from onyx.connectors.interfaces import BaseConnector
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import CheckpointOutput
|
from onyx.connectors.interfaces import CheckpointOutput
|
||||||
from onyx.connectors.interfaces import LoadConnector
|
from onyx.connectors.interfaces import LoadConnector
|
||||||
from onyx.connectors.interfaces import PollConnector
|
from onyx.connectors.interfaces import PollConnector
|
||||||
@ -95,9 +95,9 @@ class ConnectorRunner(Generic[CT]):
|
|||||||
]:
|
]:
|
||||||
"""Adds additional exception logging to the connector."""
|
"""Adds additional exception logging to the connector."""
|
||||||
try:
|
try:
|
||||||
if isinstance(self.connector, CheckpointConnector):
|
if isinstance(self.connector, CheckpointedConnector):
|
||||||
if self.time_range is None:
|
if self.time_range is None:
|
||||||
raise ValueError("time_range is required for CheckpointConnector")
|
raise ValueError("time_range is required for CheckpointedConnector")
|
||||||
|
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
checkpoint_connector_generator = self.connector.load_from_checkpoint(
|
checkpoint_connector_generator = self.connector.load_from_checkpoint(
|
||||||
|
@ -34,7 +34,7 @@ from onyx.connectors.guru.connector import GuruConnector
|
|||||||
from onyx.connectors.highspot.connector import HighspotConnector
|
from onyx.connectors.highspot.connector import HighspotConnector
|
||||||
from onyx.connectors.hubspot.connector import HubSpotConnector
|
from onyx.connectors.hubspot.connector import HubSpotConnector
|
||||||
from onyx.connectors.interfaces import BaseConnector
|
from onyx.connectors.interfaces import BaseConnector
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import CredentialsConnector
|
from onyx.connectors.interfaces import CredentialsConnector
|
||||||
from onyx.connectors.interfaces import EventConnector
|
from onyx.connectors.interfaces import EventConnector
|
||||||
from onyx.connectors.interfaces import LoadConnector
|
from onyx.connectors.interfaces import LoadConnector
|
||||||
@ -148,7 +148,7 @@ def identify_connector_class(
|
|||||||
# all connectors should be checkpoint connectors
|
# all connectors should be checkpoint connectors
|
||||||
and (
|
and (
|
||||||
not issubclass(connector, PollConnector)
|
not issubclass(connector, PollConnector)
|
||||||
and not issubclass(connector, CheckpointConnector)
|
and not issubclass(connector, CheckpointedConnector)
|
||||||
)
|
)
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
|
@ -25,7 +25,7 @@ from onyx.connectors.exceptions import ConnectorValidationError
|
|||||||
from onyx.connectors.exceptions import CredentialExpiredError
|
from onyx.connectors.exceptions import CredentialExpiredError
|
||||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||||
from onyx.connectors.exceptions import UnexpectedValidationError
|
from onyx.connectors.exceptions import UnexpectedValidationError
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import CheckpointOutput
|
from onyx.connectors.interfaces import CheckpointOutput
|
||||||
from onyx.connectors.interfaces import ConnectorCheckpoint
|
from onyx.connectors.interfaces import ConnectorCheckpoint
|
||||||
from onyx.connectors.interfaces import ConnectorFailure
|
from onyx.connectors.interfaces import ConnectorFailure
|
||||||
@ -143,7 +143,7 @@ class GithubConnectorCheckpoint(ConnectorCheckpoint):
|
|||||||
cached_repo: SerializedRepository | None = None
|
cached_repo: SerializedRepository | None = None
|
||||||
|
|
||||||
|
|
||||||
class GithubConnector(CheckpointConnector[GithubConnectorCheckpoint]):
|
class GithubConnector(CheckpointedConnector[GithubConnectorCheckpoint]):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
repo_owner: str,
|
repo_owner: str,
|
||||||
|
@ -52,7 +52,7 @@ from onyx.connectors.google_utils.shared_constants import MISSING_SCOPES_ERROR_S
|
|||||||
from onyx.connectors.google_utils.shared_constants import ONYX_SCOPE_INSTRUCTIONS
|
from onyx.connectors.google_utils.shared_constants import ONYX_SCOPE_INSTRUCTIONS
|
||||||
from onyx.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE
|
from onyx.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE
|
||||||
from onyx.connectors.google_utils.shared_constants import USER_FIELDS
|
from onyx.connectors.google_utils.shared_constants import USER_FIELDS
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import CheckpointOutput
|
from onyx.connectors.interfaces import CheckpointOutput
|
||||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||||
@ -165,7 +165,7 @@ class DriveIdStatus(Enum):
|
|||||||
FINISHED = "finished"
|
FINISHED = "finished"
|
||||||
|
|
||||||
|
|
||||||
class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpoint]):
|
class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheckpoint]):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
include_shared_drives: bool = False,
|
include_shared_drives: bool = False,
|
||||||
|
@ -201,7 +201,7 @@ class EventConnector(BaseConnector):
|
|||||||
CheckpointOutput: TypeAlias = Generator[Document | ConnectorFailure, None, CT]
|
CheckpointOutput: TypeAlias = Generator[Document | ConnectorFailure, None, CT]
|
||||||
|
|
||||||
|
|
||||||
class CheckpointConnector(BaseConnector[CT]):
|
class CheckpointedConnector(BaseConnector[CT]):
|
||||||
@abc.abstractmethod
|
@abc.abstractmethod
|
||||||
def load_from_checkpoint(
|
def load_from_checkpoint(
|
||||||
self,
|
self,
|
||||||
|
@ -4,7 +4,7 @@ import httpx
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import CheckpointOutput
|
from onyx.connectors.interfaces import CheckpointOutput
|
||||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||||
from onyx.connectors.models import ConnectorCheckpoint
|
from onyx.connectors.models import ConnectorCheckpoint
|
||||||
@ -27,7 +27,7 @@ class SingleConnectorYield(BaseModel):
|
|||||||
unhandled_exception: str | None = None
|
unhandled_exception: str | None = None
|
||||||
|
|
||||||
|
|
||||||
class MockConnector(CheckpointConnector[MockConnectorCheckpoint]):
|
class MockConnector(CheckpointedConnector[MockConnectorCheckpoint]):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
mock_server_host: str,
|
mock_server_host: str,
|
||||||
|
@ -16,7 +16,7 @@ from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_t
|
|||||||
from onyx.connectors.exceptions import ConnectorValidationError
|
from onyx.connectors.exceptions import ConnectorValidationError
|
||||||
from onyx.connectors.exceptions import CredentialExpiredError
|
from onyx.connectors.exceptions import CredentialExpiredError
|
||||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import CheckpointOutput
|
from onyx.connectors.interfaces import CheckpointOutput
|
||||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||||
@ -150,7 +150,7 @@ class JiraConnectorCheckpoint(ConnectorCheckpoint):
|
|||||||
offset: int | None = None
|
offset: int | None = None
|
||||||
|
|
||||||
|
|
||||||
class JiraConnector(CheckpointConnector[JiraConnectorCheckpoint], SlimConnector):
|
class JiraConnector(CheckpointedConnector[JiraConnectorCheckpoint], SlimConnector):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
jira_base_url: str,
|
jira_base_url: str,
|
||||||
|
@ -26,7 +26,7 @@ from onyx.connectors.exceptions import ConnectorValidationError
|
|||||||
from onyx.connectors.exceptions import CredentialExpiredError
|
from onyx.connectors.exceptions import CredentialExpiredError
|
||||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||||
from onyx.connectors.exceptions import UnexpectedValidationError
|
from onyx.connectors.exceptions import UnexpectedValidationError
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import CheckpointOutput
|
from onyx.connectors.interfaces import CheckpointOutput
|
||||||
from onyx.connectors.interfaces import CredentialsConnector
|
from onyx.connectors.interfaces import CredentialsConnector
|
||||||
from onyx.connectors.interfaces import CredentialsProviderInterface
|
from onyx.connectors.interfaces import CredentialsProviderInterface
|
||||||
@ -501,7 +501,7 @@ def _process_message(
|
|||||||
|
|
||||||
|
|
||||||
class SlackConnector(
|
class SlackConnector(
|
||||||
SlimConnector, CredentialsConnector, CheckpointConnector[SlackCheckpoint]
|
SlimConnector, CredentialsConnector, CheckpointedConnector[SlackCheckpoint]
|
||||||
):
|
):
|
||||||
FAST_TIMEOUT = 1
|
FAST_TIMEOUT = 1
|
||||||
|
|
||||||
|
@ -401,6 +401,7 @@ class WebConnector(LoadConnector):
|
|||||||
mintlify_cleanup: bool = True, # Mostly ok to apply to other websites as well
|
mintlify_cleanup: bool = True, # Mostly ok to apply to other websites as well
|
||||||
batch_size: int = INDEX_BATCH_SIZE,
|
batch_size: int = INDEX_BATCH_SIZE,
|
||||||
scroll_before_scraping: bool = False,
|
scroll_before_scraping: bool = False,
|
||||||
|
add_randomness: bool = True,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.mintlify_cleanup = mintlify_cleanup
|
self.mintlify_cleanup = mintlify_cleanup
|
||||||
@ -408,7 +409,7 @@ class WebConnector(LoadConnector):
|
|||||||
self.recursive = False
|
self.recursive = False
|
||||||
self.scroll_before_scraping = scroll_before_scraping
|
self.scroll_before_scraping = scroll_before_scraping
|
||||||
self.web_connector_type = web_connector_type
|
self.web_connector_type = web_connector_type
|
||||||
|
self.add_randomness = add_randomness
|
||||||
if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
|
if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
|
||||||
self.recursive = True
|
self.recursive = True
|
||||||
self.to_visit_list = [_ensure_valid_url(base_url)]
|
self.to_visit_list = [_ensure_valid_url(base_url)]
|
||||||
@ -540,8 +541,11 @@ class WebConnector(LoadConnector):
|
|||||||
|
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
|
|
||||||
# Add random mouse movements and scrolling to mimic human behavior
|
if self.add_randomness:
|
||||||
page.mouse.move(random.randint(100, 700), random.randint(100, 500))
|
# Add random mouse movements and scrolling to mimic human behavior
|
||||||
|
page.mouse.move(
|
||||||
|
random.randint(100, 700), random.randint(100, 500)
|
||||||
|
)
|
||||||
|
|
||||||
# Can't use wait_until="networkidle" because it interferes with the scrolling behavior
|
# Can't use wait_until="networkidle" because it interferes with the scrolling behavior
|
||||||
page_response = page.goto(
|
page_response = page.goto(
|
||||||
|
@ -17,7 +17,7 @@ from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
|||||||
from onyx.connectors.exceptions import ConnectorValidationError
|
from onyx.connectors.exceptions import ConnectorValidationError
|
||||||
from onyx.connectors.exceptions import CredentialExpiredError
|
from onyx.connectors.exceptions import CredentialExpiredError
|
||||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import CheckpointOutput
|
from onyx.connectors.interfaces import CheckpointOutput
|
||||||
from onyx.connectors.interfaces import ConnectorFailure
|
from onyx.connectors.interfaces import ConnectorFailure
|
||||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||||
@ -353,7 +353,9 @@ class ZendeskConnectorCheckpoint(ConnectorCheckpoint):
|
|||||||
cached_content_tags: dict[str, str] | None
|
cached_content_tags: dict[str, str] | None
|
||||||
|
|
||||||
|
|
||||||
class ZendeskConnector(SlimConnector, CheckpointConnector[ZendeskConnectorCheckpoint]):
|
class ZendeskConnector(
|
||||||
|
SlimConnector, CheckpointedConnector[ZendeskConnectorCheckpoint]
|
||||||
|
):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
content_type: str = "articles",
|
content_type: str = "articles",
|
||||||
|
@ -11,6 +11,7 @@ from onyx.connectors.confluence.connector import ConfluenceConnector
|
|||||||
from onyx.connectors.confluence.utils import AttachmentProcessingResult
|
from onyx.connectors.confluence.utils import AttachmentProcessingResult
|
||||||
from onyx.connectors.credentials_provider import OnyxStaticCredentialsProvider
|
from onyx.connectors.credentials_provider import OnyxStaticCredentialsProvider
|
||||||
from onyx.connectors.models import Document
|
from onyx.connectors.models import Document
|
||||||
|
from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -43,11 +44,9 @@ def test_confluence_connector_basic(
|
|||||||
mock_get_api_key: MagicMock, confluence_connector: ConfluenceConnector
|
mock_get_api_key: MagicMock, confluence_connector: ConfluenceConnector
|
||||||
) -> None:
|
) -> None:
|
||||||
confluence_connector.set_allow_images(False)
|
confluence_connector.set_allow_images(False)
|
||||||
doc_batch_generator = confluence_connector.poll_source(0, time.time())
|
doc_batch = load_all_docs_from_checkpoint_connector(
|
||||||
|
confluence_connector, 0, time.time()
|
||||||
doc_batch = next(doc_batch_generator)
|
)
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(doc_batch_generator)
|
|
||||||
|
|
||||||
assert len(doc_batch) == 2
|
assert len(doc_batch) == 2
|
||||||
|
|
||||||
@ -105,11 +104,9 @@ def test_confluence_connector_skip_images(
|
|||||||
mock_get_api_key: MagicMock, confluence_connector: ConfluenceConnector
|
mock_get_api_key: MagicMock, confluence_connector: ConfluenceConnector
|
||||||
) -> None:
|
) -> None:
|
||||||
confluence_connector.set_allow_images(False)
|
confluence_connector.set_allow_images(False)
|
||||||
doc_batch_generator = confluence_connector.poll_source(0, time.time())
|
doc_batch = load_all_docs_from_checkpoint_connector(
|
||||||
|
confluence_connector, 0, time.time()
|
||||||
doc_batch = next(doc_batch_generator)
|
)
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(doc_batch_generator)
|
|
||||||
|
|
||||||
assert len(doc_batch) == 8
|
assert len(doc_batch) == 8
|
||||||
assert sum(len(doc.sections) for doc in doc_batch) == 8
|
assert sum(len(doc.sections) for doc in doc_batch) == 8
|
||||||
@ -144,11 +141,9 @@ def test_confluence_connector_allow_images(
|
|||||||
) -> None:
|
) -> None:
|
||||||
confluence_connector.set_allow_images(True)
|
confluence_connector.set_allow_images(True)
|
||||||
|
|
||||||
doc_batch_generator = confluence_connector.poll_source(0, time.time())
|
doc_batch = load_all_docs_from_checkpoint_connector(
|
||||||
|
confluence_connector, 0, time.time()
|
||||||
doc_batch = next(doc_batch_generator)
|
)
|
||||||
with pytest.raises(StopIteration):
|
|
||||||
next(doc_batch_generator)
|
|
||||||
|
|
||||||
assert len(doc_batch) == 8
|
assert len(doc_batch) == 8
|
||||||
assert sum(len(doc.sections) for doc in doc_batch) == 12
|
assert sum(len(doc.sections) for doc in doc_batch) == 12
|
||||||
|
@ -1,10 +1,12 @@
|
|||||||
import os
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from onyx.configs.constants import DocumentSource
|
from onyx.configs.constants import DocumentSource
|
||||||
from onyx.connectors.confluence.connector import ConfluenceConnector
|
from onyx.connectors.confluence.connector import ConfluenceConnector
|
||||||
from onyx.connectors.credentials_provider import OnyxStaticCredentialsProvider
|
from onyx.connectors.credentials_provider import OnyxStaticCredentialsProvider
|
||||||
|
from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -34,8 +36,10 @@ def test_confluence_connector_permissions(
|
|||||||
) -> None:
|
) -> None:
|
||||||
# Get all doc IDs from the full connector
|
# Get all doc IDs from the full connector
|
||||||
all_full_doc_ids = set()
|
all_full_doc_ids = set()
|
||||||
for doc_batch in confluence_connector.load_from_state():
|
doc_batch = load_all_docs_from_checkpoint_connector(
|
||||||
all_full_doc_ids.update([doc.id for doc in doc_batch])
|
confluence_connector, 0, time.time()
|
||||||
|
)
|
||||||
|
all_full_doc_ids.update([doc.id for doc in doc_batch])
|
||||||
|
|
||||||
# Get all doc IDs from the slim connector
|
# Get all doc IDs from the slim connector
|
||||||
all_slim_doc_ids = set()
|
all_slim_doc_ids = set()
|
||||||
|
@ -2,7 +2,7 @@ from typing import cast
|
|||||||
from typing import TypeVar
|
from typing import TypeVar
|
||||||
|
|
||||||
from onyx.connectors.connector_runner import CheckpointOutputWrapper
|
from onyx.connectors.connector_runner import CheckpointOutputWrapper
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||||
from onyx.connectors.models import ConnectorCheckpoint
|
from onyx.connectors.models import ConnectorCheckpoint
|
||||||
from onyx.connectors.models import ConnectorFailure
|
from onyx.connectors.models import ConnectorFailure
|
||||||
@ -14,7 +14,7 @@ CT = TypeVar("CT", bound=ConnectorCheckpoint)
|
|||||||
|
|
||||||
|
|
||||||
def load_all_docs_from_checkpoint_connector(
|
def load_all_docs_from_checkpoint_connector(
|
||||||
connector: CheckpointConnector[CT],
|
connector: CheckpointedConnector[CT],
|
||||||
start: SecondsSinceUnixEpoch,
|
start: SecondsSinceUnixEpoch,
|
||||||
end: SecondsSinceUnixEpoch,
|
end: SecondsSinceUnixEpoch,
|
||||||
) -> list[Document]:
|
) -> list[Document]:
|
||||||
@ -42,7 +42,7 @@ def load_all_docs_from_checkpoint_connector(
|
|||||||
|
|
||||||
|
|
||||||
def load_everything_from_checkpoint_connector(
|
def load_everything_from_checkpoint_connector(
|
||||||
connector: CheckpointConnector[CT],
|
connector: CheckpointedConnector[CT],
|
||||||
start: SecondsSinceUnixEpoch,
|
start: SecondsSinceUnixEpoch,
|
||||||
end: SecondsSinceUnixEpoch,
|
end: SecondsSinceUnixEpoch,
|
||||||
) -> list[Document | ConnectorFailure]:
|
) -> list[Document | ConnectorFailure]:
|
||||||
|
@ -18,6 +18,7 @@ def web_connector(request: pytest.FixtureRequest) -> WebConnector:
|
|||||||
base_url="https://quotes.toscrape.com/scroll",
|
base_url="https://quotes.toscrape.com/scroll",
|
||||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||||
scroll_before_scraping=scroll_before_scraping,
|
scroll_before_scraping=scroll_before_scraping,
|
||||||
|
add_randomness=False,
|
||||||
)
|
)
|
||||||
return connector
|
return connector
|
||||||
|
|
||||||
|
@ -2,6 +2,7 @@ import os
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
|
|
||||||
|
from onyx.connectors.models import InputType
|
||||||
from onyx.server.documents.models import DocumentSource
|
from onyx.server.documents.models import DocumentSource
|
||||||
from tests.integration.common_utils.managers.cc_pair import CCPairManager
|
from tests.integration.common_utils.managers.cc_pair import CCPairManager
|
||||||
from tests.integration.common_utils.managers.user import UserManager
|
from tests.integration.common_utils.managers.user import UserManager
|
||||||
@ -56,6 +57,7 @@ def test_overlapping_connector_creation(reset: None) -> None:
|
|||||||
connector_specific_config=config,
|
connector_specific_config=config,
|
||||||
credential_json=credential,
|
credential_json=credential,
|
||||||
user_performing_action=admin_user,
|
user_performing_action=admin_user,
|
||||||
|
input_type=InputType.POLL,
|
||||||
)
|
)
|
||||||
|
|
||||||
CCPairManager.wait_for_indexing_completion(
|
CCPairManager.wait_for_indexing_completion(
|
||||||
@ -69,6 +71,7 @@ def test_overlapping_connector_creation(reset: None) -> None:
|
|||||||
connector_specific_config=config,
|
connector_specific_config=config,
|
||||||
credential_json=credential,
|
credential_json=credential,
|
||||||
user_performing_action=admin_user,
|
user_performing_action=admin_user,
|
||||||
|
input_type=InputType.POLL,
|
||||||
)
|
)
|
||||||
|
|
||||||
CCPairManager.wait_for_indexing_completion(
|
CCPairManager.wait_for_indexing_completion(
|
||||||
@ -115,6 +118,7 @@ def test_connector_pause_while_indexing(reset: None) -> None:
|
|||||||
connector_specific_config=config,
|
connector_specific_config=config,
|
||||||
credential_json=credential,
|
credential_json=credential,
|
||||||
user_performing_action=admin_user,
|
user_performing_action=admin_user,
|
||||||
|
input_type=InputType.POLL,
|
||||||
)
|
)
|
||||||
|
|
||||||
CCPairManager.wait_for_indexing_in_progress(
|
CCPairManager.wait_for_indexing_in_progress(
|
||||||
|
@ -0,0 +1,383 @@
|
|||||||
|
import time
|
||||||
|
from collections.abc import Callable
|
||||||
|
from collections.abc import Generator
|
||||||
|
from datetime import datetime
|
||||||
|
from datetime import timezone
|
||||||
|
from typing import Any
|
||||||
|
from typing import cast
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from requests.exceptions import HTTPError
|
||||||
|
|
||||||
|
from onyx.configs.constants import DocumentSource
|
||||||
|
from onyx.connectors.confluence.connector import ConfluenceCheckpoint
|
||||||
|
from onyx.connectors.confluence.connector import ConfluenceConnector
|
||||||
|
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
|
||||||
|
from onyx.connectors.exceptions import CredentialExpiredError
|
||||||
|
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||||
|
from onyx.connectors.exceptions import UnexpectedValidationError
|
||||||
|
from onyx.connectors.models import ConnectorFailure
|
||||||
|
from onyx.connectors.models import Document
|
||||||
|
from onyx.connectors.models import DocumentFailure
|
||||||
|
from onyx.connectors.models import SlimDocument
|
||||||
|
from tests.unit.onyx.connectors.utils import load_everything_from_checkpoint_connector
|
||||||
|
|
||||||
|
PAGE_SIZE = 2
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def confluence_base_url() -> str:
|
||||||
|
return "https://example.atlassian.net/wiki"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def space_key() -> str:
|
||||||
|
return "TEST"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_confluence_client() -> MagicMock:
|
||||||
|
"""Create a mock Confluence client with proper typing"""
|
||||||
|
mock = MagicMock(spec=OnyxConfluence)
|
||||||
|
# Initialize with empty results for common methods
|
||||||
|
mock.paginated_cql_retrieval.return_value = []
|
||||||
|
mock.get_all_spaces = MagicMock()
|
||||||
|
mock.get_all_spaces.return_value = {"results": []}
|
||||||
|
return mock
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def confluence_connector(
|
||||||
|
confluence_base_url: str, space_key: str, mock_confluence_client: MagicMock
|
||||||
|
) -> Generator[ConfluenceConnector, None, None]:
|
||||||
|
"""Create a Confluence connector with a mock client"""
|
||||||
|
connector = ConfluenceConnector(
|
||||||
|
wiki_base=confluence_base_url,
|
||||||
|
space=space_key,
|
||||||
|
is_cloud=True,
|
||||||
|
labels_to_skip=["secret", "sensitive"],
|
||||||
|
timezone_offset=0.0,
|
||||||
|
batch_size=2,
|
||||||
|
)
|
||||||
|
# Initialize the client directly
|
||||||
|
connector._confluence_client = mock_confluence_client
|
||||||
|
with patch("onyx.connectors.confluence.connector._SLIM_DOC_BATCH_SIZE", 2):
|
||||||
|
yield connector
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def create_mock_page() -> Callable[..., dict[str, Any]]:
|
||||||
|
def _create_mock_page(
|
||||||
|
id: str = "123",
|
||||||
|
title: str = "Test Page",
|
||||||
|
updated: str = "2023-01-01T12:00:00.000+0000",
|
||||||
|
content: str = "Test Content",
|
||||||
|
labels: list[str] | None = None,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Helper to create a mock Confluence page object"""
|
||||||
|
return {
|
||||||
|
"id": id,
|
||||||
|
"title": title,
|
||||||
|
"version": {"when": updated},
|
||||||
|
"body": {"storage": {"value": content}},
|
||||||
|
"metadata": {
|
||||||
|
"labels": {"results": [{"name": label} for label in (labels or [])]}
|
||||||
|
},
|
||||||
|
"space": {"key": "TEST"},
|
||||||
|
"_links": {"webui": f"/spaces/TEST/pages/{id}"},
|
||||||
|
}
|
||||||
|
|
||||||
|
return _create_mock_page
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_cql_query_with_space(confluence_connector: ConfluenceConnector) -> None:
|
||||||
|
"""Test CQL query generation with space specified"""
|
||||||
|
start = datetime(2023, 1, 1, tzinfo=timezone.utc).timestamp()
|
||||||
|
end = datetime(2023, 1, 2, tzinfo=timezone.utc).timestamp()
|
||||||
|
|
||||||
|
query = confluence_connector._construct_page_query(start, end)
|
||||||
|
|
||||||
|
# Check that the space part and time part are both in the query
|
||||||
|
assert f"space='{confluence_connector.space}'" in query
|
||||||
|
assert "lastmodified >= '2023-01-01 00:00'" in query
|
||||||
|
assert "lastmodified <= '2023-01-02 00:00'" in query
|
||||||
|
assert " and " in query.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_cql_query_without_space(confluence_base_url: str) -> None:
|
||||||
|
"""Test CQL query generation without space specified"""
|
||||||
|
# Create connector without space key
|
||||||
|
connector = ConfluenceConnector(wiki_base=confluence_base_url, is_cloud=True)
|
||||||
|
|
||||||
|
start = datetime(2023, 1, 1, tzinfo=connector.timezone).timestamp()
|
||||||
|
end = datetime(2023, 1, 2, tzinfo=connector.timezone).timestamp()
|
||||||
|
|
||||||
|
query = connector._construct_page_query(start, end)
|
||||||
|
|
||||||
|
# Check that only time part is in the query
|
||||||
|
assert "space=" not in query
|
||||||
|
assert "lastmodified >= '2023-01-01 00:00'" in query
|
||||||
|
assert "lastmodified <= '2023-01-02 00:00'" in query
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_from_checkpoint_happy_path(
|
||||||
|
confluence_connector: ConfluenceConnector,
|
||||||
|
create_mock_page: Callable[..., dict[str, Any]],
|
||||||
|
) -> None:
|
||||||
|
"""Test loading from checkpoint - happy path"""
|
||||||
|
# Set up mocked pages
|
||||||
|
first_updated = datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
|
||||||
|
last_updated = datetime(2023, 1, 3, 12, 0, tzinfo=timezone.utc)
|
||||||
|
mock_page1 = create_mock_page(
|
||||||
|
id="1", title="Page 1", updated=first_updated.isoformat()
|
||||||
|
)
|
||||||
|
mock_page2 = create_mock_page(
|
||||||
|
id="2", title="Page 2", updated=first_updated.isoformat()
|
||||||
|
)
|
||||||
|
mock_page3 = create_mock_page(
|
||||||
|
id="3", title="Page 3", updated=last_updated.isoformat()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mock paginated_cql_retrieval to return our mock pages
|
||||||
|
confluence_client = confluence_connector._confluence_client
|
||||||
|
assert confluence_client is not None, "bad test setup"
|
||||||
|
paginated_cql_mock = cast(MagicMock, confluence_client.paginated_cql_retrieval)
|
||||||
|
paginated_cql_mock.side_effect = [
|
||||||
|
[mock_page1, mock_page2],
|
||||||
|
[], # comments
|
||||||
|
[], # attachments
|
||||||
|
[], # comments
|
||||||
|
[], # attachments
|
||||||
|
[mock_page3],
|
||||||
|
[], # comments
|
||||||
|
[], # attachments
|
||||||
|
]
|
||||||
|
|
||||||
|
# Call load_from_checkpoint
|
||||||
|
end_time = time.time()
|
||||||
|
outputs = load_everything_from_checkpoint_connector(
|
||||||
|
confluence_connector, 0, end_time
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check that the documents were returned
|
||||||
|
assert len(outputs) == 2
|
||||||
|
|
||||||
|
checkpoint_output1 = outputs[0]
|
||||||
|
assert len(checkpoint_output1.items) == 2
|
||||||
|
document1 = checkpoint_output1.items[0]
|
||||||
|
assert isinstance(document1, Document)
|
||||||
|
assert document1.id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/1"
|
||||||
|
document2 = checkpoint_output1.items[1]
|
||||||
|
assert isinstance(document2, Document)
|
||||||
|
assert document2.id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/2"
|
||||||
|
assert checkpoint_output1.next_checkpoint == ConfluenceCheckpoint(
|
||||||
|
last_updated=first_updated.timestamp(),
|
||||||
|
has_more=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
checkpoint_output2 = outputs[1]
|
||||||
|
assert len(checkpoint_output2.items) == 1
|
||||||
|
document3 = checkpoint_output2.items[0]
|
||||||
|
assert isinstance(document3, Document)
|
||||||
|
assert document3.id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/3"
|
||||||
|
assert checkpoint_output2.next_checkpoint == ConfluenceCheckpoint(
|
||||||
|
last_updated=last_updated.timestamp(),
|
||||||
|
has_more=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_from_checkpoint_with_page_processing_error(
|
||||||
|
confluence_connector: ConfluenceConnector,
|
||||||
|
create_mock_page: Callable[..., dict[str, Any]],
|
||||||
|
) -> None:
|
||||||
|
"""Test loading from checkpoint with a mix of successful and failed page processing"""
|
||||||
|
# Set up mocked pages
|
||||||
|
mock_page1 = create_mock_page(id="1", title="Page 1")
|
||||||
|
mock_page2 = create_mock_page(id="2", title="Page 2")
|
||||||
|
|
||||||
|
# Mock paginated_cql_retrieval to return our mock pages
|
||||||
|
confluence_client = confluence_connector._confluence_client
|
||||||
|
assert confluence_client is not None, "bad test setup"
|
||||||
|
paginated_cql_mock = cast(MagicMock, confluence_client.paginated_cql_retrieval)
|
||||||
|
paginated_cql_mock.return_value = [mock_page1, mock_page2]
|
||||||
|
|
||||||
|
# Mock _convert_page_to_document to fail for the second page
|
||||||
|
def mock_convert_side_effect(page: dict[str, Any]) -> Document | ConnectorFailure:
|
||||||
|
if page["id"] == "1":
|
||||||
|
return Document(
|
||||||
|
id=f"{confluence_connector.wiki_base}/spaces/TEST/pages/1",
|
||||||
|
sections=[],
|
||||||
|
source=DocumentSource.CONFLUENCE,
|
||||||
|
semantic_identifier="Page 1",
|
||||||
|
metadata={},
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return ConnectorFailure(
|
||||||
|
failed_document=DocumentFailure(
|
||||||
|
document_id=page["id"],
|
||||||
|
document_link=f"{confluence_connector.wiki_base}/spaces/TEST/pages/{page['id']}",
|
||||||
|
),
|
||||||
|
failure_message="Failed to process Confluence page",
|
||||||
|
exception=Exception("Test error"),
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch(
|
||||||
|
"onyx.connectors.confluence.connector.ConfluenceConnector._convert_page_to_document",
|
||||||
|
side_effect=mock_convert_side_effect,
|
||||||
|
):
|
||||||
|
# Call load_from_checkpoint
|
||||||
|
end_time = time.time()
|
||||||
|
outputs = load_everything_from_checkpoint_connector(
|
||||||
|
confluence_connector, 0, end_time
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(outputs) == 1
|
||||||
|
checkpoint_output = outputs[0]
|
||||||
|
assert len(checkpoint_output.items) == 2
|
||||||
|
|
||||||
|
# First item should be successful
|
||||||
|
assert isinstance(checkpoint_output.items[0], Document)
|
||||||
|
assert (
|
||||||
|
checkpoint_output.items[0].id
|
||||||
|
== f"{confluence_connector.wiki_base}/spaces/TEST/pages/1"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Second item should be a failure
|
||||||
|
assert isinstance(checkpoint_output.items[1], ConnectorFailure)
|
||||||
|
assert (
|
||||||
|
"Failed to process Confluence page"
|
||||||
|
in checkpoint_output.items[1].failure_message
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrieve_all_slim_documents(
|
||||||
|
confluence_connector: ConfluenceConnector,
|
||||||
|
create_mock_page: Callable[..., dict[str, Any]],
|
||||||
|
) -> None:
|
||||||
|
"""Test retrieving all slim documents"""
|
||||||
|
# Set up mocked pages
|
||||||
|
mock_page1 = create_mock_page(id="1")
|
||||||
|
mock_page2 = create_mock_page(id="2")
|
||||||
|
|
||||||
|
# Mock paginated_cql_retrieval to return our mock pages
|
||||||
|
confluence_client = confluence_connector._confluence_client
|
||||||
|
assert confluence_client is not None, "bad test setup"
|
||||||
|
|
||||||
|
paginated_cql_mock = cast(MagicMock, confluence_client.cql_paginate_all_expansions)
|
||||||
|
paginated_cql_mock.side_effect = [[mock_page1, mock_page2], [], []]
|
||||||
|
|
||||||
|
# Call retrieve_all_slim_documents
|
||||||
|
batches = list(confluence_connector.retrieve_all_slim_documents(0, 100))
|
||||||
|
assert paginated_cql_mock.call_count == 3
|
||||||
|
|
||||||
|
# Check that a batch with 2 documents was returned
|
||||||
|
assert len(batches) == 1
|
||||||
|
assert len(batches[0]) == 2
|
||||||
|
assert isinstance(batches[0][0], SlimDocument)
|
||||||
|
assert batches[0][0].id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/1"
|
||||||
|
assert batches[0][1].id == f"{confluence_connector.wiki_base}/spaces/TEST/pages/2"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"status_code,expected_exception,expected_message",
|
||||||
|
[
|
||||||
|
(
|
||||||
|
401,
|
||||||
|
CredentialExpiredError,
|
||||||
|
"Invalid or expired Confluence credentials",
|
||||||
|
),
|
||||||
|
(
|
||||||
|
403,
|
||||||
|
InsufficientPermissionsError,
|
||||||
|
"Insufficient permissions to access Confluence resources",
|
||||||
|
),
|
||||||
|
(404, UnexpectedValidationError, "Unexpected Confluence error"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_validate_connector_settings_errors(
|
||||||
|
confluence_connector: ConfluenceConnector,
|
||||||
|
status_code: int,
|
||||||
|
expected_exception: type[Exception],
|
||||||
|
expected_message: str,
|
||||||
|
) -> None:
|
||||||
|
"""Test validation with various error scenarios"""
|
||||||
|
error = HTTPError(response=MagicMock(status_code=status_code))
|
||||||
|
|
||||||
|
confluence_client = MagicMock()
|
||||||
|
confluence_connector._low_timeout_confluence_client = confluence_client
|
||||||
|
get_all_spaces_mock = cast(MagicMock, confluence_client.get_all_spaces)
|
||||||
|
get_all_spaces_mock.side_effect = error
|
||||||
|
|
||||||
|
with pytest.raises(expected_exception) as excinfo:
|
||||||
|
confluence_connector.validate_connector_settings()
|
||||||
|
assert expected_message in str(excinfo.value)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_connector_settings_success(
|
||||||
|
confluence_connector: ConfluenceConnector,
|
||||||
|
) -> None:
|
||||||
|
"""Test successful validation"""
|
||||||
|
confluence_client = MagicMock()
|
||||||
|
confluence_connector._low_timeout_confluence_client = confluence_client
|
||||||
|
get_all_spaces_mock = cast(MagicMock, confluence_client.get_all_spaces)
|
||||||
|
get_all_spaces_mock.return_value = {"results": [{"key": "TEST"}]}
|
||||||
|
|
||||||
|
confluence_connector.validate_connector_settings()
|
||||||
|
get_all_spaces_mock.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
def test_checkpoint_progress(
|
||||||
|
confluence_connector: ConfluenceConnector,
|
||||||
|
create_mock_page: Callable[..., dict[str, Any]],
|
||||||
|
) -> None:
|
||||||
|
"""Test that the checkpoint's last_updated field is properly updated after processing pages"""
|
||||||
|
# Set up mocked pages with different timestamps
|
||||||
|
earlier_timestamp = datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
|
||||||
|
later_timestamp = datetime(2023, 1, 2, 12, 0, tzinfo=timezone.utc)
|
||||||
|
mock_page1 = create_mock_page(
|
||||||
|
id="1", title="Page 1", updated=earlier_timestamp.isoformat()
|
||||||
|
)
|
||||||
|
mock_page2 = create_mock_page(
|
||||||
|
id="2", title="Page 2", updated=later_timestamp.isoformat()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mock paginated_cql_retrieval to return our mock pages
|
||||||
|
confluence_client = confluence_connector._confluence_client
|
||||||
|
assert confluence_client is not None, "bad test setup"
|
||||||
|
paginated_cql_mock = cast(MagicMock, confluence_client.paginated_cql_retrieval)
|
||||||
|
paginated_cql_mock.side_effect = [
|
||||||
|
[mock_page1, mock_page2], # Return both pages
|
||||||
|
[], # No comments for page 1
|
||||||
|
[], # No attachments for page 1
|
||||||
|
[], # No comments for page 2
|
||||||
|
[], # No attachments for page 2
|
||||||
|
[], # No more pages
|
||||||
|
]
|
||||||
|
|
||||||
|
# Call load_from_checkpoint
|
||||||
|
end_time = datetime(2023, 1, 3, tzinfo=timezone.utc).timestamp()
|
||||||
|
|
||||||
|
outputs = load_everything_from_checkpoint_connector(
|
||||||
|
confluence_connector, 0, end_time
|
||||||
|
)
|
||||||
|
|
||||||
|
last_checkpoint = outputs[-1].next_checkpoint
|
||||||
|
|
||||||
|
assert last_checkpoint == ConfluenceCheckpoint(
|
||||||
|
last_updated=later_timestamp.timestamp(),
|
||||||
|
has_more=False,
|
||||||
|
)
|
||||||
|
# Convert the expected timestamp to epoch seconds
|
||||||
|
expected_timestamp = datetime(2023, 1, 2, 12, 0, tzinfo=timezone.utc).timestamp()
|
||||||
|
|
||||||
|
# The checkpoint's last_updated should be set to the latest page's timestamp
|
||||||
|
assert last_checkpoint.last_updated == expected_timestamp
|
||||||
|
assert not last_checkpoint.has_more # No more pages to process
|
||||||
|
|
||||||
|
assert len(outputs) == 2
|
||||||
|
# Verify we got both documents
|
||||||
|
assert len(outputs[0].items) == 2
|
||||||
|
assert isinstance(outputs[0].items[0], Document)
|
||||||
|
assert isinstance(outputs[0].items[1], Document)
|
@ -5,7 +5,7 @@ from typing import TypeVar
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from onyx.connectors.connector_runner import CheckpointOutputWrapper
|
from onyx.connectors.connector_runner import CheckpointOutputWrapper
|
||||||
from onyx.connectors.interfaces import CheckpointConnector
|
from onyx.connectors.interfaces import CheckpointedConnector
|
||||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||||
from onyx.connectors.models import ConnectorCheckpoint
|
from onyx.connectors.models import ConnectorCheckpoint
|
||||||
from onyx.connectors.models import ConnectorFailure
|
from onyx.connectors.models import ConnectorFailure
|
||||||
@ -23,7 +23,7 @@ class SingleConnectorCallOutput(BaseModel, Generic[CT]):
|
|||||||
|
|
||||||
|
|
||||||
def load_everything_from_checkpoint_connector(
|
def load_everything_from_checkpoint_connector(
|
||||||
connector: CheckpointConnector[CT],
|
connector: CheckpointedConnector[CT],
|
||||||
start: SecondsSinceUnixEpoch,
|
start: SecondsSinceUnixEpoch,
|
||||||
end: SecondsSinceUnixEpoch,
|
end: SecondsSinceUnixEpoch,
|
||||||
) -> list[SingleConnectorCallOutput[CT]]:
|
) -> list[SingleConnectorCallOutput[CT]]:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user