Bugfix/confluence time zone (#3265)

* RedisLock typing

* checkpoint

* put in debug logging

* improve comments

* mypy fixes
This commit is contained in:
rkuo-danswer
2024-12-02 22:23:23 -08:00
committed by GitHub
parent 9e9b7ed61d
commit 5f28a1b0e4
5 changed files with 67 additions and 9 deletions

View File

@@ -11,6 +11,7 @@ from celery.exceptions import WorkerShutdown
from celery.states import READY_STATES from celery.states import READY_STATES
from celery.utils.log import get_task_logger from celery.utils.log import get_task_logger
from celery.worker import strategy # type: ignore from celery.worker import strategy # type: ignore
from redis.lock import Lock as RedisLock
from sentry_sdk.integrations.celery import CeleryIntegration from sentry_sdk.integrations.celery import CeleryIntegration
from sqlalchemy import text from sqlalchemy import text
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
@@ -332,16 +333,16 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
return return
logger.info("Releasing primary worker lock.") logger.info("Releasing primary worker lock.")
lock = sender.primary_worker_lock lock: RedisLock = sender.primary_worker_lock
try: try:
if lock.owned(): if lock.owned():
try: try:
lock.release() lock.release()
sender.primary_worker_lock = None sender.primary_worker_lock = None
except Exception as e: except Exception:
logger.error(f"Failed to release primary worker lock: {e}") logger.exception("Failed to release primary worker lock")
except Exception as e: except Exception:
logger.error(f"Failed to check if primary worker lock is owned: {e}") logger.exception("Failed to check if primary worker lock is owned")
def on_setup_logging( def on_setup_logging(

View File

@@ -11,6 +11,7 @@ from celery.signals import celeryd_init
from celery.signals import worker_init from celery.signals import worker_init
from celery.signals import worker_ready from celery.signals import worker_ready
from celery.signals import worker_shutdown from celery.signals import worker_shutdown
from redis.lock import Lock as RedisLock
import danswer.background.celery.apps.app_base as app_base import danswer.background.celery.apps.app_base as app_base
from danswer.background.celery.apps.app_base import task_logger from danswer.background.celery.apps.app_base import task_logger
@@ -116,7 +117,7 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
# it is planned to use this lock to enforce singleton behavior on the primary # it is planned to use this lock to enforce singleton behavior on the primary
# worker, since the primary worker does redis cleanup on startup, but this isn't # worker, since the primary worker does redis cleanup on startup, but this isn't
# implemented yet. # implemented yet.
lock = r.lock( lock: RedisLock = r.lock(
DanswerRedisLocks.PRIMARY_WORKER, DanswerRedisLocks.PRIMARY_WORKER,
timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT, timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
) )
@@ -227,7 +228,7 @@ class HubPeriodicTask(bootsteps.StartStopStep):
if not hasattr(worker, "primary_worker_lock"): if not hasattr(worker, "primary_worker_lock"):
return return
lock = worker.primary_worker_lock lock: RedisLock = worker.primary_worker_lock
r = get_redis_client(tenant_id=None) r = get_redis_client(tenant_id=None)

View File

@@ -308,6 +308,22 @@ CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD = int(
os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200_000) os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200_000)
) )
# Due to breakages in the confluence API, the timezone offset must be specified client side
# to match the user's specified timezone.
# The current state of affairs:
# CQL queries are parsed in the user's timezone and cannot be specified in UTC
# no API retrieves the user's timezone
# All data is returned in UTC, so we can't derive the user's timezone from that
# https://community.developer.atlassian.com/t/confluence-cloud-time-zone-get-via-rest-api/35954/16
# https://jira.atlassian.com/browse/CONFCLOUD-69670
# enter as a floating point offset from UTC in hours (-24 < val < 24)
# this will be applied globally, so it probably makes sense to transition this to per
# connector at some point.
CONFLUENCE_TIMEZONE_OFFSET = float(os.environ.get("CONFLUENCE_TIMEZONE_OFFSET", 1.0))
JIRA_CONNECTOR_LABELS_TO_SKIP = [ JIRA_CONNECTOR_LABELS_TO_SKIP = [
ignored_tag ignored_tag
for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",") for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")

View File

@@ -1,9 +1,11 @@
from datetime import datetime from datetime import datetime
from datetime import timedelta
from datetime import timezone from datetime import timezone
from typing import Any from typing import Any
from urllib.parse import quote from urllib.parse import quote
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource from danswer.configs.constants import DocumentSource
@@ -69,6 +71,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
# skip it. This is generally used to avoid indexing extra sensitive # skip it. This is generally used to avoid indexing extra sensitive
# pages. # pages.
labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP, labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
timezone_offset: float = CONFLUENCE_TIMEZONE_OFFSET,
) -> None: ) -> None:
self.batch_size = batch_size self.batch_size = batch_size
self.continue_on_failure = continue_on_failure self.continue_on_failure = continue_on_failure
@@ -104,6 +107,8 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
) )
self.cql_label_filter = f" and label not in ({comma_separated_labels})" self.cql_label_filter = f" and label not in ({comma_separated_labels})"
self.timezone: timezone = timezone(offset=timedelta(hours=timezone_offset))
@property @property
def confluence_client(self) -> OnyxConfluence: def confluence_client(self) -> OnyxConfluence:
if self._confluence_client is None: if self._confluence_client is None:
@@ -204,12 +209,14 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
confluence_page_ids: list[str] = [] confluence_page_ids: list[str] = []
page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter
logger.debug(f"page_query: {page_query}")
# Fetch pages as Documents # Fetch pages as Documents
for page in self.confluence_client.paginated_cql_retrieval( for page in self.confluence_client.paginated_cql_retrieval(
cql=page_query, cql=page_query,
expand=",".join(_PAGE_EXPANSION_FIELDS), expand=",".join(_PAGE_EXPANSION_FIELDS),
limit=self.batch_size, limit=self.batch_size,
): ):
logger.debug(f"_fetch_document_batches: {page['id']}")
confluence_page_ids.append(page["id"]) confluence_page_ids.append(page["id"])
doc = self._convert_object_to_document(page) doc = self._convert_object_to_document(page)
if doc is not None: if doc is not None:
@@ -242,10 +249,10 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
def poll_source(self, start: float, end: float) -> GenerateDocumentsOutput: def poll_source(self, start: float, end: float) -> GenerateDocumentsOutput:
# Add time filters # Add time filters
formatted_start_time = datetime.fromtimestamp(start, tz=timezone.utc).strftime( formatted_start_time = datetime.fromtimestamp(start, tz=self.timezone).strftime(
"%Y-%m-%d %H:%M" "%Y-%m-%d %H:%M"
) )
formatted_end_time = datetime.fromtimestamp(end, tz=timezone.utc).strftime( formatted_end_time = datetime.fromtimestamp(end, tz=self.timezone).strftime(
"%Y-%m-%d %H:%M" "%Y-%m-%d %H:%M"
) )
self.cql_time_filter = f" and lastmodified >= '{formatted_start_time}'" self.cql_time_filter = f" and lastmodified >= '{formatted_start_time}'"

View File

@@ -134,6 +134,32 @@ class OnyxConfluence(Confluence):
super(OnyxConfluence, self).__init__(url, *args, **kwargs) super(OnyxConfluence, self).__init__(url, *args, **kwargs)
self._wrap_methods() self._wrap_methods()
def get_current_user(self, expand: str | None = None) -> Any:
    """Return details for the currently authenticated Confluence user.

    The third-party client does not expose this endpoint, so it is
    implemented here directly against the REST API.

    :param expand: OPTIONAL expand for get status of user.
        Possible param is "status". Results are "Active, Deactivated"
    :return: Returns the user details
    :raises ApiPermissionError: if the server responds with HTTP 403
    """
    from atlassian.errors import ApiPermissionError  # type:ignore

    url = "rest/api/user/current"
    # Only include the query parameter when the caller asked for it.
    params: dict[str, Any] = {"expand": expand} if expand else {}
    try:
        result = self.get(url, params=params)
    except HTTPError as e:
        if e.response.status_code == 403:
            raise ApiPermissionError(
                "The calling user does not have permission", reason=e
            )
        raise
    return result
def _wrap_methods(self) -> None: def _wrap_methods(self) -> None:
""" """
For each attribute that is callable (i.e., a method) and doesn't start with an underscore, For each attribute that is callable (i.e., a method) and doesn't start with an underscore,
@@ -306,6 +332,13 @@ def _validate_connector_configuration(
) )
spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1) spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1)
# uncomment the following for testing
# the following is an attempt to retrieve the user's timezone
# Unfortunately, all data is returned in UTC regardless of the user's time zone
# even though CQL parses incoming times based on the user's time zone
# space_key = spaces["results"][0]["key"]
# space_details = confluence_client_with_minimal_retries.cql(f"space.key={space_key}+AND+type=space")
if not spaces: if not spaces:
raise RuntimeError( raise RuntimeError(
f"No spaces found at {wiki_base}! " f"No spaces found at {wiki_base}! "