Bugfix/confluence time zone (#3265)

* RedisLock typing

* checkpoint

* put in debug logging

* improve comments

* mypy fixes
This commit is contained in:
rkuo-danswer
2024-12-02 22:23:23 -08:00
committed by GitHub
parent 9e9b7ed61d
commit 5f28a1b0e4
5 changed files with 67 additions and 9 deletions

View File

@@ -11,6 +11,7 @@ from celery.exceptions import WorkerShutdown
from celery.states import READY_STATES from celery.states import READY_STATES
from celery.utils.log import get_task_logger from celery.utils.log import get_task_logger
from celery.worker import strategy # type: ignore from celery.worker import strategy # type: ignore
from redis.lock import Lock as RedisLock
from sentry_sdk.integrations.celery import CeleryIntegration from sentry_sdk.integrations.celery import CeleryIntegration
from sqlalchemy import text from sqlalchemy import text
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
@@ -332,16 +333,16 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
return return
logger.info("Releasing primary worker lock.") logger.info("Releasing primary worker lock.")
lock = sender.primary_worker_lock lock: RedisLock = sender.primary_worker_lock
try: try:
if lock.owned(): if lock.owned():
try: try:
lock.release() lock.release()
sender.primary_worker_lock = None sender.primary_worker_lock = None
except Exception as e: except Exception:
logger.error(f"Failed to release primary worker lock: {e}") logger.exception("Failed to release primary worker lock")
except Exception as e: except Exception:
logger.error(f"Failed to check if primary worker lock is owned: {e}") logger.exception("Failed to check if primary worker lock is owned")
def on_setup_logging( def on_setup_logging(

View File

@@ -11,6 +11,7 @@ from celery.signals import celeryd_init
from celery.signals import worker_init from celery.signals import worker_init
from celery.signals import worker_ready from celery.signals import worker_ready
from celery.signals import worker_shutdown from celery.signals import worker_shutdown
from redis.lock import Lock as RedisLock
import danswer.background.celery.apps.app_base as app_base import danswer.background.celery.apps.app_base as app_base
from danswer.background.celery.apps.app_base import task_logger from danswer.background.celery.apps.app_base import task_logger
@@ -116,7 +117,7 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
# it is planned to use this lock to enforce singleton behavior on the primary # it is planned to use this lock to enforce singleton behavior on the primary
# worker, since the primary worker does redis cleanup on startup, but this isn't # worker, since the primary worker does redis cleanup on startup, but this isn't
# implemented yet. # implemented yet.
lock = r.lock( lock: RedisLock = r.lock(
DanswerRedisLocks.PRIMARY_WORKER, DanswerRedisLocks.PRIMARY_WORKER,
timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT, timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
) )
@@ -227,7 +228,7 @@ class HubPeriodicTask(bootsteps.StartStopStep):
if not hasattr(worker, "primary_worker_lock"): if not hasattr(worker, "primary_worker_lock"):
return return
lock = worker.primary_worker_lock lock: RedisLock = worker.primary_worker_lock
r = get_redis_client(tenant_id=None) r = get_redis_client(tenant_id=None)

View File

@@ -308,6 +308,22 @@ CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD = int(
os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200_000) os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200_000)
) )
# Due to breakages in the confluence API, the timezone offset must be specified client side
# to match the user's specified timezone.
# The current state of affairs:
# CQL queries are parsed in the user's timezone and cannot be specified in UTC
# no API retrieves the user's timezone
# All data is returned in UTC, so we can't derive the user's timezone from that
# https://community.developer.atlassian.com/t/confluence-cloud-time-zone-get-via-rest-api/35954/16
# https://jira.atlassian.com/browse/CONFCLOUD-69670
# enter as a floating point offset from UTC in hours (-24 < val < 24)
# this will be applied globally, so it probably makes sense to transition this to per
# connector at some point.
CONFLUENCE_TIMEZONE_OFFSET = float(os.environ.get("CONFLUENCE_TIMEZONE_OFFSET", 1.0))
JIRA_CONNECTOR_LABELS_TO_SKIP = [ JIRA_CONNECTOR_LABELS_TO_SKIP = [
ignored_tag ignored_tag
for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",") for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")

View File

@@ -1,9 +1,11 @@
from datetime import datetime from datetime import datetime
from datetime import timedelta
from datetime import timezone from datetime import timezone
from typing import Any from typing import Any
from urllib.parse import quote from urllib.parse import quote
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource from danswer.configs.constants import DocumentSource
@@ -69,6 +71,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
# skip it. This is generally used to avoid indexing extra sensitive # skip it. This is generally used to avoid indexing extra sensitive
# pages. # pages.
labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP, labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
timezone_offset: float = CONFLUENCE_TIMEZONE_OFFSET,
) -> None: ) -> None:
self.batch_size = batch_size self.batch_size = batch_size
self.continue_on_failure = continue_on_failure self.continue_on_failure = continue_on_failure
@@ -104,6 +107,8 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
) )
self.cql_label_filter = f" and label not in ({comma_separated_labels})" self.cql_label_filter = f" and label not in ({comma_separated_labels})"
self.timezone: timezone = timezone(offset=timedelta(hours=timezone_offset))
@property @property
def confluence_client(self) -> OnyxConfluence: def confluence_client(self) -> OnyxConfluence:
if self._confluence_client is None: if self._confluence_client is None:
@@ -204,12 +209,14 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
confluence_page_ids: list[str] = [] confluence_page_ids: list[str] = []
page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter
logger.debug(f"page_query: {page_query}")
# Fetch pages as Documents # Fetch pages as Documents
for page in self.confluence_client.paginated_cql_retrieval( for page in self.confluence_client.paginated_cql_retrieval(
cql=page_query, cql=page_query,
expand=",".join(_PAGE_EXPANSION_FIELDS), expand=",".join(_PAGE_EXPANSION_FIELDS),
limit=self.batch_size, limit=self.batch_size,
): ):
logger.debug(f"_fetch_document_batches: {page['id']}")
confluence_page_ids.append(page["id"]) confluence_page_ids.append(page["id"])
doc = self._convert_object_to_document(page) doc = self._convert_object_to_document(page)
if doc is not None: if doc is not None:
@@ -242,10 +249,10 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
def poll_source(self, start: float, end: float) -> GenerateDocumentsOutput: def poll_source(self, start: float, end: float) -> GenerateDocumentsOutput:
# Add time filters # Add time filters
formatted_start_time = datetime.fromtimestamp(start, tz=timezone.utc).strftime( formatted_start_time = datetime.fromtimestamp(start, tz=self.timezone).strftime(
"%Y-%m-%d %H:%M" "%Y-%m-%d %H:%M"
) )
formatted_end_time = datetime.fromtimestamp(end, tz=timezone.utc).strftime( formatted_end_time = datetime.fromtimestamp(end, tz=self.timezone).strftime(
"%Y-%m-%d %H:%M" "%Y-%m-%d %H:%M"
) )
self.cql_time_filter = f" and lastmodified >= '{formatted_start_time}'" self.cql_time_filter = f" and lastmodified >= '{formatted_start_time}'"

View File

@@ -134,6 +134,32 @@ class OnyxConfluence(Confluence):
super(OnyxConfluence, self).__init__(url, *args, **kwargs) super(OnyxConfluence, self).__init__(url, *args, **kwargs)
self._wrap_methods() self._wrap_methods()
def get_current_user(self, expand: str | None = None) -> Any:
    """Return details for the currently authenticated Confluence user.

    The third-party client does not expose this endpoint, so it is
    implemented here directly against the REST API.

    :param expand: OPTIONAL expand for get status of user.
        Possible param is "status". Results are "Active, Deactivated"
    :return: Returns the user details
    :raises ApiPermissionError: if the server responds with HTTP 403
    """
    from atlassian.errors import ApiPermissionError  # type:ignore

    url = "rest/api/user/current"
    # Only include the query parameter when the caller asked for it.
    params: dict[str, Any] = {"expand": expand} if expand else {}
    try:
        result = self.get(url, params=params)
    except HTTPError as e:
        if e.response.status_code == 403:
            raise ApiPermissionError(
                "The calling user does not have permission", reason=e
            )
        raise
    return result
def _wrap_methods(self) -> None: def _wrap_methods(self) -> None:
""" """
For each attribute that is callable (i.e., a method) and doesn't start with an underscore, For each attribute that is callable (i.e., a method) and doesn't start with an underscore,
@@ -306,6 +332,13 @@ def _validate_connector_configuration(
) )
spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1) spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1)
# uncomment the following for testing
# the following is an attempt to retrieve the user's timezone
# Unfortunately, all data is returned in UTC regardless of the user's time zone
# even though CQL parses incoming times based on the user's time zone
# space_key = spaces["results"][0]["key"]
# space_details = confluence_client_with_minimal_retries.cql(f"space.key={space_key}+AND+type=space")
if not spaces: if not spaces:
raise RuntimeError( raise RuntimeError(
f"No spaces found at {wiki_base}! " f"No spaces found at {wiki_base}! "