mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-27 20:38:32 +02:00
Bugfix/confluence time zone (#3265)
* RedisLock typing * checkpoint * put in debug logging * improve comments * mypy fixes
This commit is contained in:
@@ -11,6 +11,7 @@ from celery.exceptions import WorkerShutdown
|
|||||||
from celery.states import READY_STATES
|
from celery.states import READY_STATES
|
||||||
from celery.utils.log import get_task_logger
|
from celery.utils.log import get_task_logger
|
||||||
from celery.worker import strategy # type: ignore
|
from celery.worker import strategy # type: ignore
|
||||||
|
from redis.lock import Lock as RedisLock
|
||||||
from sentry_sdk.integrations.celery import CeleryIntegration
|
from sentry_sdk.integrations.celery import CeleryIntegration
|
||||||
from sqlalchemy import text
|
from sqlalchemy import text
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
@@ -332,16 +333,16 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
|
|||||||
return
|
return
|
||||||
|
|
||||||
logger.info("Releasing primary worker lock.")
|
logger.info("Releasing primary worker lock.")
|
||||||
lock = sender.primary_worker_lock
|
lock: RedisLock = sender.primary_worker_lock
|
||||||
try:
|
try:
|
||||||
if lock.owned():
|
if lock.owned():
|
||||||
try:
|
try:
|
||||||
lock.release()
|
lock.release()
|
||||||
sender.primary_worker_lock = None
|
sender.primary_worker_lock = None
|
||||||
except Exception as e:
|
except Exception:
|
||||||
logger.error(f"Failed to release primary worker lock: {e}")
|
logger.exception("Failed to release primary worker lock")
|
||||||
except Exception as e:
|
except Exception:
|
||||||
logger.error(f"Failed to check if primary worker lock is owned: {e}")
|
logger.exception("Failed to check if primary worker lock is owned")
|
||||||
|
|
||||||
|
|
||||||
def on_setup_logging(
|
def on_setup_logging(
|
||||||
|
@@ -11,6 +11,7 @@ from celery.signals import celeryd_init
|
|||||||
from celery.signals import worker_init
|
from celery.signals import worker_init
|
||||||
from celery.signals import worker_ready
|
from celery.signals import worker_ready
|
||||||
from celery.signals import worker_shutdown
|
from celery.signals import worker_shutdown
|
||||||
|
from redis.lock import Lock as RedisLock
|
||||||
|
|
||||||
import danswer.background.celery.apps.app_base as app_base
|
import danswer.background.celery.apps.app_base as app_base
|
||||||
from danswer.background.celery.apps.app_base import task_logger
|
from danswer.background.celery.apps.app_base import task_logger
|
||||||
@@ -116,7 +117,7 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
|
|||||||
# it is planned to use this lock to enforce singleton behavior on the primary
|
# it is planned to use this lock to enforce singleton behavior on the primary
|
||||||
# worker, since the primary worker does redis cleanup on startup, but this isn't
|
# worker, since the primary worker does redis cleanup on startup, but this isn't
|
||||||
# implemented yet.
|
# implemented yet.
|
||||||
lock = r.lock(
|
lock: RedisLock = r.lock(
|
||||||
DanswerRedisLocks.PRIMARY_WORKER,
|
DanswerRedisLocks.PRIMARY_WORKER,
|
||||||
timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
|
timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
|
||||||
)
|
)
|
||||||
@@ -227,7 +228,7 @@ class HubPeriodicTask(bootsteps.StartStopStep):
|
|||||||
if not hasattr(worker, "primary_worker_lock"):
|
if not hasattr(worker, "primary_worker_lock"):
|
||||||
return
|
return
|
||||||
|
|
||||||
lock = worker.primary_worker_lock
|
lock: RedisLock = worker.primary_worker_lock
|
||||||
|
|
||||||
r = get_redis_client(tenant_id=None)
|
r = get_redis_client(tenant_id=None)
|
||||||
|
|
||||||
|
@@ -308,6 +308,22 @@ CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD = int(
|
|||||||
os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200_000)
|
os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200_000)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Due to breakages in the confluence API, the timezone offset must be specified client side
|
||||||
|
# to match the user's specified timezone.
|
||||||
|
|
||||||
|
# The current state of affairs:
|
||||||
|
# CQL queries are parsed in the user's timezone and cannot be specified in UTC
|
||||||
|
# no API retrieves the user's timezone
|
||||||
|
# All data is returned in UTC, so we can't derive the user's timezone from that
|
||||||
|
|
||||||
|
# https://community.developer.atlassian.com/t/confluence-cloud-time-zone-get-via-rest-api/35954/16
|
||||||
|
# https://jira.atlassian.com/browse/CONFCLOUD-69670
|
||||||
|
|
||||||
|
# enter as a floating point offset from UTC in hours (-24 < val < 24)
|
||||||
|
# this will be applied globally, so it probably makes sense to transition this to per
|
||||||
|
# connector at some point.
|
||||||
|
CONFLUENCE_TIMEZONE_OFFSET = float(os.environ.get("CONFLUENCE_TIMEZONE_OFFSET", 1.0))
|
||||||
|
|
||||||
JIRA_CONNECTOR_LABELS_TO_SKIP = [
|
JIRA_CONNECTOR_LABELS_TO_SKIP = [
|
||||||
ignored_tag
|
ignored_tag
|
||||||
for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
|
for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
|
||||||
|
@@ -1,9 +1,11 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from datetime import timedelta
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
|
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
|
||||||
|
from danswer.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
|
||||||
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
from danswer.configs.constants import DocumentSource
|
from danswer.configs.constants import DocumentSource
|
||||||
@@ -69,6 +71,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
|
|||||||
# skip it. This is generally used to avoid indexing extra sensitive
|
# skip it. This is generally used to avoid indexing extra sensitive
|
||||||
# pages.
|
# pages.
|
||||||
labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
|
labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
|
||||||
|
timezone_offset: float = CONFLUENCE_TIMEZONE_OFFSET,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.continue_on_failure = continue_on_failure
|
self.continue_on_failure = continue_on_failure
|
||||||
@@ -104,6 +107,8 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
|
|||||||
)
|
)
|
||||||
self.cql_label_filter = f" and label not in ({comma_separated_labels})"
|
self.cql_label_filter = f" and label not in ({comma_separated_labels})"
|
||||||
|
|
||||||
|
self.timezone: timezone = timezone(offset=timedelta(hours=timezone_offset))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def confluence_client(self) -> OnyxConfluence:
|
def confluence_client(self) -> OnyxConfluence:
|
||||||
if self._confluence_client is None:
|
if self._confluence_client is None:
|
||||||
@@ -204,12 +209,14 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
|
|||||||
confluence_page_ids: list[str] = []
|
confluence_page_ids: list[str] = []
|
||||||
|
|
||||||
page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter
|
page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter
|
||||||
|
logger.debug(f"page_query: {page_query}")
|
||||||
# Fetch pages as Documents
|
# Fetch pages as Documents
|
||||||
for page in self.confluence_client.paginated_cql_retrieval(
|
for page in self.confluence_client.paginated_cql_retrieval(
|
||||||
cql=page_query,
|
cql=page_query,
|
||||||
expand=",".join(_PAGE_EXPANSION_FIELDS),
|
expand=",".join(_PAGE_EXPANSION_FIELDS),
|
||||||
limit=self.batch_size,
|
limit=self.batch_size,
|
||||||
):
|
):
|
||||||
|
logger.debug(f"_fetch_document_batches: {page['id']}")
|
||||||
confluence_page_ids.append(page["id"])
|
confluence_page_ids.append(page["id"])
|
||||||
doc = self._convert_object_to_document(page)
|
doc = self._convert_object_to_document(page)
|
||||||
if doc is not None:
|
if doc is not None:
|
||||||
@@ -242,10 +249,10 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
|
|||||||
|
|
||||||
def poll_source(self, start: float, end: float) -> GenerateDocumentsOutput:
|
def poll_source(self, start: float, end: float) -> GenerateDocumentsOutput:
|
||||||
# Add time filters
|
# Add time filters
|
||||||
formatted_start_time = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
|
formatted_start_time = datetime.fromtimestamp(start, tz=self.timezone).strftime(
|
||||||
"%Y-%m-%d %H:%M"
|
"%Y-%m-%d %H:%M"
|
||||||
)
|
)
|
||||||
formatted_end_time = datetime.fromtimestamp(end, tz=timezone.utc).strftime(
|
formatted_end_time = datetime.fromtimestamp(end, tz=self.timezone).strftime(
|
||||||
"%Y-%m-%d %H:%M"
|
"%Y-%m-%d %H:%M"
|
||||||
)
|
)
|
||||||
self.cql_time_filter = f" and lastmodified >= '{formatted_start_time}'"
|
self.cql_time_filter = f" and lastmodified >= '{formatted_start_time}'"
|
||||||
|
@@ -134,6 +134,32 @@ class OnyxConfluence(Confluence):
|
|||||||
super(OnyxConfluence, self).__init__(url, *args, **kwargs)
|
super(OnyxConfluence, self).__init__(url, *args, **kwargs)
|
||||||
self._wrap_methods()
|
self._wrap_methods()
|
||||||
|
|
||||||
|
def get_current_user(self, expand: str | None = None) -> Any:
|
||||||
|
"""
|
||||||
|
Implements a method that isn't in the third party client.
|
||||||
|
|
||||||
|
Get information about the current user
|
||||||
|
:param expand: OPTIONAL field to expand, used to retrieve the user's status.
|
||||||
|
Possible value is "status". Resulting statuses are "Active" or "Deactivated".
|
||||||
|
:return: Returns the user details
|
||||||
|
"""
|
||||||
|
|
||||||
|
from atlassian.errors import ApiPermissionError # type:ignore
|
||||||
|
|
||||||
|
url = "rest/api/user/current"
|
||||||
|
params = {}
|
||||||
|
if expand:
|
||||||
|
params["expand"] = expand
|
||||||
|
try:
|
||||||
|
response = self.get(url, params=params)
|
||||||
|
except HTTPError as e:
|
||||||
|
if e.response.status_code == 403:
|
||||||
|
raise ApiPermissionError(
|
||||||
|
"The calling user does not have permission", reason=e
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
return response
|
||||||
|
|
||||||
def _wrap_methods(self) -> None:
|
def _wrap_methods(self) -> None:
|
||||||
"""
|
"""
|
||||||
For each attribute that is callable (i.e., a method) and doesn't start with an underscore,
|
For each attribute that is callable (i.e., a method) and doesn't start with an underscore,
|
||||||
@@ -306,6 +332,13 @@ def _validate_connector_configuration(
|
|||||||
)
|
)
|
||||||
spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1)
|
spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1)
|
||||||
|
|
||||||
|
# uncomment the following for testing
|
||||||
|
# the following is an attempt to retrieve the user's timezone
|
||||||
|
# Unfortunately, all data is returned in UTC regardless of the user's time zone
|
||||||
|
# even though CQL parses incoming times based on the user's time zone
|
||||||
|
# space_key = spaces["results"][0]["key"]
|
||||||
|
# space_details = confluence_client_with_minimal_retries.cql(f"space.key={space_key}+AND+type=space")
|
||||||
|
|
||||||
if not spaces:
|
if not spaces:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"No spaces found at {wiki_base}! "
|
f"No spaces found at {wiki_base}! "
|
||||||
|
Reference in New Issue
Block a user