From 5ed83f114839c9ba3abe53dba93c53165329cbd7 Mon Sep 17 00:00:00 2001 From: rkuo-danswer Date: Wed, 12 Feb 2025 22:31:01 -0800 Subject: [PATCH] =?UTF-8?q?no=20thread=20local=20locks=20in=20callbacks=20?= =?UTF-8?q?and=20raise=20permission=20sync=20timeout=20=E2=80=A6=20(#3977)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * no thread local locks in callbacks and raise permission sync timeout by a lot based on empirical log observations * more fixes --------- Co-authored-by: Richard Kuo (Danswer) --- .../background/celery/tasks/doc_permission_syncing/tasks.py | 1 + backend/onyx/background/celery/tasks/vespa/tasks.py | 3 +-- backend/onyx/configs/constants.py | 4 ++-- backend/onyx/redis/redis_connector_doc_perm_sync.py | 3 ++- backend/onyx/redis/redis_connector_prune.py | 3 ++- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py b/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py index a7ce5026a..baa523247 100644 --- a/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py +++ b/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py @@ -367,6 +367,7 @@ def connector_permission_sync_generator_task( OnyxRedisLocks.CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX + f"_{redis_connector.id}", timeout=CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT, + thread_local=False, ) acquired = lock.acquire(blocking=False) diff --git a/backend/onyx/background/celery/tasks/vespa/tasks.py b/backend/onyx/background/celery/tasks/vespa/tasks.py index 08f11212a..d033f9e92 100644 --- a/backend/onyx/background/celery/tasks/vespa/tasks.py +++ b/backend/onyx/background/celery/tasks/vespa/tasks.py @@ -120,9 +120,8 @@ def check_for_vespa_sync_task(self: Task, *, tenant_id: str | None) -> bool | No # endregion # check if any user groups are not synced + lock_beat.reacquire() if global_version.is_ee_version(): - lock_beat.reacquire() - try: fetch_user_groups = fetch_versioned_implementation( "onyx.db.user_group", "fetch_user_groups" diff --git a/backend/onyx/configs/constants.py b/backend/onyx/configs/constants.py index 8102af10c..a2d951365 100644 --- a/backend/onyx/configs/constants.py +++ b/backend/onyx/configs/constants.py @@ -107,9 +107,9 @@ CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT = 5 * 60 # 5 min # needs to be long enough to cover the maximum time it takes to download an object # if we can get callbacks as object bytes download, we could lower this a lot. -CELERY_PRUNING_LOCK_TIMEOUT = 300 # 5 min +CELERY_PRUNING_LOCK_TIMEOUT = 3600 # 1 hour (in seconds) -CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT = 300 # 5 min +CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT = 3600 # 1 hour (in seconds) CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT = 300 # 5 min diff --git a/backend/onyx/redis/redis_connector_doc_perm_sync.py b/backend/onyx/redis/redis_connector_doc_perm_sync.py index 70b9b4011..cc2716676 100644 --- a/backend/onyx/redis/redis_connector_doc_perm_sync.py +++ b/backend/onyx/redis/redis_connector_doc_perm_sync.py @@ -11,6 +11,7 @@ from redis.lock import Lock as RedisLock from onyx.access.models import DocExternalAccess from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT +from onyx.configs.constants import CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT from onyx.configs.constants import OnyxCeleryPriority from onyx.configs.constants import OnyxCeleryQueues from onyx.configs.constants import OnyxCeleryTask @@ -49,7 +50,7 @@ class RedisConnectorPermissionSync: # it's impossible to get the exact state of the system at a single point in time # so we need a signal with a TTL to bridge gaps in our checks ACTIVE_PREFIX = PREFIX + "_active" - ACTIVE_TTL = 3600 + ACTIVE_TTL = CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT * 2 def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None: self.tenant_id: str | None = tenant_id diff --git a/backend/onyx/redis/redis_connector_prune.py b/backend/onyx/redis/redis_connector_prune.py index 089dbbfdf..df401df5a 100644 --- a/backend/onyx/redis/redis_connector_prune.py +++ b/backend/onyx/redis/redis_connector_prune.py @@ -10,6 +10,7 @@ from redis.lock import Lock as RedisLock from sqlalchemy.orm import Session from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT +from onyx.configs.constants import CELERY_PRUNING_LOCK_TIMEOUT from onyx.configs.constants import OnyxCeleryPriority from onyx.configs.constants import OnyxCeleryQueues from onyx.configs.constants import OnyxCeleryTask @@ -49,7 +50,7 @@ class RedisConnectorPrune: # it's impossible to get the exact state of the system at a single point in time # so we need a signal with a TTL to bridge gaps in our checks ACTIVE_PREFIX = PREFIX + "_active" - ACTIVE_TTL = 3600 + ACTIVE_TTL = CELERY_PRUNING_LOCK_TIMEOUT * 2 def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None: self.tenant_id: str | None = tenant_id