permission sync in cloud and beat expiry adjustment (#3544)

* try fixing exception in cloud

* raise beat expiry ... 60 seconds might be starving certain tasks completely

* adjust expiry down to 10 min

* raise concurrency overflow for indexing worker.

* parent pid check

* fix comment

* fix parent pid check, also actually raise an exception from the task if the spawned task exit status is bad

* fix pid check

* some cleanup and task wait fixes

* review fixes

* comment some code so we don't change too many things at once

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
Co-authored-by: Richard Kuo <rkuo@rkuo.com>
Author: rkuo-danswer
Date: 2024-12-30 17:05:57 -08:00 (committed by GitHub)
parent a83c9b40d5
commit bec0f9fb23
8 changed files with 169 additions and 46 deletions

View File

@@ -60,7 +60,12 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
- SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=sender.concurrency)
+ # rkuo: been seeing transient connection exceptions here, so upping the connection count
+ # from just concurrency/concurrency to concurrency/concurrency*2
+ SqlEngine.init_engine(
+     pool_size=sender.concurrency, max_overflow=sender.concurrency * 2
+ )
app_base.wait_for_redis(sender, **kwargs)
app_base.wait_for_db(sender, **kwargs)
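A minimal sketch (not part of the diff) of what the pool settings above mean for the underlying SQLAlchemy engine; the DSN and the concurrency value are placeholders:

    from sqlalchemy import create_engine

    concurrency = 8  # hypothetical worker concurrency (sender.concurrency in the hunk above)
    engine = create_engine(
        "postgresql+psycopg2://user:pass@localhost/onyx",  # placeholder DSN
        pool_size=concurrency,         # connections kept open in the pool
        max_overflow=concurrency * 2,  # extra short-lived connections allowed under burst load
    )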

View File

@@ -19,7 +19,10 @@ task_acks_late = shared_config.task_acks_late
# Indexing worker specific ... this lets us track the transition to STARTED in redis
# We don't currently rely on this but it has the potential to be useful and
# indexing tasks are not high volume
- task_track_started = True
+ # we don't turn this on yet because celery occasionally runs tasks more than once
+ # which means a duplicate run might change the task state unexpectedly
+ # task_track_started = True
worker_concurrency = CELERY_WORKER_INDEXING_CONCURRENCY
worker_pool = "threads"

View File

@@ -4,6 +4,12 @@ from typing import Any
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask
+ # choosing 15 minutes because it roughly gives us enough time to process many tasks
+ # we might be able to reduce this greatly if we can run a unified
+ # loop across all tenants rather than tasks per tenant
+ BEAT_EXPIRES_DEFAULT = 15 * 60  # 15 minutes (in seconds)
+ # we set expires because it isn't necessary to queue up these tasks
+ # it's only important that they run relatively regularly
tasks_to_schedule = [
@@ -13,7 +19,7 @@ tasks_to_schedule = [
"schedule": timedelta(seconds=20),
"options": {
"priority": OnyxCeleryPriority.HIGH,
"expires": 60,
"expires": BEAT_EXPIRES_DEFAULT,
},
},
{
@@ -22,7 +28,7 @@ tasks_to_schedule = [
"schedule": timedelta(seconds=20),
"options": {
"priority": OnyxCeleryPriority.HIGH,
"expires": 60,
"expires": BEAT_EXPIRES_DEFAULT,
},
},
{
@@ -31,7 +37,7 @@ tasks_to_schedule = [
"schedule": timedelta(seconds=15),
"options": {
"priority": OnyxCeleryPriority.HIGH,
"expires": 60,
"expires": BEAT_EXPIRES_DEFAULT,
},
},
{
@@ -40,7 +46,7 @@ tasks_to_schedule = [
"schedule": timedelta(seconds=15),
"options": {
"priority": OnyxCeleryPriority.HIGH,
"expires": 60,
"expires": BEAT_EXPIRES_DEFAULT,
},
},
{
@@ -49,7 +55,7 @@ tasks_to_schedule = [
"schedule": timedelta(seconds=3600),
"options": {
"priority": OnyxCeleryPriority.LOWEST,
"expires": 60,
"expires": BEAT_EXPIRES_DEFAULT,
},
},
{
@@ -58,7 +64,7 @@ tasks_to_schedule = [
"schedule": timedelta(seconds=5),
"options": {
"priority": OnyxCeleryPriority.HIGH,
"expires": 60,
"expires": BEAT_EXPIRES_DEFAULT,
},
},
{
@@ -67,7 +73,7 @@ tasks_to_schedule = [
"schedule": timedelta(seconds=30),
"options": {
"priority": OnyxCeleryPriority.HIGH,
"expires": 60,
"expires": BEAT_EXPIRES_DEFAULT,
},
},
{
@@ -76,7 +82,7 @@ tasks_to_schedule = [
"schedule": timedelta(seconds=20),
"options": {
"priority": OnyxCeleryPriority.HIGH,
"expires": 60,
"expires": BEAT_EXPIRES_DEFAULT,
},
},
]
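A minimal sketch (not part of the diff) of how the `expires` option behaves for a beat-scheduled task; the app name, broker URL, and task name are hypothetical. If no worker consumes the message within the expiry window, Celery revokes it rather than running it late, so a 60-second window on a backlogged queue can starve a task entirely, which is what the 15-minute default addresses:

    from datetime import timedelta
    from celery import Celery

    demo_app = Celery("demo", broker="redis://localhost:6379/0")

    demo_app.conf.beat_schedule = {
        "demo-noop-every-20s": {
            "task": "demo.noop",
            "schedule": timedelta(seconds=20),
            # message is dropped (revoked) if still unconsumed after 15 minutes
            "options": {"priority": 1, "expires": 15 * 60},
        },
    }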

View File

@@ -1,6 +1,8 @@
+ import time
from datetime import datetime
from datetime import timedelta
from datetime import timezone
+ from time import sleep
from uuid import uuid4
from celery import Celery
@@ -18,6 +20,7 @@ from onyx.access.models import DocExternalAccess
from onyx.background.celery.apps.app_base import task_logger
from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.configs.constants import CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT
+ from onyx.configs.constants import CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT
from onyx.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from onyx.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX
from onyx.configs.constants import DocumentSource
@@ -91,7 +94,7 @@ def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> b
def check_for_doc_permissions_sync(self: Task, *, tenant_id: str | None) -> None:
r = get_redis_client(tenant_id=tenant_id)
- lock_beat = r.lock(
+ lock_beat: RedisLock = r.lock(
OnyxRedisLocks.CHECK_CONNECTOR_DOC_PERMISSIONS_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
@@ -219,6 +222,43 @@ def connector_permission_sync_generator_task(
r = get_redis_client(tenant_id=tenant_id)
+ # this wait is needed to avoid a race condition where
+ # the primary worker sends the task and it is immediately executed
+ # before the primary worker can finalize the fence
+ start = time.monotonic()
+ while True:
+     if time.monotonic() - start > CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT:
+         raise ValueError(
+             f"connector_permission_sync_generator_task - timed out waiting for fence to be ready: "
+             f"fence={redis_connector.permissions.fence_key}"
+         )
+     if not redis_connector.permissions.fenced:  # The fence must exist
+         raise ValueError(
+             f"connector_permission_sync_generator_task - fence not found: "
+             f"fence={redis_connector.permissions.fence_key}"
+         )
+     payload = redis_connector.permissions.payload  # The payload must exist
+     if not payload:
+         raise ValueError(
+             "connector_permission_sync_generator_task: payload invalid or not found"
+         )
+     if payload.celery_task_id is None:
+         logger.info(
+             f"connector_permission_sync_generator_task - Waiting for fence: "
+             f"fence={redis_connector.permissions.fence_key}"
+         )
+         sleep(1)
+         continue
+     logger.info(
+         f"connector_permission_sync_generator_task - Fence found, continuing...: "
+         f"fence={redis_connector.permissions.fence_key}"
+     )
+     break
lock: RedisLock = r.lock(
OnyxRedisLocks.CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX
+ f"_{redis_connector.id}",
@@ -254,8 +294,11 @@ def connector_permission_sync_generator_task(
if not payload:
raise ValueError(f"No fence payload found: cc_pair={cc_pair_id}")
- payload.started = datetime.now(timezone.utc)
- redis_connector.permissions.set_fence(payload)
+ new_payload = RedisConnectorPermissionSyncPayload(
+     started=datetime.now(timezone.utc),
+     celery_task_id=payload.celery_task_id,
+ )
+ redis_connector.permissions.set_fence(new_payload)
document_external_accesses: list[DocExternalAccess] = doc_sync_func(cc_pair)
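A minimal sketch (not part of the diff) distilling the wait-for-fence pattern added above: poll until the primary worker has finalized the fence payload with a celery_task_id, and give up after a timeout. The `fence` argument stands in for objects shaped like redis_connector.permissions:

    import time

    WAIT_FOR_FENCE_TIMEOUT = 5 * 60  # mirrors CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT

    def wait_for_fence(fence, timeout: float = WAIT_FOR_FENCE_TIMEOUT) -> None:
        # `fence` exposes fence_key, fenced, and payload (which carries a celery_task_id)
        start = time.monotonic()
        while True:
            if time.monotonic() - start > timeout:
                raise TimeoutError(f"fence never became ready: {fence.fence_key}")
            if not fence.fenced:
                raise ValueError(f"fence not found: {fence.fence_key}")
            payload = fence.payload
            if payload is None:
                raise ValueError("fence payload invalid or not found")
            if payload.celery_task_id is None:
                time.sleep(1)  # primary worker has not finalized the fence yet
                continue
            return  # fence is ready; safe to proceed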

View File

@@ -97,7 +97,7 @@ def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
def check_for_external_group_sync(self: Task, *, tenant_id: str | None) -> None:
r = get_redis_client(tenant_id=tenant_id)
- lock_beat = r.lock(
+ lock_beat: RedisLock = r.lock(
OnyxRedisLocks.CHECK_CONNECTOR_EXTERNAL_GROUP_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
@@ -162,7 +162,7 @@ def try_creating_external_group_sync_task(
LOCK_TIMEOUT = 30
- lock = r.lock(
+ lock: RedisLock = r.lock(
DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_generate_external_group_sync_tasks",
timeout=LOCK_TIMEOUT,
)
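A minimal sketch (not part of the diff) of the redis-py locking pattern these annotations describe; the lock name and timeout are hypothetical:

    from redis import Redis
    from redis.lock import Lock as RedisLock

    r = Redis()
    lock_beat: RedisLock = r.lock("demo_beat_lock", timeout=60)
    if lock_beat.acquire(blocking=False):
        try:
            pass  # do the periodic check while holding the lock
        finally:
            if lock_beat.owned():
                lock_beat.release()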

View File

@@ -1,3 +1,5 @@
+ import os
+ import sys
import time
from datetime import datetime
from datetime import timezone
@@ -23,6 +25,7 @@ from onyx.background.indexing.job_client import SimpleJobClient
from onyx.background.indexing.run_indexing import run_indexing_entrypoint
from onyx.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP
from onyx.configs.constants import CELERY_INDEXING_LOCK_TIMEOUT
+ from onyx.configs.constants import CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT
from onyx.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from onyx.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX
from onyx.configs.constants import DocumentSource
@@ -71,14 +74,18 @@ logger = setup_logger()
class IndexingCallback(IndexingHeartbeatInterface):
+ PARENT_CHECK_INTERVAL = 60
def __init__(
self,
+ parent_pid: int,
stop_key: str,
generator_progress_key: str,
redis_lock: RedisLock,
redis_client: Redis,
):
super().__init__()
+ self.parent_pid = parent_pid
self.redis_lock: RedisLock = redis_lock
self.stop_key: str = stop_key
self.generator_progress_key: str = generator_progress_key
@@ -89,12 +96,31 @@ class IndexingCallback(IndexingHeartbeatInterface):
self.last_tag: str = "IndexingCallback.__init__"
self.last_lock_reacquire: datetime = datetime.now(timezone.utc)
+ self.last_parent_check = time.monotonic()
def should_stop(self) -> bool:
if self.redis_client.exists(self.stop_key):
return True
return False
def progress(self, tag: str, amount: int) -> None:
+ # rkuo: this shouldn't be necessary yet because we spawn the process this runs inside
+ # with daemon = True. It seems likely some indexing tasks will need to spawn other processes eventually
+ # so leave this code in until we're ready to test it.
+ # if self.parent_pid:
+ #     # check if the parent pid is alive so we aren't running as a zombie
+ #     now = time.monotonic()
+ #     if now - self.last_parent_check > IndexingCallback.PARENT_CHECK_INTERVAL:
+ #         try:
+ #             # this is unintuitive, but it checks if the parent pid is still running
+ #             os.kill(self.parent_pid, 0)
+ #         except Exception:
+ #             logger.exception("IndexingCallback - parent pid check exceptioned")
+ #             raise
+ #         self.last_parent_check = now
try:
self.redis_lock.reacquire()
self.last_tag = tag
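A minimal sketch (not part of the diff) of the parent-liveness check the commented-out block above would perform. os.kill(pid, 0) sends no signal; it only verifies that the pid still exists, which lets a spawned worker notice when it has outlived a dead parent:

    import os

    def parent_is_alive(parent_pid: int) -> bool:
        try:
            os.kill(parent_pid, 0)  # signal 0 probes the pid without actually signaling it
        except ProcessLookupError:
            return False  # no such process; the parent is gone
        except PermissionError:
            return True  # process exists but is owned by another user
        return True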
@@ -772,7 +798,6 @@ def connector_indexing_proxy_task(
return
task_logger.info(
f"Indexing proxy - spawn succeeded: attempt={index_attempt_id} "
f"Indexing watchdog - spawn succeeded: attempt={index_attempt_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
@@ -789,23 +814,26 @@ def connector_indexing_proxy_task(
# if the job is done, clean up and break
if job.done():
if job.status == "error":
ignore_exitcode = False
try:
if job.status == "error":
ignore_exitcode = False
exit_code: int | None = None
if job.process:
exit_code = job.process.exitcode
exit_code: int | None = None
if job.process:
exit_code = job.process.exitcode
# seeing odd behavior where spawned tasks usually return exit code 1 in the cloud,
# even though logging clearly indicates that they completed successfully
# to work around this, we ignore the job error state if the completion signal is OK
status_int = redis_connector_index.get_completion()
if status_int:
status_enum = HTTPStatus(status_int)
if status_enum == HTTPStatus.OK:
ignore_exitcode = True
# seeing odd behavior where spawned tasks usually return exit code 1 in the cloud,
# even though logging clearly indicates successful completion
# to work around this, we ignore the job error state if the completion signal is OK
status_int = redis_connector_index.get_completion()
if status_int:
status_enum = HTTPStatus(status_int)
if status_enum == HTTPStatus.OK:
ignore_exitcode = True
if not ignore_exitcode:
raise RuntimeError("Spawned task exceptioned.")
if ignore_exitcode:
task_logger.warning(
"Indexing watchdog - spawned task has non-zero exit code "
"but completion signal is OK. Continuing...: "
@@ -815,18 +843,21 @@ def connector_indexing_proxy_task(
f"search_settings={search_settings_id} "
f"exit_code={exit_code}"
)
-     else:
-         task_logger.error(
-             "Indexing watchdog - spawned task exceptioned: "
-             f"attempt={index_attempt_id} "
-             f"tenant={tenant_id} "
-             f"cc_pair={cc_pair_id} "
-             f"search_settings={search_settings_id} "
-             f"exit_code={exit_code} "
-             f"error={job.exception()}"
-         )
+ except Exception:
+     task_logger.error(
+         "Indexing watchdog - spawned task exceptioned: "
+         f"attempt={index_attempt_id} "
+         f"tenant={tenant_id} "
+         f"cc_pair={cc_pair_id} "
+         f"search_settings={search_settings_id} "
+         f"exit_code={exit_code} "
+         f"error={job.exception()}"
+     )
+     raise
+ finally:
+     job.release()
- job.release()
break
# if a termination signal is detected, clean up and break
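A minimal sketch (not part of the diff) of the workaround the try block above encodes: treat the completion signal stored in Redis as the source of truth and only trust the process exit code when no signal is present. The status_int value stands in for redis_connector_index.get_completion():

    from http import HTTPStatus

    def spawned_task_succeeded(exit_code: int | None, status_int: int | None) -> bool:
        # a stored HTTPStatus.OK overrides the unreliable exit code seen in the cloud
        if status_int is not None and HTTPStatus(status_int) == HTTPStatus.OK:
            return True
        return exit_code == 0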
@@ -911,7 +942,7 @@ def connector_indexing_task_wrapper(
tenant_id,
is_ee,
)
- except:
+ except Exception:
logger.exception(
f"connector_indexing_task exceptioned: "
f"tenant={tenant_id} "
@@ -919,7 +950,14 @@ def connector_indexing_task_wrapper(
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
)
- raise
+ # There is a cloud related bug outside of our code
+ # where spawned tasks return with an exit code of 1.
+ # Unfortunately, exceptions also return with an exit code of 1,
+ # so just raising an exception isn't informative
+ # Exiting with 255 makes it possible to distinguish between normal exits
+ # and exceptions.
+ sys.exit(255)
return result
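A minimal sketch (not part of the diff) of why the wrapper exits with 255: a child process that raises normally surfaces as exit code 1, which is indistinguishable from the spurious exit code 1 seen in the cloud, while sys.exit(255) gives the watchdog an unambiguous failure signal. The child function here is hypothetical:

    import multiprocessing
    import sys

    def child(fail: bool) -> None:
        try:
            if fail:
                raise RuntimeError("simulated task failure")
        except Exception:
            sys.exit(255)  # distinguishable from the ambiguous exit code 1

    if __name__ == "__main__":
        p = multiprocessing.Process(target=child, args=(True,), daemon=True)
        p.start()
        p.join()
        print(p.exitcode)  # prints 255; an unhandled exception would print 1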
@@ -991,7 +1029,17 @@ def connector_indexing_task(
f"fence={redis_connector.stop.fence_key}"
)
+ # this wait is needed to avoid a race condition where
+ # the primary worker sends the task and it is immediately executed
+ # before the primary worker can finalize the fence
+ start = time.monotonic()
+ while True:
+     if time.monotonic() - start > CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT:
+         raise ValueError(
+             f"connector_indexing_task - timed out waiting for fence to be ready: "
+             f"fence={redis_connector.permissions.fence_key}"
+         )
if not redis_connector_index.fenced: # The fence must exist
raise ValueError(
f"connector_indexing_task - fence not found: fence={redis_connector_index.fence_key}"
@@ -1032,7 +1080,9 @@ def connector_indexing_task(
if not acquired:
logger.warning(
f"Indexing task already running, exiting...: "
f"index_attempt={index_attempt_id} cc_pair={cc_pair_id} search_settings={search_settings_id}"
f"index_attempt={index_attempt_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
)
return None
@@ -1068,6 +1118,7 @@ def connector_indexing_task(
# define a callback class
callback = IndexingCallback(
+ os.getppid(),
redis_connector.stop.fence_key,
redis_connector_index.generator_progress_key,
lock,
@@ -1101,8 +1152,19 @@ def connector_indexing_task(
f"search_settings={search_settings_id}"
)
if attempt_found:
- with get_session_with_tenant(tenant_id) as db_session:
-     mark_attempt_failed(index_attempt_id, db_session, failure_reason=str(e))
+ try:
+     with get_session_with_tenant(tenant_id) as db_session:
+         mark_attempt_failed(
+             index_attempt_id, db_session, failure_reason=str(e)
+         )
+ except Exception:
+     logger.exception(
+         "Indexing watchdog - transient exception looking up index attempt: "
+         f"attempt={index_attempt_id} "
+         f"tenant={tenant_id} "
+         f"cc_pair={cc_pair_id} "
+         f"search_settings={search_settings_id}"
+     )
raise e
finally:

View File

@@ -283,6 +283,7 @@ def connector_pruning_generator_task(
)
callback = IndexingCallback(
+ 0,
redis_connector.stop.fence_key,
redis_connector.prune.generator_progress_key,
lock,

View File

@@ -83,6 +83,9 @@ CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120
# if we can get callbacks as object bytes download, we could lower this a lot.
CELERY_INDEXING_LOCK_TIMEOUT = 3 * 60 * 60 # 60 min
+ # how long a task should wait for associated fence to be ready
+ CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT = 5 * 60  # 5 min
# needs to be long enough to cover the maximum time it takes to download an object
# if we can get callbacks as object bytes download, we could lower this a lot.
CELERY_PRUNING_LOCK_TIMEOUT = 300 # 5 min