Feature/celery beat watchdog (#4534)

* upgrade celery to release version

* make the watchdog script more reusable

* use constant

* code review

* catch interrupt

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
rkuo-danswer
2025-04-15 15:05:37 -07:00
committed by GitHub
parent a8cba7abae
commit 2ac41c3719
8 changed files with 171 additions and 2 deletions

View File

@@ -152,7 +152,10 @@ class DynamicTenantScheduler(PersistentScheduler):
current_schedule = self.schedule.items()
# get potential new state
beat_multiplier = OnyxRuntime.get_beat_multiplier()
try:
beat_multiplier = OnyxRuntime.get_beat_multiplier()
except Exception:
beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT
new_schedule = self._generate_schedule(tenant_ids, beat_multiplier)

View File

@@ -226,6 +226,16 @@ if not MULTI_TENANT:
"queue": OnyxCeleryQueues.MONITORING,
},
},
{
"name": "celery-beat-heartbeat",
"task": OnyxCeleryTask.CELERY_BEAT_HEARTBEAT,
"schedule": timedelta(minutes=1),
"options": {
"priority": OnyxCeleryPriority.HIGHEST,
"expires": BEAT_EXPIRES_DEFAULT,
"queue": OnyxCeleryQueues.PRIMARY,
},
},
]
)

View File

@@ -6,6 +6,7 @@ import httpx
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.lock import Lock as RedisLock
from tenacity import RetryError
@@ -15,6 +16,7 @@ from onyx.background.celery.apps.app_base import task_logger
from onyx.background.celery.tasks.beat_schedule import BEAT_EXPIRES_DEFAULT
from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
from onyx.configs.constants import ONYX_CELERY_BEAT_HEARTBEAT_KEY
from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask
@@ -353,3 +355,16 @@ def cloud_beat_task_generator(
f"elapsed={time_elapsed:.2f}"
)
return True
@shared_task(name=OnyxCeleryTask.CELERY_BEAT_HEARTBEAT, ignore_result=True, bind=True)
def celery_beat_heartbeat(self: Task, *, tenant_id: str) -> None:
"""When this task runs, it writes a key to Redis with a TTL.
An external observer can check this key to figure out if the celery beat is still running.
"""
time_start = time.monotonic()
r: Redis = get_redis_client()
r.set(ONYX_CELERY_BEAT_HEARTBEAT_KEY, 1, ex=600)
time_elapsed = time.monotonic() - time_start
task_logger.info(f"celery_beat_heartbeat finished: " f"elapsed={time_elapsed:.2f}")

View File

@@ -425,6 +425,7 @@ class OnyxCeleryTask:
MONITOR_BACKGROUND_PROCESSES = "monitor_background_processes"
MONITOR_CELERY_QUEUES = "monitor_celery_queues"
MONITOR_PROCESS_MEMORY = "monitor_process_memory"
CELERY_BEAT_HEARTBEAT = "celery_beat_heartbeat"
KOMBU_MESSAGE_CLEANUP_TASK = "kombu_message_cleanup_task"
CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK = (
@@ -444,6 +445,9 @@ class OnyxCeleryTask:
AUTOGENERATE_USAGE_REPORT_TASK = "autogenerate_usage_report_task"
# this needs to correspond to the matching entry in supervisord
ONYX_CELERY_BEAT_HEARTBEAT_KEY = "onyx:celery:beat:heartbeat"
REDIS_SOCKET_KEEPALIVE_OPTIONS = {}
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPINTVL] = 15
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPCNT] = 3

View File

@@ -160,6 +160,7 @@ class ConfluenceConnector(
}
def set_allow_images(self, value: bool) -> None:
logger.info(f"Setting allow_images to {value}.")
self.allow_images = value
@property

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
import argparse
import subprocess
import time
from onyx.redis.redis_pool import get_redis_client
from onyx.utils.logger import setup_logger
logger = setup_logger()
MAX_AGE_SECONDS = 900 # how old the heartbeat can be
CHECK_INTERVAL = 60 # how often to check
MAX_LOOKUP_FAILURES = 5
def main(key: str, program: str, conf: str) -> None:
"""This script will restart the watchdog'd supervisord process via supervisorctl.
This process continually looks up a specific redis key. If it is missing for a
consecutive number of times and the last successful lookup is more
than a threshold time, the specified program will be restarted.
"""
logger.info(f"supervisord_watchdog starting: program={program} conf={conf}")
r = get_redis_client()
last_heartbeat = time.monotonic()
num_lookup_failures = 0
try:
while True:
time.sleep(CHECK_INTERVAL)
now = time.monotonic()
# check for the key ... handle any exception gracefully
try:
heartbeat = r.exists(key)
except Exception:
logger.exception(
f"Exception checking for celery beat heartbeat: key={key}."
)
continue
# happy path ... just continue
if heartbeat:
logger.debug(f"Key lookup succeeded: key={key}")
last_heartbeat = time.monotonic()
num_lookup_failures = 0
continue
# if we haven't exceeded the max lookup failures, continue
num_lookup_failures += 1
if num_lookup_failures <= MAX_LOOKUP_FAILURES:
logger.warning(
f"Key lookup failed: key={key} "
f"lookup_failures={num_lookup_failures} "
f"max_lookup_failures={MAX_LOOKUP_FAILURES}"
)
continue
# if we haven't exceeded the max missing key timeout threshold, continue
elapsed = now - last_heartbeat
if elapsed <= MAX_AGE_SECONDS:
logger.warning(
f"Key lookup failed: key={key} "
f"lookup_failures={num_lookup_failures} "
f"max_lookup_failures={MAX_LOOKUP_FAILURES} "
f"elapsed={elapsed:.2f} "
f"elapsed_threshold={MAX_AGE_SECONDS}"
)
continue
# all conditions have been exceeded ... restart the process
logger.warning(
f"Key lookup failure thresholds exceeded - restarting {program}: "
f"key={key} "
f"lookup_failures={num_lookup_failures} "
f"max_lookup_failures={MAX_LOOKUP_FAILURES} "
f"elapsed={elapsed:.2f} "
f"elapsed_threshold={MAX_AGE_SECONDS}"
)
subprocess.call(["supervisorctl", "-c", conf, "restart", program])
# reset state so that we properly delay until the next restart
# instead of continually restarting
num_lookup_failures = 0
last_heartbeat = time.monotonic()
except KeyboardInterrupt:
logger.info("Caught interrupt, exiting watchdog.")
logger.info("supervisord_watchdog exiting.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Supervisord Watchdog")
parser.add_argument("--key", help="The redis key to watch", required=True)
parser.add_argument(
"--program", help="The supervisord program to restart", required=True
)
parser.add_argument(
"--conf", type=str, help="Path to supervisord config file", required=True
)
args = parser.parse_args()
main(args.key, args.program, args.conf)
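
Note that a single missed heartbeat never triggers a restart: both thresholds must be exceeded, more than MAX_LOOKUP_FAILURES consecutive misses and more than MAX_AGE_SECONDS since the last successful lookup. A rough, hypothetical way to exercise the restart path by hand, assuming supervisord is running with the config below and the onyx Redis client is reachable (this test helper is not part of the commit):

import subprocess

from onyx.configs.constants import ONYX_CELERY_BEAT_HEARTBEAT_KEY
from onyx.redis.redis_pool import get_redis_client

CONF = "/etc/supervisor/conf.d/supervisord.conf"  # path used in the config below

# stop beat so the heartbeat key is no longer refreshed...
subprocess.call(["supervisorctl", "-c", CONF, "stop", "celery_beat"])
# ...and remove the current key so the watchdog starts seeing misses right away
get_redis_client().delete(ONYX_CELERY_BEAT_HEARTBEAT_KEY)
# after MAX_AGE_SECONDS plus a few check intervals, the watchdog should run
# "supervisorctl restart celery_beat" and the heartbeat key should reappear
# within a minute or so of beat coming back up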

View File

@@ -5,7 +5,7 @@ asyncpg==0.27.0
atlassian-python-api==3.41.16
beautifulsoup4==4.12.3
boto3==1.36.23
celery==5.5.0b4
celery==5.5.1
chardet==5.2.0
dask==2023.8.1
ddtrace==2.6.5

View File

@@ -3,6 +3,18 @@ nodaemon=true
user=root
logfile=/var/log/supervisord.log
# region enable supervisorctl usage
[supervisorctl]
serverurl=unix:///tmp/supervisor.sock
[unix_http_server]
file=/tmp/supervisor.sock
chmod=0700
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
# endregion enable supervisorctl usage
# Background jobs that must be run async due to long time to completion
# NOTE: due to an issue with Celery + SQLAlchemy
# (https://github.com/celery/celery/issues/7007#issuecomment-1740139367)
@@ -98,6 +110,20 @@ redirect_stderr=true
startsecs=10
stopasgroup=true
# watchdog to detect and restart celery beat in case of inactivity;
# supervisord's own autorestart only kicks in when the process dies, not when it hangs
# make sure the --key below matches ONYX_CELERY_BEAT_HEARTBEAT_KEY
[program:supervisord_watchdog_celery_beat]
command=python onyx/utils/supervisord_watchdog.py
--conf /etc/supervisor/conf.d/supervisord.conf
--key "onyx:celery:beat:heartbeat"
--program celery_beat
stdout_logfile=/var/log/supervisord_watchdog_celery_beat.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
startsecs=10
stopasgroup=true
# Listens for Slack messages and responds with answers
# for all channels that the OnyxBot has been added to.
# If not setup, this will just fail 5 times and then stop.
@@ -123,6 +149,7 @@ command=tail -qF
/var/log/celery_worker_user_files_indexing.log
/var/log/celery_worker_monitoring.log
/var/log/slack_bot.log
/var/log/supervisord_watchdog_celery_beat.log
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes = 0 # must be set to 0 when stdout_logfile=/dev/stdout
autorestart=true