Feature/celery beat watchdog (#4534)

* upgrade celery to release version * make the watchdog script more reusable * use constant * code review * catch interrupt --------- Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
2025-08-08 14:02:09 +02:00 · 2025-04-15 15:05:37 -07:00
parent a8cba7abae
commit 2ac41c3719
8 changed files with 171 additions and 2 deletions
--- a/backend/onyx/background/celery/apps/beat.py
+++ b/backend/onyx/background/celery/apps/beat.py
@@ -152,7 +152,10 @@ class DynamicTenantScheduler(PersistentScheduler):
        current_schedule = self.schedule.items()
        # get potential new state
        try:
            beat_multiplier = OnyxRuntime.get_beat_multiplier()
        except Exception:
            beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT
        new_schedule = self._generate_schedule(tenant_ids, beat_multiplier)
--- a/backend/onyx/background/celery/tasks/beat_schedule.py
+++ b/backend/onyx/background/celery/tasks/beat_schedule.py
@@ -226,6 +226,16 @@ if not MULTI_TENANT:
                    "queue": OnyxCeleryQueues.MONITORING,
                },
            },
            {
                "name": "celery-beat-heartbeat",
                "task": OnyxCeleryTask.CELERY_BEAT_HEARTBEAT,
                "schedule": timedelta(minutes=1),
                "options": {
                    "priority": OnyxCeleryPriority.HIGHEST,
                    "expires": BEAT_EXPIRES_DEFAULT,
                    "queue": OnyxCeleryQueues.PRIMARY,
                },
            },
        ]
    )
--- a/backend/onyx/background/celery/tasks/shared/tasks.py
+++ b/backend/onyx/background/celery/tasks/shared/tasks.py
@@ -6,6 +6,7 @@ import httpx
 from celery import shared_task
 from celery import Task
 from celery.exceptions import SoftTimeLimitExceeded
 from redis import Redis
 from redis.lock import Lock as RedisLock
 from tenacity import RetryError
@@ -15,6 +16,7 @@ from onyx.background.celery.apps.app_base import task_logger
 from onyx.background.celery.tasks.beat_schedule import BEAT_EXPIRES_DEFAULT
 from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
 from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
 from onyx.configs.constants import ONYX_CELERY_BEAT_HEARTBEAT_KEY
 from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
 from onyx.configs.constants import OnyxCeleryPriority
 from onyx.configs.constants import OnyxCeleryTask
@@ -353,3 +355,16 @@ def cloud_beat_task_generator(
        f"elapsed={time_elapsed:.2f}"
    )
    return True
@shared_task(name=OnyxCeleryTask.CELERY_BEAT_HEARTBEAT, ignore_result=True, bind=True)
def celery_beat_heartbeat(self: Task, *, tenant_id: str) -> None:
    """Refresh the celery beat heartbeat key in Redis.

    Sets ONYX_CELERY_BEAT_HEARTBEAT_KEY to 1 with a 600 second expiry, so the
    key disappears on its own if this task stops being run. An external
    observer can test for the key to decide whether the beat is still alive.

    Args:
        tenant_id: Accepted as a keyword argument; not read by this task body.
    """
    started_at = time.monotonic()

    redis_client: Redis = get_redis_client()
    # The expiry is what makes the key a liveness signal: no refresh, no key.
    redis_client.set(ONYX_CELERY_BEAT_HEARTBEAT_KEY, 1, ex=600)

    duration = time.monotonic() - started_at
    task_logger.info(f"celery_beat_heartbeat finished: elapsed={duration:.2f}")
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -425,6 +425,7 @@ class OnyxCeleryTask:
    MONITOR_BACKGROUND_PROCESSES = "monitor_background_processes"
    MONITOR_CELERY_QUEUES = "monitor_celery_queues"
    MONITOR_PROCESS_MEMORY = "monitor_process_memory"
    CELERY_BEAT_HEARTBEAT = "celery_beat_heartbeat"
    KOMBU_MESSAGE_CLEANUP_TASK = "kombu_message_cleanup_task"
    CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK = (
@@ -444,6 +445,9 @@ class OnyxCeleryTask:
    AUTOGENERATE_USAGE_REPORT_TASK = "autogenerate_usage_report_task"
 # this must match the --key argument passed to the watchdog program in supervisord.conf
 ONYX_CELERY_BEAT_HEARTBEAT_KEY = "onyx:celery:beat:heartbeat"
 REDIS_SOCKET_KEEPALIVE_OPTIONS = {}
 REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPINTVL] = 15
 REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPCNT] = 3
--- a/backend/onyx/connectors/confluence/connector.py
+++ b/backend/onyx/connectors/confluence/connector.py
@@ -160,6 +160,7 @@ class ConfluenceConnector(
        }
    def set_allow_images(self, value: bool) -> None:
        """Log the requested value, then store it on ``self.allow_images``.

        Args:
            value: The new setting to assign to ``allow_images``.
        """
        message = f"Setting allow_images to {value}."
        logger.info(message)
        self.allow_images = value
    @property
--- a/backend/onyx/utils/supervisord_watchdog.py
+++ b/backend/onyx/utils/supervisord_watchdog.py
@@ -0,0 +1,109 @@
 #!/usr/bin/env python3
 import argparse
 import subprocess
 import time
 from onyx.redis.redis_pool import get_redis_client
 from onyx.utils.logger import setup_logger
 logger = setup_logger()
 # Seconds that may elapse since the key was last seen before main() is
 # allowed to restart the program.
 MAX_AGE_SECONDS = 900  # how old the heartbeat can be
 # Seconds main() sleeps between consecutive lookups of the key.
 CHECK_INTERVAL = 60  # how often to check
 # Number of consecutive lookups with the key missing that main() tolerates
 # before it even considers the age threshold above.
 MAX_LOOKUP_FAILURES = 5
 def main(key: str, program: str, conf: str) -> None:
    """Watch a redis key and restart a supervisord program when it goes missing.

    Every CHECK_INTERVAL seconds the key is looked up in redis. The program is
    restarted via ``supervisorctl restart`` only when BOTH conditions hold:

    * the key has been missing for more than MAX_LOOKUP_FAILURES consecutive
      successful lookups, and
    * more than MAX_AGE_SECONDS have passed since the key was last seen
      (or since the watchdog started / last triggered a restart).

    A lookup that raises (e.g. redis unreachable) is logged and skipped; it
    neither counts as a missing key nor resets the counters.

    Args:
        key: The redis key whose existence signals that the program is alive.
        program: The supervisord program name passed to ``supervisorctl restart``.
        conf: Path to the supervisord config file passed to ``supervisorctl -c``.

    The loop runs until interrupted; KeyboardInterrupt is caught so the
    process exits cleanly.
    """
    logger.info(f"supervisord_watchdog starting: program={program} conf={conf}")
    r = get_redis_client()
    # Start the clock now so a freshly started watchdog gives the program a
    # full MAX_AGE_SECONDS grace period before the first possible restart.
    last_heartbeat = time.monotonic()
    num_lookup_failures = 0
    try:
        while True:
            time.sleep(CHECK_INTERVAL)
            now = time.monotonic()
            # check for the key ... handle any exception gracefully
            try:
                heartbeat = r.exists(key)
            except Exception:
                logger.exception(
                    f"Exception checking for celery beat heartbeat: key={key}."
                )
                continue
            # happy path ... just continue
            if heartbeat:
                logger.debug(f"Key lookup succeeded: key={key}")
                last_heartbeat = time.monotonic()
                num_lookup_failures = 0
                continue
            # if we haven't exceeded the max lookup failures, continue
            num_lookup_failures += 1
            if num_lookup_failures <= MAX_LOOKUP_FAILURES:
                logger.warning(
                    f"Key lookup failed: key={key} "
                    f"lookup_failures={num_lookup_failures} "
                    f"max_lookup_failures={MAX_LOOKUP_FAILURES}"
                )
                continue
            # if we haven't exceeded the max missing key timeout threshold, continue
            elapsed = now - last_heartbeat
            if elapsed <= MAX_AGE_SECONDS:
                logger.warning(
                    f"Key lookup failed: key={key} "
                    f"lookup_failures={num_lookup_failures} "
                    f"max_lookup_failures={MAX_LOOKUP_FAILURES} "
                    f"elapsed={elapsed:.2f} "
                    f"elapsed_threshold={MAX_AGE_SECONDS}"
                )
                continue
            # all conditions have been exceeded ... restart the process
            logger.warning(
                f"Key lookup failure thresholds exceeded - restarting {program}: "
                f"key={key} "
                f"lookup_failures={num_lookup_failures} "
                f"max_lookup_failures={MAX_LOOKUP_FAILURES} "
                f"elapsed={elapsed:.2f} "
                f"elapsed_threshold={MAX_AGE_SECONDS}"
            )
            # Surface restart failures instead of silently discarding them:
            # an OSError (e.g. supervisorctl not on PATH) must not kill the
            # watchdog loop, and a non-zero exit code must be visible in logs.
            try:
                exit_code = subprocess.call(
                    ["supervisorctl", "-c", conf, "restart", program]
                )
            except OSError:
                logger.exception(
                    f"Failed to execute supervisorctl: program={program} conf={conf}"
                )
            else:
                if exit_code != 0:
                    logger.error(
                        f"supervisorctl restart failed: program={program} "
                        f"conf={conf} exit_code={exit_code}"
                    )
            # reset state so that we properly delay until the next restart
            # instead of continually restarting
            num_lookup_failures = 0
            last_heartbeat = time.monotonic()
    except KeyboardInterrupt:
        logger.info("Caught interrupt, exiting watchdog.")
    logger.info("supervisord_watchdog exiting.")
 if __name__ == "__main__":
    # Script entry point: all three options are mandatory and are handed
    # straight to main().
    cli = argparse.ArgumentParser(description="Supervisord Watchdog")
    cli.add_argument("--key", required=True, help="The redis key to watch")
    cli.add_argument(
        "--program", required=True, help="The supervisord program to restart"
    )
    cli.add_argument(
        "--conf", required=True, type=str, help="Path to supervisord config file"
    )
    options = cli.parse_args()
    main(key=options.key, program=options.program, conf=options.conf)
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -5,7 +5,7 @@ asyncpg==0.27.0
 atlassian-python-api==3.41.16
 beautifulsoup4==4.12.3
 boto3==1.36.23
-celery==5.5.0b4
+celery==5.5.1
 chardet==5.2.0
 dask==2023.8.1
 ddtrace==2.6.5
--- a/backend/supervisord.conf
+++ b/backend/supervisord.conf
@@ -3,6 +3,18 @@ nodaemon=true
 user=root
 logfile=/var/log/supervisord.log
 # region enable supervisorctl usage
 [supervisorctl]
 serverurl=unix:///tmp/supervisor.sock
 [unix_http_server]
 file=/tmp/supervisor.sock
 chmod=0700
 [rpcinterface:supervisor]
 supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
 # endregion enable supervisorctl usage
 # Background jobs that must be run async due to long time to completion
 # NOTE: due to an issue with Celery + SQLAlchemy 
 # (https://github.com/celery/celery/issues/7007#issuecomment-1740139367)
@@ -98,6 +110,20 @@ redirect_stderr=true
 startsecs=10
 stopasgroup=true
 # watchdog to detect and restart the beat in case of inactivity
 # supervisord only restarts the process if it's dead
 # make sure this key matches ONYX_CELERY_BEAT_HEARTBEAT_KEY
 [program:supervisord_watchdog_celery_beat]
 command=python onyx/utils/supervisord_watchdog.py
    --conf /etc/supervisor/conf.d/supervisord.conf
    --key "onyx:celery:beat:heartbeat"
    --program celery_beat
 stdout_logfile=/var/log/supervisord_watchdog_celery_beat.log
 stdout_logfile_maxbytes=16MB
 redirect_stderr=true
 startsecs=10
 stopasgroup=true
 # Listens for Slack messages and responds with answers
 # for all channels that the OnyxBot has been added to.
 # If not set up, this will just fail 5 times and then stop.
@@ -123,6 +149,7 @@ command=tail -qF
    /var/log/celery_worker_user_files_indexing.log
    /var/log/celery_worker_monitoring.log
    /var/log/slack_bot.log
    /var/log/supervisord_watchdog_celery_beat.log
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes = 0  # must be set to 0 when stdout_logfile=/dev/stdout
 autorestart=true