Feature/celery beat watchdog (#4534)

* upgrade celery to release version

* make the watchdog script more reusable

* use constant

* code review

* catch interrupt

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
This commit is contained in:
rkuo-danswer
2025-04-15 15:05:37 -07:00
committed by GitHub
parent a8cba7abae
commit 2ac41c3719
8 changed files with 171 additions and 2 deletions

View File

@@ -152,7 +152,10 @@ class DynamicTenantScheduler(PersistentScheduler):
current_schedule = self.schedule.items() current_schedule = self.schedule.items()
# get potential new state # get potential new state
try:
beat_multiplier = OnyxRuntime.get_beat_multiplier() beat_multiplier = OnyxRuntime.get_beat_multiplier()
except Exception:
beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT
new_schedule = self._generate_schedule(tenant_ids, beat_multiplier) new_schedule = self._generate_schedule(tenant_ids, beat_multiplier)

View File

@@ -226,6 +226,16 @@ if not MULTI_TENANT:
"queue": OnyxCeleryQueues.MONITORING, "queue": OnyxCeleryQueues.MONITORING,
}, },
}, },
{
"name": "celery-beat-heartbeat",
"task": OnyxCeleryTask.CELERY_BEAT_HEARTBEAT,
"schedule": timedelta(minutes=1),
"options": {
"priority": OnyxCeleryPriority.HIGHEST,
"expires": BEAT_EXPIRES_DEFAULT,
"queue": OnyxCeleryQueues.PRIMARY,
},
},
] ]
) )

View File

@@ -6,6 +6,7 @@ import httpx
from celery import shared_task from celery import shared_task
from celery import Task from celery import Task
from celery.exceptions import SoftTimeLimitExceeded from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.lock import Lock as RedisLock from redis.lock import Lock as RedisLock
from tenacity import RetryError from tenacity import RetryError
@@ -15,6 +16,7 @@ from onyx.background.celery.apps.app_base import task_logger
from onyx.background.celery.tasks.beat_schedule import BEAT_EXPIRES_DEFAULT from onyx.background.celery.tasks.beat_schedule import BEAT_EXPIRES_DEFAULT
from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
from onyx.configs.constants import ONYX_CELERY_BEAT_HEARTBEAT_KEY
from onyx.configs.constants import ONYX_CLOUD_TENANT_ID from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
from onyx.configs.constants import OnyxCeleryPriority from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask from onyx.configs.constants import OnyxCeleryTask
@@ -353,3 +355,16 @@ def cloud_beat_task_generator(
f"elapsed={time_elapsed:.2f}" f"elapsed={time_elapsed:.2f}"
) )
return True return True
@shared_task(name=OnyxCeleryTask.CELERY_BEAT_HEARTBEAT, ignore_result=True, bind=True)
def celery_beat_heartbeat(self: Task, *, tenant_id: str) -> None:
"""When this task runs, it writes a key to Redis with a TTL.
An external observer can check this key to figure out if the celery beat is still running.
"""
time_start = time.monotonic()
r: Redis = get_redis_client()
r.set(ONYX_CELERY_BEAT_HEARTBEAT_KEY, 1, ex=600)
time_elapsed = time.monotonic() - time_start
task_logger.info(f"celery_beat_heartbeat finished: " f"elapsed={time_elapsed:.2f}")

View File

@@ -425,6 +425,7 @@ class OnyxCeleryTask:
MONITOR_BACKGROUND_PROCESSES = "monitor_background_processes" MONITOR_BACKGROUND_PROCESSES = "monitor_background_processes"
MONITOR_CELERY_QUEUES = "monitor_celery_queues" MONITOR_CELERY_QUEUES = "monitor_celery_queues"
MONITOR_PROCESS_MEMORY = "monitor_process_memory" MONITOR_PROCESS_MEMORY = "monitor_process_memory"
CELERY_BEAT_HEARTBEAT = "celery_beat_heartbeat"
KOMBU_MESSAGE_CLEANUP_TASK = "kombu_message_cleanup_task" KOMBU_MESSAGE_CLEANUP_TASK = "kombu_message_cleanup_task"
CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK = ( CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK = (
@@ -444,6 +445,9 @@ class OnyxCeleryTask:
AUTOGENERATE_USAGE_REPORT_TASK = "autogenerate_usage_report_task" AUTOGENERATE_USAGE_REPORT_TASK = "autogenerate_usage_report_task"
# this needs to correspond to the matching entry in supervisord
ONYX_CELERY_BEAT_HEARTBEAT_KEY = "onyx:celery:beat:heartbeat"
REDIS_SOCKET_KEEPALIVE_OPTIONS = {} REDIS_SOCKET_KEEPALIVE_OPTIONS = {}
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPINTVL] = 15 REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPINTVL] = 15
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPCNT] = 3 REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPCNT] = 3

View File

@@ -160,6 +160,7 @@ class ConfluenceConnector(
} }
def set_allow_images(self, value: bool) -> None: def set_allow_images(self, value: bool) -> None:
logger.info(f"Setting allow_images to {value}.")
self.allow_images = value self.allow_images = value
@property @property

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
import argparse
import subprocess
import time
from onyx.redis.redis_pool import get_redis_client
from onyx.utils.logger import setup_logger
logger = setup_logger()
MAX_AGE_SECONDS = 900 # how old the heartbeat can be
CHECK_INTERVAL = 60 # how often to check
MAX_LOOKUP_FAILURES = 5
def main(key: str, program: str, conf: str) -> None:
"""This script will restart the watchdog'd supervisord process via supervisorctl.
This process continually looks up a specific redis key. If it is missing for a
consecutive number of times and the last successful lookup is more
than a threshold time, the specified program will be restarted.
"""
logger.info(f"supervisord_watchdog starting: program={program} conf={conf}")
r = get_redis_client()
last_heartbeat = time.monotonic()
num_lookup_failures = 0
try:
while True:
time.sleep(CHECK_INTERVAL)
now = time.monotonic()
# check for the key ... handle any exception gracefully
try:
heartbeat = r.exists(key)
except Exception:
logger.exception(
f"Exception checking for celery beat heartbeat: key={key}."
)
continue
# happy path ... just continue
if heartbeat:
logger.debug(f"Key lookup succeeded: key={key}")
last_heartbeat = time.monotonic()
num_lookup_failures = 0
continue
# if we haven't exceeded the max lookup failures, continue
num_lookup_failures += 1
if num_lookup_failures <= MAX_LOOKUP_FAILURES:
logger.warning(
f"Key lookup failed: key={key} "
f"lookup_failures={num_lookup_failures} "
f"max_lookup_failures={MAX_LOOKUP_FAILURES}"
)
continue
# if we haven't exceeded the max missing key timeout threshold, continue
elapsed = now - last_heartbeat
if elapsed <= MAX_AGE_SECONDS:
logger.warning(
f"Key lookup failed: key={key} "
f"lookup_failures={num_lookup_failures} "
f"max_lookup_failures={MAX_LOOKUP_FAILURES} "
f"elapsed={elapsed:.2f} "
f"elapsed_threshold={MAX_AGE_SECONDS}"
)
continue
# all conditions have been exceeded ... restart the process
logger.warning(
f"Key lookup failure thresholds exceeded - restarting {program}: "
f"key={key} "
f"lookup_failures={num_lookup_failures} "
f"max_lookup_failures={MAX_LOOKUP_FAILURES} "
f"elapsed={elapsed:.2f} "
f"elapsed_threshold={MAX_AGE_SECONDS}"
)
subprocess.call(["supervisorctl", "-c", conf, "restart", program])
# reset state so that we properly delay until the next restart
# instead of continually restarting
num_lookup_failures = 0
last_heartbeat = time.monotonic()
except KeyboardInterrupt:
logger.info("Caught interrupt, exiting watchdog.")
logger.info("supervisord_watchdog exiting.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Supervisord Watchdog")
parser.add_argument("--key", help="The redis key to watch", required=True)
parser.add_argument(
"--program", help="The supervisord program to restart", required=True
)
parser.add_argument(
"--conf", type=str, help="Path to supervisord config file", required=True
)
args = parser.parse_args()
main(args.key, args.program, args.conf)

View File

@@ -5,7 +5,7 @@ asyncpg==0.27.0
atlassian-python-api==3.41.16 atlassian-python-api==3.41.16
beautifulsoup4==4.12.3 beautifulsoup4==4.12.3
boto3==1.36.23 boto3==1.36.23
celery==5.5.0b4 celery==5.5.1
chardet==5.2.0 chardet==5.2.0
dask==2023.8.1 dask==2023.8.1
ddtrace==2.6.5 ddtrace==2.6.5

View File

@@ -3,6 +3,18 @@ nodaemon=true
user=root user=root
logfile=/var/log/supervisord.log logfile=/var/log/supervisord.log
# region enable supervisorctl usage
[supervisorctl]
serverurl=unix:///tmp/supervisor.sock
[unix_http_server]
file=/tmp/supervisor.sock
chmod=0700
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
# endregion enable supervisorctl usage
# Background jobs that must be run async due to long time to completion # Background jobs that must be run async due to long time to completion
# NOTE: due to an issue with Celery + SQLAlchemy # NOTE: due to an issue with Celery + SQLAlchemy
# (https://github.com/celery/celery/issues/7007#issuecomment-1740139367) # (https://github.com/celery/celery/issues/7007#issuecomment-1740139367)
@@ -98,6 +110,20 @@ redirect_stderr=true
startsecs=10 startsecs=10
stopasgroup=true stopasgroup=true
# watchdog to detect and restart the beat in case of inactivity
# supervisord only restarts the process if it's dead
# make sure this key matches ONYX_CELERY_BEAT_HEARTBEAT_KEY
[program:supervisord_watchdog_celery_beat]
command=python onyx/utils/supervisord_watchdog.py
--conf /etc/supervisor/conf.d/supervisord.conf
--key "onyx:celery:beat:heartbeat"
--program celery_beat
stdout_logfile=/var/log/supervisord_watchdog_celery_beat.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
startsecs=10
stopasgroup=true
# Listens for Slack messages and responds with answers # Listens for Slack messages and responds with answers
# for all channels that the OnyxBot has been added to. # for all channels that the OnyxBot has been added to.
# If not setup, this will just fail 5 times and then stop. # If not setup, this will just fail 5 times and then stop.
@@ -123,6 +149,7 @@ command=tail -qF
/var/log/celery_worker_user_files_indexing.log /var/log/celery_worker_user_files_indexing.log
/var/log/celery_worker_monitoring.log /var/log/celery_worker_monitoring.log
/var/log/slack_bot.log /var/log/slack_bot.log
/var/log/supervisord_watchdog_celery_beat.log
stdout_logfile=/dev/stdout stdout_logfile=/dev/stdout
stdout_logfile_maxbytes = 0 # must be set to 0 when stdout_logfile=/dev/stdout stdout_logfile_maxbytes = 0 # must be set to 0 when stdout_logfile=/dev/stdout
autorestart=true autorestart=true