Feature/celery beat watchdog (#4534)
* upgrade celery to release version
* make the watchdog script more reusable
* use constant
* code review
* catch interrupt

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
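The overall mechanism: a beat-scheduled task refreshes a Redis key with a TTL once a minute, and a watchdog process restarts celery beat through supervisorctl when that key has been missing for too long. The snippet below is a minimal sketch of that pattern, not the PR's code; the key name, program name, intervals, and the bare redis.Redis() connection are placeholders for illustration.

# Minimal sketch of the heartbeat + watchdog pattern (placeholder names and values).
import subprocess
import time

import redis

HEARTBEAT_KEY = "demo:beat:heartbeat"  # placeholder; the real key is a constant in the diffs below
HEARTBEAT_TTL = 600  # seconds; the key disappears if the writer stops refreshing it
MAX_AGE_SECONDS = 900  # how long the watchdog tolerates a missing key

r = redis.Redis()  # placeholder connection; Onyx uses its own Redis client helper


def write_heartbeat() -> None:
    # What the beat-scheduled task does: refresh the key with a TTL.
    r.set(HEARTBEAT_KEY, 1, ex=HEARTBEAT_TTL)


def watchdog_step(last_seen: float) -> float:
    # One watchdog iteration: restart the watched program if the key is gone too long.
    if r.exists(HEARTBEAT_KEY):
        return time.monotonic()  # healthy; remember when we last saw the key

    if time.monotonic() - last_seen > MAX_AGE_SECONDS:
        # beat has been silent past the threshold; ask supervisord to restart it
        subprocess.call(["supervisorctl", "restart", "celery_beat"])
        return time.monotonic()  # reset so we do not restart in a tight loop

    return last_seen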
@@ -152,7 +152,10 @@ class DynamicTenantScheduler(PersistentScheduler):
         current_schedule = self.schedule.items()

         # get potential new state
-        beat_multiplier = OnyxRuntime.get_beat_multiplier()
+        try:
+            beat_multiplier = OnyxRuntime.get_beat_multiplier()
+        except Exception:
+            beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT

         new_schedule = self._generate_schedule(tenant_ids, beat_multiplier)

@@ -226,6 +226,16 @@ if not MULTI_TENANT:
                 "queue": OnyxCeleryQueues.MONITORING,
             },
         },
+        {
+            "name": "celery-beat-heartbeat",
+            "task": OnyxCeleryTask.CELERY_BEAT_HEARTBEAT,
+            "schedule": timedelta(minutes=1),
+            "options": {
+                "priority": OnyxCeleryPriority.HIGHEST,
+                "expires": BEAT_EXPIRES_DEFAULT,
+                "queue": OnyxCeleryQueues.PRIMARY,
+            },
+        },
     ]
 )

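For orientation, an entry like the one added above is a template that Onyx's scheduler expands into a normal Celery beat entry. A rough equivalent in plain Celery configuration is sketched below, assuming a Celery app object named app; the literal values are stand-ins for Onyx's constants, and Onyx's scheduler also injects kwargs such as tenant_id when it expands templates.

from datetime import timedelta

from celery import Celery

app = Celery("example")  # placeholder app; Onyx constructs its own

# Roughly what the template above amounts to in Celery's native beat_schedule form.
app.conf.beat_schedule = {
    "celery-beat-heartbeat": {
        "task": "celery_beat_heartbeat",  # stand-in for OnyxCeleryTask.CELERY_BEAT_HEARTBEAT
        "schedule": timedelta(minutes=1),  # run once per minute
        "options": {
            "priority": 0,  # stand-in for OnyxCeleryPriority.HIGHEST
            "expires": 60,  # stand-in for BEAT_EXPIRES_DEFAULT
            "queue": "celery",  # stand-in for OnyxCeleryQueues.PRIMARY
        },
    },
}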
@@ -6,6 +6,7 @@ import httpx
 from celery import shared_task
 from celery import Task
 from celery.exceptions import SoftTimeLimitExceeded
+from redis import Redis
 from redis.lock import Lock as RedisLock
 from tenacity import RetryError

@@ -15,6 +16,7 @@ from onyx.background.celery.apps.app_base import task_logger
 from onyx.background.celery.tasks.beat_schedule import BEAT_EXPIRES_DEFAULT
 from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
 from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
+from onyx.configs.constants import ONYX_CELERY_BEAT_HEARTBEAT_KEY
 from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
 from onyx.configs.constants import OnyxCeleryPriority
 from onyx.configs.constants import OnyxCeleryTask
@@ -353,3 +355,16 @@ def cloud_beat_task_generator(
         f"elapsed={time_elapsed:.2f}"
     )
     return True
+
+
+@shared_task(name=OnyxCeleryTask.CELERY_BEAT_HEARTBEAT, ignore_result=True, bind=True)
+def celery_beat_heartbeat(self: Task, *, tenant_id: str) -> None:
+    """When this task runs, it writes a key to Redis with a TTL.
+
+    An external observer can check this key to figure out if the celery beat is still running.
+    """
+    time_start = time.monotonic()
+    r: Redis = get_redis_client()
+    r.set(ONYX_CELERY_BEAT_HEARTBEAT_KEY, 1, ex=600)
+    time_elapsed = time.monotonic() - time_start
+    task_logger.info(f"celery_beat_heartbeat finished: " f"elapsed={time_elapsed:.2f}")
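As the docstring notes, any external observer can infer beat liveness from this key. A minimal check is sketched below, assuming the same imports the PR itself adds (r.exists and r.ttl are standard redis-py calls); it is an illustration, not part of the change.

from onyx.configs.constants import ONYX_CELERY_BEAT_HEARTBEAT_KEY
from onyx.redis.redis_pool import get_redis_client

r = get_redis_client()

# The heartbeat task refreshes the key every minute with a 600s TTL, so a
# missing key means beat has not written it for at least ~10 minutes.
if r.exists(ONYX_CELERY_BEAT_HEARTBEAT_KEY):
    remaining = r.ttl(ONYX_CELERY_BEAT_HEARTBEAT_KEY)  # seconds until the key would expire
    print(f"celery beat appears alive (heartbeat expires in {remaining}s)")
else:
    print("celery beat heartbeat missing - beat may be down")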
@@ -425,6 +425,7 @@ class OnyxCeleryTask:
     MONITOR_BACKGROUND_PROCESSES = "monitor_background_processes"
     MONITOR_CELERY_QUEUES = "monitor_celery_queues"
     MONITOR_PROCESS_MEMORY = "monitor_process_memory"
+    CELERY_BEAT_HEARTBEAT = "celery_beat_heartbeat"

     KOMBU_MESSAGE_CLEANUP_TASK = "kombu_message_cleanup_task"
     CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK = (
@@ -444,6 +445,9 @@ class OnyxCeleryTask:
     AUTOGENERATE_USAGE_REPORT_TASK = "autogenerate_usage_report_task"


+# this needs to correspond to the matching entry in supervisord
+ONYX_CELERY_BEAT_HEARTBEAT_KEY = "onyx:celery:beat:heartbeat"
+
 REDIS_SOCKET_KEEPALIVE_OPTIONS = {}
 REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPINTVL] = 15
 REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPCNT] = 3

@@ -160,6 +160,7 @@ class ConfluenceConnector(
         }

     def set_allow_images(self, value: bool) -> None:
+        logger.info(f"Setting allow_images to {value}.")
         self.allow_images = value

     @property
backend/onyx/utils/supervisord_watchdog.py (new file, 109 lines)
@@ -0,0 +1,109 @@
#!/usr/bin/env python3

import argparse
import subprocess
import time

from onyx.redis.redis_pool import get_redis_client
from onyx.utils.logger import setup_logger


logger = setup_logger()

MAX_AGE_SECONDS = 900  # how old the heartbeat can be
CHECK_INTERVAL = 60  # how often to check
MAX_LOOKUP_FAILURES = 5


def main(key: str, program: str, conf: str) -> None:
    """This script will restart the watchdog'd supervisord process via supervisorctl.

    This process continually looks up a specific redis key. If it is missing for a
    consecutive number of times and the last successful lookup is more
    than a threshold time, the specified program will be restarted.
    """
    logger.info(f"supervisord_watchdog starting: program={program} conf={conf}")

    r = get_redis_client()

    last_heartbeat = time.monotonic()
    num_lookup_failures = 0

    try:
        while True:
            time.sleep(CHECK_INTERVAL)

            now = time.monotonic()

            # check for the key ... handle any exception gracefully
            try:
                heartbeat = r.exists(key)
            except Exception:
                logger.exception(
                    f"Exception checking for celery beat heartbeat: key={key}."
                )
                continue

            # happy path ... just continue
            if heartbeat:
                logger.debug(f"Key lookup succeeded: key={key}")
                last_heartbeat = time.monotonic()
                num_lookup_failures = 0
                continue

            # if we haven't exceeded the max lookup failures, continue
            num_lookup_failures += 1
            if num_lookup_failures <= MAX_LOOKUP_FAILURES:
                logger.warning(
                    f"Key lookup failed: key={key} "
                    f"lookup_failures={num_lookup_failures} "
                    f"max_lookup_failures={MAX_LOOKUP_FAILURES}"
                )
                continue

            # if we haven't exceeded the max missing key timeout threshold, continue
            elapsed = now - last_heartbeat
            if elapsed <= MAX_AGE_SECONDS:
                logger.warning(
                    f"Key lookup failed: key={key} "
                    f"lookup_failures={num_lookup_failures} "
                    f"max_lookup_failures={MAX_LOOKUP_FAILURES} "
                    f"elapsed={elapsed:.2f} "
                    f"elapsed_threshold={MAX_AGE_SECONDS}"
                )
                continue

            # all conditions have been exceeded ... restart the process
            logger.warning(
                f"Key lookup failure thresholds exceeded - restarting {program}: "
                f"key={key} "
                f"lookup_failures={num_lookup_failures} "
                f"max_lookup_failures={MAX_LOOKUP_FAILURES} "
                f"elapsed={elapsed:.2f} "
                f"elapsed_threshold={MAX_AGE_SECONDS}"
            )

            subprocess.call(["supervisorctl", "-c", conf, "restart", program])

            # reset state so that we properly delay until the next restart
            # instead of continually restarting
            num_lookup_failures = 0
            last_heartbeat = time.monotonic()
    except KeyboardInterrupt:
        logger.info("Caught interrupt, exiting watchdog.")

    logger.info("supervisord_watchdog exiting.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Supervisord Watchdog")
    parser.add_argument("--key", help="The redis key to watch", required=True)
    parser.add_argument(
        "--program", help="The supervisord program to restart", required=True
    )
    parser.add_argument(
        "--conf", type=str, help="Path to supervisord config file", required=True
    )
    args = parser.parse_args()

    main(args.key, args.program, args.conf)
@@ -5,7 +5,7 @@ asyncpg==0.27.0
 atlassian-python-api==3.41.16
 beautifulsoup4==4.12.3
 boto3==1.36.23
-celery==5.5.0b4
+celery==5.5.1
 chardet==5.2.0
 dask==2023.8.1
 ddtrace==2.6.5

@@ -3,6 +3,18 @@ nodaemon=true
 user=root
 logfile=/var/log/supervisord.log

+# region enable supervisorctl usage
+[supervisorctl]
+serverurl=unix:///tmp/supervisor.sock
+
+[unix_http_server]
+file=/tmp/supervisor.sock
+chmod=0700
+
+[rpcinterface:supervisor]
+supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
+# endregion enable supervisorctl usage
+
 # Background jobs that must be run async due to long time to completion
 # NOTE: due to an issue with Celery + SQLAlchemy
 # (https://github.com/celery/celery/issues/7007#issuecomment-1740139367)
@@ -98,6 +110,20 @@ redirect_stderr=true
 startsecs=10
 stopasgroup=true

+# watchdog to detect and restart the beat in case of inactivity
+# supervisord only restarts the process if it's dead
+# make sure this key matches ONYX_CELERY_BEAT_HEARTBEAT_KEY
+[program:supervisord_watchdog_celery_beat]
+command=python onyx/utils/supervisord_watchdog.py
+    --conf /etc/supervisor/conf.d/supervisord.conf
+    --key "onyx:celery:beat:heartbeat"
+    --program celery_beat
+stdout_logfile=/var/log/supervisord_watchdog_celery_beat.log
+stdout_logfile_maxbytes=16MB
+redirect_stderr=true
+startsecs=10
+stopasgroup=true
+
 # Listens for Slack messages and responds with answers
 # for all channels that the OnyxBot has been added to.
 # If not setup, this will just fail 5 times and then stop.
@@ -123,6 +149,7 @@ command=tail -qF
   /var/log/celery_worker_user_files_indexing.log
   /var/log/celery_worker_monitoring.log
   /var/log/slack_bot.log
+  /var/log/supervisord_watchdog_celery_beat.log
 stdout_logfile=/dev/stdout
 stdout_logfile_maxbytes = 0 # must be set to 0 when stdout_logfile=/dev/stdout
 autorestart=true