danswer/backend/supervisord.conf
Chris Weaver f1fc8ac19b
Connector checkpointing (#3876)
* wip checkpointing/continue on failure

more stuff for checkpointing

Basic implementation

FE stuff

More checkpointing/failure handling

rebase

rebase

initial scaffolding for IT

IT to test checkpointing

Cleanup

cleanup

Fix it

Rebase

Add todo

Fix actions IT

Test more

Pagination + fixes + cleanup

Fix IT networking

fix it

* rebase

* Address misc comments

* Address comments

* Remove unused router

* rebase

* Fix mypy

* Fixes

* fix it

* Fix tests

* Add drop index

* Add retries

* reset lock timeout

* Try hard drop of schema

* Add timeout/retries to downgrade

* rebase

* test

* test

* test

* Close all connections

* test closing idle only

* Fix it

* fix

* try using null pool

* Test

* fix

* rebase

* log

* Fix

* apply null pool

* Fix other test

* Fix quality checks

* Test not using the fixture

* Fix ordering

* fix test

* Change pooling behavior
2025-02-16 02:34:39 +00:00

115 lines
3.8 KiB
Plaintext

[supervisord]
nodaemon=true
user=root
logfile=/var/log/supervisord.log
# Background jobs that must be run async due to long time to completion
# NOTE: due to an issue with Celery + SQLAlchemy
# (https://github.com/celery/celery/issues/7007#issuecomment-1740139367)
# we must use the threads pool instead of the default prefork pool for now
# in order to avoid intermittent errors like:
# `billiard.exceptions.WorkerLostError: Worker exited prematurely: signal 11 (SIGSEGV)`.
#
# This means workers will not be able take advantage of multiple CPU cores
# on a system, but this should be okay for now since all our celery tasks are
# relatively compute-light (e.g. they tend to just make a bunch of requests to
# Vespa / Postgres)
[program:celery_worker_primary]
command=celery -A onyx.background.celery.versioned_apps.primary worker
--loglevel=INFO
--hostname=primary@%%n
-Q celery
stdout_logfile=/var/log/celery_worker_primary.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
autorestart=true
startsecs=10
stopasgroup=true
# NOTE: only allowing configuration here and not in the other celery workers,
# since this is often the bottleneck for "sync" jobs (e.g. document set syncing,
# user group syncing, deletion, etc.)
[program:celery_worker_light]
command=celery -A onyx.background.celery.versioned_apps.light worker
--loglevel=INFO
--hostname=light@%%n
-Q vespa_metadata_sync,connector_deletion,doc_permissions_upsert,checkpoint_cleanup
stdout_logfile=/var/log/celery_worker_light.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
autorestart=true
startsecs=10
stopasgroup=true
[program:celery_worker_heavy]
command=celery -A onyx.background.celery.versioned_apps.heavy worker
--loglevel=INFO
--hostname=heavy@%%n
-Q connector_pruning,connector_doc_permissions_sync,connector_external_group_sync
stdout_logfile=/var/log/celery_worker_heavy.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
autorestart=true
startsecs=10
stopasgroup=true
[program:celery_worker_indexing]
command=celery -A onyx.background.celery.versioned_apps.indexing worker
--loglevel=INFO
--hostname=indexing@%%n
-Q connector_indexing
stdout_logfile=/var/log/celery_worker_indexing.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
autorestart=true
startsecs=10
stopasgroup=true
[program:celery_worker_monitoring]
command=celery -A onyx.background.celery.versioned_apps.monitoring worker
--loglevel=INFO
--hostname=monitoring@%%n
-Q monitoring
stdout_logfile=/var/log/celery_worker_monitoring.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
autorestart=true
startsecs=10
stopasgroup=true
# Job scheduler for periodic tasks
[program:celery_beat]
command=celery -A onyx.background.celery.versioned_apps.beat beat
stdout_logfile=/var/log/celery_beat.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
startsecs=10
stopasgroup=true
# Listens for Slack messages and responds with answers
# for all channels that the OnyxBot has been added to.
# If not setup, this will just fail 5 times and then stop.
# More details on setup here: https://docs.onyx.app/slack_bot_setup
[program:slack_bot]
command=python onyx/onyxbot/slack/listener.py
stdout_logfile=/var/log/slack_bot.log
stdout_logfile_maxbytes=16MB
redirect_stderr=true
autorestart=true
startretries=5
startsecs=60
# Pushes all logs from the above programs to stdout
# No log rotation here, since it's stdout it's handled by the Docker container logging
[program:log-redirect-handler]
command=tail -qF
/var/log/celery_beat.log
/var/log/celery_worker_primary.log
/var/log/celery_worker_light.log
/var/log/celery_worker_heavy.log
/var/log/celery_worker_indexing.log
/var/log/slack_bot.log
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes = 0 # must be set to 0 when stdout_logfile=/dev/stdout
autorestart=true