Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-07-01 10:10:49 +02:00)
Feature/celery multi (#2470)
* first cut at redis
* some new helper functions for the db
* ignore kombu tables in alembic migrations (used by celery)
* multiline commands for readability, add vespa_metadata_sync queue to worker
* typo fix
* fix returning tuple fields
* add constants
* fix _get_access_for_document
* docstrings!
* fix double function declaration and typing
* fix type hinting
* add a global redis pool
* add get_document function
* use task_logger in various celery tasks
* add celeryconfig.py to simplify configuration; will be used in a subsequent commit
* add celery redis helper; used in a subsequent PR
* silence kombu warning, which was getting spammy since celery is no longer self-managing its queue in Postgres
* add last_modified and last_synced to documents
* fix task naming convention
* use celeryconfig.py
* the big one: adds queues and tasks, updates functions to use the queues with priorities, etc.
* change vespa index log line to debug
* mypy fixes
* update alembic migration
* fix fence ordering, rename to "monitor", fix fetch_versioned_implementation call
* mypy
* switch to monotonic time
* fix startup dependencies on redis
* rebase alembic migration
* kombu cleanup - fail silently
* mypy
* add redis_host environment override
* update REDIS_HOST env var in docker-compose.dev.yml
* update the rest of the docker files
* in flight
* harden indexing-status endpoint against db changes happening in the background; needs further improvement but OK for now
* allow no task syncs to run, because we create certain objects with no entries but initially marked as out of date
* add back writing to vespa on indexing
* actually working connector deletion
* update contributing guide
* backport fixes from background_deletion
* rename cache to cache_volume
* add redis password to various deployments
* try setting up PR testing for helm
* fix indent
* hopefully this release version actually exists
* fix command line option to --chart-dirs
* fetch-depth 0
* edit values.yaml
* try setting ct working directory
* bypass testing only on change for now
* move files and lint them
* update helm testing
* some issues suggest using --config works
* add vespa repo
* add postgresql repo
* increase timeout
* try amd64 runner
* fix redis password reference
* add comment to helm chart testing workflow
* rename helm testing workflow to disable it
* add clarifying comments
* address code review
* missed a file
* remove commented warning ... just not needed
* fix imports
* refactor to use update_single
* mypy fixes
* add vespa test
* multiple celery workers
* update logs as well, and set prefetch multipliers appropriate to the worker intent
* add db refresh to connector deletion
* add some preliminary locking
* organize tasks into separate files
* celery auto-associates tasks created inside another task, which bloats the result metadata considerably; trail=False prevents this
* code review fixes
* move monitor_usergroup_taskset to ee, improve logging
* add multi workers to dev_run_background_jobs.py
* update supervisord with some recommended settings for celery
* name celery workers and shorten dev script prefixing
* add configurable sqlalchemy engine settings on startup (needed for various intents like the API server and different celery workers and tasks)
* fix comments
* autoscale sqlalchemy pool size to celery concurrency (allow override later?)
* supervisord needs the percent symbols escaped
* use name as the primary check; some minor refactoring and type hinting too
* address code review
* fix import
* fix prune_documents_task references

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
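Several bullets above (celeryconfig.py, queues with priorities, per-worker prefetch multipliers, the REDIS_HOST override) describe configuration that is not shown in the diff below. As a minimal sketch of what such a celeryconfig.py could look like, using only celery/kombu settings that actually exist: the broker URL details and all task names except prune_documents_task (which the commit message mentions) are illustrative assumptions, not the PR's actual contents.

    # celeryconfig.py -- illustrative sketch only, not the file from this PR
    import os

    # the commit adds a REDIS_HOST environment override; port and db index are assumed here
    REDIS_HOST = os.environ.get("REDIS_HOST", "localhost")
    broker_url = f"redis://{REDIS_HOST}:6379/0"
    result_backend = f"redis://{REDIS_HOST}:6379/0"

    # Redis has no native message priorities; kombu emulates them with multiple lists
    broker_transport_options = {"priority_steps": list(range(10))}

    # tune prefetch to the worker's intent: keep it low for long-running work
    # so a single thread cannot hoard queued tasks
    worker_prefetch_multiplier = 1

    # route tasks to the queues the workers consume (task names are assumptions,
    # except prune_documents_task, which the commit message references)
    task_routes = {
        "prune_documents_task": {"queue": "connector_pruning"},
        "vespa_metadata_sync_task": {"queue": "vespa_metadata_sync"},
        "connector_deletion_task": {"queue": "connector_deletion"},
    }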
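The "autoscale sqlalchemy pool size to celery concurrency" bullet also lends itself to a short sketch. The helper name and signature below are hypothetical; only the idea (pool size derived from the worker's concurrency) comes from the commit message.

    # hypothetical helper illustrating pool autoscaling; not the PR's actual code
    from sqlalchemy import create_engine
    from sqlalchemy.engine import Engine

    def build_engine_for_worker(db_url: str, celery_concurrency: int) -> Engine:
        # size the pool to the number of worker threads so each in-flight task
        # can hold a connection without queueing on the pool
        return create_engine(
            db_url,
            pool_size=celery_concurrency,
            max_overflow=0,  # fail fast rather than over-allocating connections
        )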
dev_run_background_jobs.py

@@ -18,7 +18,8 @@ def monitor_process(process_name: str, process: subprocess.Popen) -> None:
 def run_jobs(exclude_indexing: bool) -> None:
-    cmd_worker = [
+    # command setup
+    cmd_worker_primary = [
         "celery",
         "-A",
         "ee.danswer.background.celery.celery_app",
@@ -26,8 +27,38 @@ def run_jobs(exclude_indexing: bool) -> None:
         "--pool=threads",
         "--concurrency=6",
         "--loglevel=INFO",
+        "-n",
+        "primary@%n",
         "-Q",
-        "celery,vespa_metadata_sync,connector_deletion",
+        "celery",
     ]
+
+    cmd_worker_light = [
+        "celery",
+        "-A",
+        "ee.danswer.background.celery.celery_app",
+        "worker",
+        "--pool=threads",
+        "--concurrency=16",
+        "--loglevel=INFO",
+        "-n",
+        "light@%n",
+        "-Q",
+        "vespa_metadata_sync,connector_deletion",
+    ]
+
+    cmd_worker_heavy = [
+        "celery",
+        "-A",
+        "ee.danswer.background.celery.celery_app",
+        "worker",
+        "--pool=threads",
+        "--concurrency=6",
+        "--loglevel=INFO",
+        "-n",
+        "heavy@%n",
+        "-Q",
+        "connector_pruning",
+    ]
 
     cmd_beat = [
@@ -38,19 +69,38 @@ def run_jobs(exclude_indexing: bool) -> None:
         "--loglevel=INFO",
     ]
 
-    worker_process = subprocess.Popen(
-        cmd_worker, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
+    # spawn processes
+    worker_primary_process = subprocess.Popen(
+        cmd_worker_primary, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
     )
+
+    worker_light_process = subprocess.Popen(
+        cmd_worker_light, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
+    )
+
+    worker_heavy_process = subprocess.Popen(
+        cmd_worker_heavy, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
+    )
 
     beat_process = subprocess.Popen(
         cmd_beat, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
     )
 
-    worker_thread = threading.Thread(
-        target=monitor_process, args=("WORKER", worker_process)
+    # monitor threads
+    worker_primary_thread = threading.Thread(
+        target=monitor_process, args=("PRIMARY", worker_primary_process)
     )
+    worker_light_thread = threading.Thread(
+        target=monitor_process, args=("LIGHT", worker_light_process)
+    )
+    worker_heavy_thread = threading.Thread(
+        target=monitor_process, args=("HEAVY", worker_heavy_process)
+    )
     beat_thread = threading.Thread(target=monitor_process, args=("BEAT", beat_process))
 
-    worker_thread.start()
+    worker_primary_thread.start()
+    worker_light_thread.start()
+    worker_heavy_thread.start()
     beat_thread.start()
 
     if not exclude_indexing:
@@ -93,7 +143,9 @@ def run_jobs(exclude_indexing: bool) -> None:
         except Exception:
             pass
 
-    worker_thread.join()
+    worker_primary_thread.join()
+    worker_light_thread.join()
+    worker_heavy_thread.join()
     beat_thread.join()
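For context, monitor_process (whose signature appears in the first hunk header) streams a child process's combined output under a short name prefix; because each worker is spawned with stderr merged into stdout and text=True, reading stdout line by line captures everything. A sketch consistent with that signature, though not necessarily the repo's exact body:

    import subprocess

    def monitor_process(process_name: str, process: subprocess.Popen) -> None:
        # stdout is a text stream here (text=True), so we can iterate lines directly
        assert process.stdout is not None
        for line in process.stdout:
            print(f"{process_name}: {line}", end="")
        process.wait()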