fresh indexing feature branch (#2790)

* fresh indexing feature branch

* cherry pick test

* Revert "cherry pick test"

This reverts commit 2a62422068.

* set multitenant so that vespa fields match when indexing

* cleanup pass

* mypy

* pass through env var to control celery indexing concurrency

* comments on task kickoff and some logging improvements

* use get_session_with_tenant

* comment out all of update.py

* rename to RedisConnectorIndexingFenceData

* first check num_indexing_workers

* refactor RedisConnectorIndexingFenceData

* comment out on_worker_process_init

* fix where num_indexing_workers falls back

* remove extra brace
This commit is contained in:
rkuo-danswer
2024-10-18 15:40:05 -07:00
committed by GitHub
parent 12cbbe6cee
commit 6913efef90
29 changed files with 1679 additions and 765 deletions

View File

@@ -1,5 +1,4 @@
import argparse
import os
import subprocess
import threading
@@ -17,7 +16,7 @@ def monitor_process(process_name: str, process: subprocess.Popen) -> None:
break
def run_jobs(exclude_indexing: bool) -> None:
def run_jobs() -> None:
# command setup
cmd_worker_primary = [
"celery",
@@ -26,6 +25,7 @@ def run_jobs(exclude_indexing: bool) -> None:
"worker",
"--pool=threads",
"--concurrency=6",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"-n",
"primary@%n",
@@ -40,6 +40,7 @@ def run_jobs(exclude_indexing: bool) -> None:
"worker",
"--pool=threads",
"--concurrency=16",
"--prefetch-multiplier=8",
"--loglevel=INFO",
"-n",
"light@%n",
@@ -54,6 +55,7 @@ def run_jobs(exclude_indexing: bool) -> None:
"worker",
"--pool=threads",
"--concurrency=6",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"-n",
"heavy@%n",
@@ -61,6 +63,20 @@ def run_jobs(exclude_indexing: bool) -> None:
"connector_pruning",
]
cmd_worker_indexing = [
"celery",
"-A",
"ee.danswer.background.celery.celery_app",
"worker",
"--pool=threads",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"-n",
"indexing@%n",
"--queues=connector_indexing",
]
cmd_beat = [
"celery",
"-A",
@@ -82,6 +98,10 @@ def run_jobs(exclude_indexing: bool) -> None:
cmd_worker_heavy, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
worker_indexing_process = subprocess.Popen(
cmd_worker_indexing, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
beat_process = subprocess.Popen(
cmd_beat, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
@@ -96,44 +116,26 @@ def run_jobs(exclude_indexing: bool) -> None:
worker_heavy_thread = threading.Thread(
target=monitor_process, args=("HEAVY", worker_heavy_process)
)
worker_indexing_thread = threading.Thread(
target=monitor_process, args=("INDEX", worker_indexing_process)
)
beat_thread = threading.Thread(target=monitor_process, args=("BEAT", beat_process))
worker_primary_thread.start()
worker_light_thread.start()
worker_heavy_thread.start()
worker_indexing_thread.start()
beat_thread.start()
if not exclude_indexing:
update_env = os.environ.copy()
update_env["PYTHONPATH"] = "."
cmd_indexing = ["python", "danswer/background/update.py"]
indexing_process = subprocess.Popen(
cmd_indexing,
env=update_env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)
indexing_thread = threading.Thread(
target=monitor_process, args=("INDEXING", indexing_process)
)
indexing_thread.start()
indexing_thread.join()
worker_primary_thread.join()
worker_light_thread.join()
worker_heavy_thread.join()
worker_indexing_thread.join()
beat_thread.join()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run background jobs.")
parser.add_argument(
"--no-indexing", action="store_true", help="Do not run indexing process"
)
args = parser.parse_args()
run_jobs(args.no_indexing)
run_jobs()