try using a redis replica in some areas (#3748)

* try using a redis replica in some areas

* harden up replica usage

* comment

* slow down cloud dispatch temporarily

* add ignored syncing list back

* raise multiplier to 8

* comment out per tenant code (no longer used by fanout)

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
commit d8a17a7238 (parent cbf98c0128)
Author: rkuo-danswer
Date: 2025-01-25 19:48:25 -08:00
Committed by: GitHub

8 changed files with 110 additions and 56 deletions

View File

@@ -1,6 +1,5 @@
 from datetime import timedelta
 from typing import Any
-from typing import cast
 from celery import Celery
 from celery import signals
@@ -8,7 +7,6 @@ from celery.beat import PersistentScheduler # type: ignore
 from celery.signals import beat_init
 import onyx.background.celery.apps.app_base as app_base
-from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
 from onyx.configs.constants import POSTGRES_CELERY_BEAT_APP_NAME
 from onyx.db.engine import get_all_tenant_ids
 from onyx.db.engine import SqlEngine
@@ -132,21 +130,25 @@ class DynamicTenantScheduler(PersistentScheduler):
         # get current schedule and extract current tenants
         current_schedule = self.schedule.items()
-        current_tenants = set()
-        for task_name, _ in current_schedule:
-            task_name = cast(str, task_name)
-            if task_name.startswith(ONYX_CLOUD_CELERY_TASK_PREFIX):
-                continue
-            if "_" in task_name:
-                # example: "check-for-condition-tenant_12345678-abcd-efgh-ijkl-12345678"
-                # -> "12345678-abcd-efgh-ijkl-12345678"
-                current_tenants.add(task_name.split("_")[-1])
-        logger.info(f"Found {len(current_tenants)} existing items in schedule")
-        for tenant_id in tenant_ids:
-            if tenant_id not in current_tenants:
-                logger.info(f"Processing new tenant: {tenant_id}")
+        # there are no more per tenant beat tasks, so comment this out
+        # NOTE: we may not actually need this scheduler any more and should
+        # test reverting to a regular beat schedule implementation
+        # current_tenants = set()
+        # for task_name, _ in current_schedule:
+        #     task_name = cast(str, task_name)
+        #     if task_name.startswith(ONYX_CLOUD_CELERY_TASK_PREFIX):
+        #         continue
+        #     if "_" in task_name:
+        #         # example: "check-for-condition-tenant_12345678-abcd-efgh-ijkl-12345678"
+        #         # -> "12345678-abcd-efgh-ijkl-12345678"
+        #         current_tenants.add(task_name.split("_")[-1])
+        # logger.info(f"Found {len(current_tenants)} existing items in schedule")
+        # for tenant_id in tenant_ids:
+        #     if tenant_id not in current_tenants:
+        #         logger.info(f"Processing new tenant: {tenant_id}")

         new_schedule = self._generate_schedule(tenant_ids)

View File

@@ -16,6 +16,10 @@ from shared_configs.configs import MULTI_TENANT
 # it's only important that they run relatively regularly
 BEAT_EXPIRES_DEFAULT = 15 * 60  # 15 minutes (in seconds)

+# hack to slow down task dispatch in the cloud until
+# we have a better implementation (backpressure, etc)
+CLOUD_BEAT_SCHEDULE_MULTIPLIER = 8
+
 # tasks that only run in the cloud
 # the name attribute must start with ONYX_CLOUD_CELERY_TASK_PREFIX = "cloud" to be filtered
 # by the DynamicTenantScheduler
@@ -24,7 +28,7 @@ cloud_tasks_to_schedule = [
     {
         "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-alembic",
         "task": OnyxCeleryTask.CLOUD_CHECK_ALEMBIC,
-        "schedule": timedelta(hours=1),
+        "schedule": timedelta(hours=1 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
         "options": {
             "queue": OnyxCeleryQueues.MONITORING,
             "priority": OnyxCeleryPriority.HIGH,
@@ -35,7 +39,7 @@ cloud_tasks_to_schedule = [
     {
         "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-indexing",
         "task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
-        "schedule": timedelta(seconds=15),
+        "schedule": timedelta(seconds=15 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
         "options": {
             "priority": OnyxCeleryPriority.HIGHEST,
             "expires": BEAT_EXPIRES_DEFAULT,
@@ -47,7 +51,7 @@ cloud_tasks_to_schedule = [
     {
         "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-connector-deletion",
         "task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
-        "schedule": timedelta(seconds=20),
+        "schedule": timedelta(seconds=20 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
         "options": {
             "priority": OnyxCeleryPriority.HIGHEST,
             "expires": BEAT_EXPIRES_DEFAULT,
@@ -59,7 +63,7 @@ cloud_tasks_to_schedule = [
     {
         "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-vespa-sync",
         "task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
-        "schedule": timedelta(seconds=20),
+        "schedule": timedelta(seconds=20 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
         "options": {
             "priority": OnyxCeleryPriority.HIGHEST,
             "expires": BEAT_EXPIRES_DEFAULT,
@@ -71,7 +75,7 @@ cloud_tasks_to_schedule = [
     {
         "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-prune",
         "task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
-        "schedule": timedelta(seconds=15),
+        "schedule": timedelta(seconds=15 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
         "options": {
             "priority": OnyxCeleryPriority.HIGHEST,
             "expires": BEAT_EXPIRES_DEFAULT,
@@ -83,7 +87,7 @@ cloud_tasks_to_schedule = [
     {
         "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_monitor-vespa-sync",
         "task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
-        "schedule": timedelta(seconds=5),
+        "schedule": timedelta(seconds=15 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
         "options": {
             "priority": OnyxCeleryPriority.HIGHEST,
             "expires": BEAT_EXPIRES_DEFAULT,
@@ -95,7 +99,7 @@ cloud_tasks_to_schedule = [
     {
         "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-doc-permissions-sync",
         "task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
-        "schedule": timedelta(seconds=30),
+        "schedule": timedelta(seconds=30 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
         "options": {
             "priority": OnyxCeleryPriority.HIGHEST,
             "expires": BEAT_EXPIRES_DEFAULT,
@@ -107,7 +111,7 @@ cloud_tasks_to_schedule = [
     {
         "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-external-group-sync",
         "task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
-        "schedule": timedelta(seconds=20),
+        "schedule": timedelta(seconds=20 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
         "options": {
             "priority": OnyxCeleryPriority.HIGHEST,
             "expires": BEAT_EXPIRES_DEFAULT,
@@ -119,7 +123,7 @@ cloud_tasks_to_schedule = [
     {
         "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_monitor-background-processes",
         "task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
-        "schedule": timedelta(minutes=5),
+        "schedule": timedelta(minutes=5 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
         "options": {
             "priority": OnyxCeleryPriority.HIGHEST,
             "expires": BEAT_EXPIRES_DEFAULT,
@@ -137,7 +141,9 @@ if LLM_MODEL_UPDATE_API_URL:
         {
             "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-llm-model-update",
             "task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
-            "schedule": timedelta(hours=1),  # Check every hour
+            "schedule": timedelta(
+                hours=1 * CLOUD_BEAT_SCHEDULE_MULTIPLIER
+            ),  # Check every hour
             "options": {
                 "priority": OnyxCeleryPriority.HIGHEST,
                 "expires": BEAT_EXPIRES_DEFAULT,

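To see what this multiplier does in practice, here is a small illustrative sketch (not part of the commit) that computes the effective dispatch intervals from the base values above; the task names assume ONYX_CLOUD_CELERY_TASK_PREFIX == "cloud", as the comments in this file state:

from datetime import timedelta

CLOUD_BEAT_SCHEDULE_MULTIPLIER = 8  # value introduced by this commit

# base intervals copied from cloud_tasks_to_schedule above
base_intervals = {
    "cloud_check-alembic": timedelta(hours=1),
    "cloud_check-for-indexing": timedelta(seconds=15),
    "cloud_check-for-connector-deletion": timedelta(seconds=20),
    "cloud_check-for-vespa-sync": timedelta(seconds=20),
    "cloud_check-for-prune": timedelta(seconds=15),
    "cloud_monitor-vespa-sync": timedelta(seconds=15),  # base raised from 5s to 15s in this commit
    "cloud_check-for-doc-permissions-sync": timedelta(seconds=30),
    "cloud_check-for-external-group-sync": timedelta(seconds=20),
    "cloud_monitor-background-processes": timedelta(minutes=5),
}

for name, base in base_intervals.items():
    # timedelta * int is equivalent to multiplying the unit inside timedelta()
    print(f"{name}: {base} -> {base * CLOUD_BEAT_SCHEDULE_MULTIPLIER}")

For example, check-for-indexing moves from every 15 seconds to every 2 minutes until a real backpressure mechanism lands.
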
View File

@@ -45,6 +45,7 @@ from onyx.natural_language_processing.search_nlp_models import EmbeddingModel
 from onyx.natural_language_processing.search_nlp_models import warm_up_bi_encoder
 from onyx.redis.redis_connector import RedisConnector
 from onyx.redis.redis_pool import get_redis_client
+from onyx.redis.redis_pool import get_redis_replica_client
 from onyx.redis.redis_pool import redis_lock_dump
 from onyx.utils.logger import setup_logger
 from onyx.utils.variable_functionality import global_version
@@ -69,6 +70,7 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
     tasks_created = 0
     locked = False
     redis_client = get_redis_client(tenant_id=tenant_id)
+    redis_client_replica = get_redis_replica_client(tenant_id=tenant_id)

     # we need to use celery's redis client to access its redis data
     # (which lives on a different db number)
@@ -227,7 +229,7 @@
         # or be currently executing
         try:
             validate_indexing_fences(
-                tenant_id, self.app, redis_client, redis_client_celery, lock_beat
+                tenant_id, redis_client_replica, redis_client_celery, lock_beat
             )
         except Exception:
             task_logger.exception("Exception while validating indexing fences")

View File

@@ -291,8 +291,7 @@ def validate_indexing_fence(
 def validate_indexing_fences(
     tenant_id: str | None,
-    celery_app: Celery,
-    r: Redis,
+    r_replica: Redis,
     r_celery: Redis,
     lock_beat: RedisLock,
 ) -> None:
@@ -301,7 +300,9 @@
     )

     # validate all existing indexing jobs
-    for key_bytes in r.scan_iter(
+    # Use replica for this because the worst thing that happens
+    # is that we don't run the validation on this pass
+    for key_bytes in r_replica.scan_iter(
         RedisConnectorIndex.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
     ):
         lock_beat.reacquire()

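The signature change means the fence-validation pass now reads only from the replica. A minimal standalone sketch of the idea (hypothetical names; in the repo the prefix and batch size come from RedisConnectorIndex.FENCE_PREFIX and SCAN_ITER_COUNT_DEFAULT):

from redis import Redis

FENCE_PREFIX = "connectorindexing_fence"  # assumed prefix, for illustration only
SCAN_COUNT = 4096  # assumed batch size


def validate_fences_from_replica(r_replica: Redis) -> None:
    # Scanning the replica keeps SCAN load off the primary. A fence created
    # on the primary but not yet replicated is simply missed on this pass
    # ("the worst thing that happens") and is validated on the next beat.
    for key_bytes in r_replica.scan_iter(FENCE_PREFIX + "*", count=SCAN_COUNT):
        fence_key = key_bytes.decode("utf-8")
        ...  # validate this fence against the primary / celery state as before
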
View File

@@ -33,6 +33,7 @@ from onyx.document_index.interfaces import VespaDocumentFields
 from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_pool import redis_lock_dump
 from onyx.server.documents.models import ConnectorCredentialPairIdentifier
+from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST

 DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES = 3
@@ -247,6 +248,10 @@ def cloud_beat_task_generator(
         lock_beat.reacquire()
         last_lock_time = current_time

+        # needed in the cloud
+        if IGNORED_SYNCING_TENANT_LIST and tenant_id in IGNORED_SYNCING_TENANT_LIST:
+            continue
+
         self.app.send_task(
             task_name,
             kwargs=dict(

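IGNORED_SYNCING_TENANT_LIST comes from shared_configs. The sketch below shows one plausible way such a list could be defined and how the new guard uses it; parsing from a comma-separated environment variable is an assumption here, not something this diff confirms:

import os

# Hypothetical parsing; see shared_configs/configs.py for the real definition.
_raw = os.environ.get("IGNORED_SYNCING_TENANT_LIST")
IGNORED_SYNCING_TENANT_LIST: list[str] | None = (
    [t.strip() for t in _raw.split(",") if t.strip()] if _raw else None
)


def should_skip_dispatch(tenant_id: str) -> bool:
    # Mirrors the guard added to cloud_beat_task_generator above.
    return bool(IGNORED_SYNCING_TENANT_LIST) and tenant_id in IGNORED_SYNCING_TENANT_LIST
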
View File

@@ -78,6 +78,7 @@ from onyx.redis.redis_connector_index import RedisConnectorIndex
 from onyx.redis.redis_connector_prune import RedisConnectorPrune
 from onyx.redis.redis_document_set import RedisDocumentSet
 from onyx.redis.redis_pool import get_redis_client
+from onyx.redis.redis_pool import get_redis_replica_client
 from onyx.redis.redis_pool import redis_lock_dump
 from onyx.redis.redis_pool import SCAN_ITER_COUNT_DEFAULT
 from onyx.redis.redis_usergroup import RedisUserGroup
@@ -895,6 +896,17 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool | None:
     r = get_redis_client(tenant_id=tenant_id)

+    # Replica usage notes
+    #
+    # False negatives are OK (aka we fail to see a key that exists on the master).
+    # We simply skip the monitoring work and it will be caught on the next pass.
+    #
+    # False positives are not OK, and are possible if we clear a fence on the master and
+    # then read from the replica. In this case, monitoring work could be done on a fence
+    # that no longer exists. To avoid this, we scan from the replica, but double check
+    # the result on the master.
+    r_replica = get_redis_replica_client(tenant_id=tenant_id)
+
     lock_beat: RedisLock = r.lock(
         OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK,
         timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
@@ -954,17 +966,19 @@
         # scan and monitor activity to completion
         phase_start = time.monotonic()
         lock_beat.reacquire()
-        if r.exists(RedisConnectorCredentialPair.get_fence_key()):
-            monitor_connector_taskset(r)
+        if r_replica.exists(RedisConnectorCredentialPair.get_fence_key()):
+            if r.exists(RedisConnectorCredentialPair.get_fence_key()):
+                monitor_connector_taskset(r)
         timings["connector"] = time.monotonic() - phase_start
         timings["connector_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)

         phase_start = time.monotonic()
         lock_beat.reacquire()
-        for key_bytes in r.scan_iter(
+        for key_bytes in r_replica.scan_iter(
             RedisConnectorDelete.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
         ):
-            monitor_connector_deletion_taskset(tenant_id, key_bytes, r)
+            if r.exists(key_bytes):
+                monitor_connector_deletion_taskset(tenant_id, key_bytes, r)
             lock_beat.reacquire()
         timings["connector_deletion"] = time.monotonic() - phase_start
@@ -974,66 +988,74 @@
         phase_start = time.monotonic()
         lock_beat.reacquire()
-        for key_bytes in r.scan_iter(
+        for key_bytes in r_replica.scan_iter(
             RedisDocumentSet.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
         ):
-            with get_session_with_tenant(tenant_id) as db_session:
-                monitor_document_set_taskset(tenant_id, key_bytes, r, db_session)
+            if r.exists(key_bytes):
+                with get_session_with_tenant(tenant_id) as db_session:
+                    monitor_document_set_taskset(tenant_id, key_bytes, r, db_session)
             lock_beat.reacquire()
         timings["documentset"] = time.monotonic() - phase_start
         timings["documentset_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)

         phase_start = time.monotonic()
         lock_beat.reacquire()
-        for key_bytes in r.scan_iter(
+        for key_bytes in r_replica.scan_iter(
             RedisUserGroup.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
         ):
-            monitor_usergroup_taskset = fetch_versioned_implementation_with_fallback(
-                "onyx.background.celery.tasks.vespa.tasks",
-                "monitor_usergroup_taskset",
-                noop_fallback,
-            )
-            with get_session_with_tenant(tenant_id) as db_session:
-                monitor_usergroup_taskset(tenant_id, key_bytes, r, db_session)
+            if r.exists(key_bytes):
+                monitor_usergroup_taskset = (
+                    fetch_versioned_implementation_with_fallback(
+                        "onyx.background.celery.tasks.vespa.tasks",
+                        "monitor_usergroup_taskset",
+                        noop_fallback,
+                    )
+                )
+                with get_session_with_tenant(tenant_id) as db_session:
+                    monitor_usergroup_taskset(tenant_id, key_bytes, r, db_session)
             lock_beat.reacquire()
         timings["usergroup"] = time.monotonic() - phase_start
         timings["usergroup_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)

         phase_start = time.monotonic()
         lock_beat.reacquire()
-        for key_bytes in r.scan_iter(
+        for key_bytes in r_replica.scan_iter(
             RedisConnectorPrune.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
         ):
-            with get_session_with_tenant(tenant_id) as db_session:
-                monitor_ccpair_pruning_taskset(tenant_id, key_bytes, r, db_session)
+            if r.exists(key_bytes):
+                with get_session_with_tenant(tenant_id) as db_session:
+                    monitor_ccpair_pruning_taskset(tenant_id, key_bytes, r, db_session)
             lock_beat.reacquire()
         timings["pruning"] = time.monotonic() - phase_start
         timings["pruning_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)

         phase_start = time.monotonic()
         lock_beat.reacquire()
-        for key_bytes in r.scan_iter(
+        for key_bytes in r_replica.scan_iter(
             RedisConnectorIndex.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
         ):
-            with get_session_with_tenant(tenant_id) as db_session:
-                monitor_ccpair_indexing_taskset(tenant_id, key_bytes, r, db_session)
+            if r.exists(key_bytes):
+                with get_session_with_tenant(tenant_id) as db_session:
+                    monitor_ccpair_indexing_taskset(tenant_id, key_bytes, r, db_session)
             lock_beat.reacquire()
         timings["indexing"] = time.monotonic() - phase_start
         timings["indexing_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)

         phase_start = time.monotonic()
         lock_beat.reacquire()
-        for key_bytes in r.scan_iter(
+        for key_bytes in r_replica.scan_iter(
             RedisConnectorPermissionSync.FENCE_PREFIX + "*",
             count=SCAN_ITER_COUNT_DEFAULT,
         ):
-            with get_session_with_tenant(tenant_id) as db_session:
-                monitor_ccpair_permissions_taskset(tenant_id, key_bytes, r, db_session)
+            if r.exists(key_bytes):
+                with get_session_with_tenant(tenant_id) as db_session:
+                    monitor_ccpair_permissions_taskset(
+                        tenant_id, key_bytes, r, db_session
+                    )
             lock_beat.reacquire()
         timings["permissions"] = time.monotonic() - phase_start
         timings["permissions_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)

     except SoftTimeLimitExceeded:
         task_logger.info(
             "Soft time limit exceeded, task is being terminated gracefully."

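The same scan-on-replica, confirm-on-master pattern repeats for every fence type above. Condensed into a hypothetical helper (not code in this commit), the idea looks like this:

from collections.abc import Iterator
from redis import Redis


def iter_live_fences(
    r_replica: Redis, r_primary: Redis, prefix: str, count: int = 4096
) -> Iterator[bytes]:
    """Yield fence keys seen on the replica that still exist on the primary.

    - False negative (key not yet replicated): the key is skipped this pass
      and picked up on the next beat. Harmless.
    - False positive (fence already cleared on the primary): filtered out by
      the EXISTS check against the primary, so no stale monitoring work runs.
    """
    for key_bytes in r_replica.scan_iter(prefix + "*", count=count):
        if r_primary.exists(key_bytes):
            yield key_bytes
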
View File

@@ -200,6 +200,8 @@ REDIS_HOST = os.environ.get("REDIS_HOST") or "localhost"
 REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
 REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD") or ""

+# this assumes that other redis settings remain the same as the primary
+REDIS_REPLICA_HOST = os.environ.get("REDIS_REPLICA_HOST") or REDIS_HOST
+
 REDIS_AUTH_KEY_PREFIX = "fastapi_users_token:"

View File

@@ -21,6 +21,7 @@ from onyx.configs.app_configs import REDIS_HOST
 from onyx.configs.app_configs import REDIS_PASSWORD
 from onyx.configs.app_configs import REDIS_POOL_MAX_CONNECTIONS
 from onyx.configs.app_configs import REDIS_PORT
+from onyx.configs.app_configs import REDIS_REPLICA_HOST
 from onyx.configs.app_configs import REDIS_SSL
 from onyx.configs.app_configs import REDIS_SSL_CA_CERTS
 from onyx.configs.app_configs import REDIS_SSL_CERT_REQS
@@ -132,23 +133,32 @@ class RedisPool:
     _instance: Optional["RedisPool"] = None
     _lock: threading.Lock = threading.Lock()
     _pool: redis.BlockingConnectionPool
+    _replica_pool: redis.BlockingConnectionPool

     def __new__(cls) -> "RedisPool":
         if not cls._instance:
             with cls._lock:
                 if not cls._instance:
                     cls._instance = super(RedisPool, cls).__new__(cls)
-                    cls._instance._init_pool()
+                    cls._instance._init_pools()
         return cls._instance

-    def _init_pool(self) -> None:
+    def _init_pools(self) -> None:
         self._pool = RedisPool.create_pool(ssl=REDIS_SSL)
+        self._replica_pool = RedisPool.create_pool(
+            host=REDIS_REPLICA_HOST, ssl=REDIS_SSL
+        )

     def get_client(self, tenant_id: str | None) -> Redis:
         if tenant_id is None:
             tenant_id = "public"
         return TenantRedis(tenant_id, connection_pool=self._pool)

+    def get_replica_client(self, tenant_id: str | None) -> Redis:
+        if tenant_id is None:
+            tenant_id = "public"
+        return TenantRedis(tenant_id, connection_pool=self._replica_pool)
+
     @staticmethod
     def create_pool(
         host: str = REDIS_HOST,
@@ -212,6 +222,10 @@ def get_redis_client(*, tenant_id: str | None) -> Redis:
     return redis_pool.get_client(tenant_id)


+def get_redis_replica_client(*, tenant_id: str | None) -> Redis:
+    return redis_pool.get_replica_client(tenant_id)
+
+
 SSL_CERT_REQS_MAP = {
     "none": ssl.CERT_NONE,
     "optional": ssl.CERT_OPTIONAL,