mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-12 21:00:07 +02:00
Bugfix/harden activity timeout (#4545)
* add some hardening
* add info memory logging
* fix last_observed
* remove log spam
* properly cache last activity details
* default values

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
This commit is contained in:
parent
a8a5a82251
commit
fa80842afe
@ -108,14 +108,19 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
|||||||
r = get_redis_client(tenant_id=POSTGRES_DEFAULT_SCHEMA)
|
r = get_redis_client(tenant_id=POSTGRES_DEFAULT_SCHEMA)
|
||||||
|
|
||||||
# Log the role and slave count - being connected to a slave or slave count > 0 could be problematic
|
# Log the role and slave count - being connected to a slave or slave count > 0 could be problematic
|
||||||
info: dict[str, Any] = cast(dict, r.info("replication"))
|
replication_info: dict[str, Any] = cast(dict, r.info("replication"))
|
||||||
role: str = cast(str, info.get("role"))
|
role: str = cast(str, replication_info.get("role", ""))
|
||||||
connected_slaves: int = info.get("connected_slaves", 0)
|
connected_slaves: int = replication_info.get("connected_slaves", 0)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Redis INFO REPLICATION: role={role} connected_slaves={connected_slaves}"
|
f"Redis INFO REPLICATION: role={role} connected_slaves={connected_slaves}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
memory_info: dict[str, Any] = cast(dict, r.info("memory"))
|
||||||
|
maxmemory_policy: str = cast(str, memory_info.get("maxmemory_policy", ""))
|
||||||
|
|
||||||
|
logger.info(f"Redis INFO MEMORY: maxmemory_policy={maxmemory_policy}")
|
||||||
|
|
||||||
# For the moment, we're assuming that we are the only primary worker
|
# For the moment, we're assuming that we are the only primary worker
|
||||||
# that should be running.
|
# that should be running.
|
||||||
# TODO: maybe check for or clean up another zombie primary worker if we detect it
|
# TODO: maybe check for or clean up another zombie primary worker if we detect it
|
||||||
|
@ -1061,6 +1061,10 @@ def connector_indexing_proxy_task(
|
|||||||
# Track the last time memory info was emitted
|
# Track the last time memory info was emitted
|
||||||
last_memory_emit_time = 0.0
|
last_memory_emit_time = 0.0
|
||||||
|
|
||||||
|
# track the last ttl and the time it was observed
|
||||||
|
last_activity_ttl_observed: float = time.monotonic()
|
||||||
|
last_activity_ttl: int = 0
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with get_session_with_current_tenant() as db_session:
|
with get_session_with_current_tenant() as db_session:
|
||||||
index_attempt = get_index_attempt(
|
index_attempt = get_index_attempt(
|
||||||
@ -1074,11 +1078,15 @@ def connector_indexing_proxy_task(
|
|||||||
)
|
)
|
||||||
|
|
||||||
redis_connector_index.set_active() # renew active signal
|
redis_connector_index.set_active() # renew active signal
|
||||||
redis_connector_index.set_connector_active() # prime the connective active signal
|
|
||||||
|
# prime the connector active signal (renewed inside the connector)
|
||||||
|
redis_connector_index.set_connector_active()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
sleep(5)
|
sleep(5)
|
||||||
|
|
||||||
|
now = time.monotonic()
|
||||||
|
|
||||||
# renew watchdog signal (this has a shorter timeout than set_active)
|
# renew watchdog signal (this has a shorter timeout than set_active)
|
||||||
redis_connector_index.set_watchdog(True)
|
redis_connector_index.set_watchdog(True)
|
||||||
|
|
||||||
@ -1128,18 +1136,37 @@ def connector_indexing_proxy_task(
|
|||||||
break
|
break
|
||||||
|
|
||||||
# if activity timeout is detected, break (exit point will clean up)
|
# if activity timeout is detected, break (exit point will clean up)
|
||||||
if not redis_connector_index.connector_active():
|
ttl = redis_connector_index.connector_active_ttl()
|
||||||
task_logger.warning(
|
if ttl < 0:
|
||||||
log_builder.build(
|
# verify expectations around ttl
|
||||||
"Indexing watchdog - activity timeout exceeded",
|
last_observed = last_activity_ttl_observed - now
|
||||||
timeout=f"{CELERY_INDEXING_WATCHDOG_CONNECTOR_TIMEOUT}s",
|
if now > last_activity_ttl_observed + last_activity_ttl:
|
||||||
|
task_logger.warning(
|
||||||
|
log_builder.build(
|
||||||
|
"Indexing watchdog - activity timeout exceeded",
|
||||||
|
last_observed=f"{last_observed:.2f}s",
|
||||||
|
last_ttl=f"{last_activity_ttl}",
|
||||||
|
timeout=f"{CELERY_INDEXING_WATCHDOG_CONNECTOR_TIMEOUT}s",
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
result.status = (
|
result.status = (
|
||||||
IndexingWatchdogTerminalStatus.TERMINATED_BY_ACTIVITY_TIMEOUT
|
IndexingWatchdogTerminalStatus.TERMINATED_BY_ACTIVITY_TIMEOUT
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
|
else:
|
||||||
|
task_logger.warning(
|
||||||
|
log_builder.build(
|
||||||
|
"Indexing watchdog - activity timeout expired unexpectedly, "
|
||||||
|
"waiting for last observed TTL before exiting",
|
||||||
|
last_observed=f"{last_observed:.2f}s",
|
||||||
|
last_ttl=f"{last_activity_ttl}",
|
||||||
|
timeout=f"{CELERY_INDEXING_WATCHDOG_CONNECTOR_TIMEOUT}s",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
last_activity_ttl_observed = now
|
||||||
|
last_activity_ttl = ttl
|
||||||
|
|
||||||
# if the spawned task is still running, restart the check once again
|
# if the spawned task is still running, restart the check once again
|
||||||
# if the index attempt is not in a finished status
|
# if the index attempt is not in a finished status
|
||||||
|
@ -255,7 +255,6 @@ def default_msg_filter(message: MessageType) -> bool:
|
|||||||
# Don't keep messages from bots
|
# Don't keep messages from bots
|
||||||
if message.get("bot_id") or message.get("app_id"):
|
if message.get("bot_id") or message.get("app_id"):
|
||||||
bot_profile_name = message.get("bot_profile", {}).get("name")
|
bot_profile_name = message.get("bot_profile", {}).get("name")
|
||||||
print(f"bot_profile_name: {bot_profile_name}")
|
|
||||||
if bot_profile_name == "DanswerBot Testing":
|
if bot_profile_name == "DanswerBot Testing":
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
@ -165,6 +165,16 @@ class RedisConnectorIndex:
|
|||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def connector_active_ttl(self) -> int:
|
||||||
|
"""Refer to https://redis.io/docs/latest/commands/ttl/
|
||||||
|
|
||||||
|
-2 means the key does not exist
|
||||||
|
-1 means the key exists but has no associated expire
|
||||||
|
Otherwise, returns the actual TTL of the key
|
||||||
|
"""
|
||||||
|
ttl = cast(int, self.redis.ttl(self.connector_active_key))
|
||||||
|
return ttl
|
||||||
|
|
||||||
def generator_locked(self) -> bool:
|
def generator_locked(self) -> bool:
|
||||||
return bool(self.redis.exists(self.generator_lock_key))
|
return bool(self.redis.exists(self.generator_lock_key))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user