Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-06-05 04:31:03 +02:00
Indexing latency check fix (#3747)
* add logs + update dev script
* update config
* remove prints
* temporarily turn off
* va
* update
* fix
* finalize monitoring updates
* update
parent 1613a8ba4f
commit ccb16b7484
@@ -32,6 +32,7 @@ from onyx.db.models import DocumentSet
 from onyx.db.models import IndexAttempt
 from onyx.db.models import SyncRecord
 from onyx.db.models import UserGroup
+from onyx.db.search_settings import get_active_search_settings
 from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_pool import redis_lock_dump
 from onyx.utils.telemetry import optional_telemetry
@@ -184,6 +185,10 @@ def _build_connector_start_latency_metric(

     start_latency = (recent_attempt.time_started - desired_start_time).total_seconds()

+    task_logger.info(
+        f"Start latency for index attempt {recent_attempt.id}: {start_latency:.2f}s "
+        f"(desired: {desired_start_time}, actual: {recent_attempt.time_started})"
+    )
     return Metric(
         key=metric_key,
         name="connector_start_latency",
@@ -217,6 +222,9 @@ def _build_run_success_metrics(
             IndexingStatus.FAILED,
             IndexingStatus.CANCELED,
         ]:
+            task_logger.info(
+                f"Adding run success metric for index attempt {attempt.id} with status {attempt.status}"
+            )
             metrics.append(
                 Metric(
                     key=metric_key,
@@ -237,25 +245,29 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
     # Get all connector credential pairs
     cc_pairs = db_session.scalars(select(ConnectorCredentialPair)).all()

+    active_search_settings = get_active_search_settings(db_session)
     metrics = []
-    for cc_pair in cc_pairs:
-        # Get all attempts in the last hour
+    for cc_pair, search_settings in zip(cc_pairs, active_search_settings):
         recent_attempts = (
             db_session.query(IndexAttempt)
             .filter(
                 IndexAttempt.connector_credential_pair_id == cc_pair.id,
-                IndexAttempt.time_created >= one_hour_ago,
+                IndexAttempt.search_settings_id == search_settings.id,
             )
             .order_by(IndexAttempt.time_created.desc())
+            .limit(2)
             .all()
         )
-        most_recent_attempt = recent_attempts[0] if recent_attempts else None
+        if not recent_attempts:
+            continue
+
+        most_recent_attempt = recent_attempts[0]
         second_most_recent_attempt = (
             recent_attempts[1] if len(recent_attempts) > 1 else None
         )

         # if no metric to emit, skip
-        if most_recent_attempt is None:
+        if one_hour_ago > most_recent_attempt.time_created:
            continue

         # Connector start latency
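For readability, the per-pair lookup introduced above can be summarized as follows. This is a condensed sketch that assumes the session, models, and `one_hour_ago` setup from the surrounding file; the helper name `most_recent_attempts` is only illustrative.

# Condensed sketch of the lookup pattern above; model imports and the
# db_session setup are assumed from the surrounding file.
from sqlalchemy.orm import Session

from onyx.db.models import IndexAttempt


def most_recent_attempts(
    db_session: Session, cc_pair_id: int, search_settings_id: int
) -> list[IndexAttempt]:
    # Two rows are enough: the newest attempt drives the latency metrics and
    # the second-newest is only needed as the "previous" reference point.
    return (
        db_session.query(IndexAttempt)
        .filter(
            IndexAttempt.connector_credential_pair_id == cc_pair_id,
            IndexAttempt.search_settings_id == search_settings_id,
        )
        .order_by(IndexAttempt.time_created.desc())
        .limit(2)
        .all()
    )

Note that the recency filter moves out of SQL: instead of `IndexAttempt.time_created >= one_hour_ago`, the loop now skips a pair when `one_hour_ago > most_recent_attempt.time_created`, so the query itself stays scoped by connector pair and search settings.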
@@ -298,7 +310,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
             f"{sync_record.entity_id}:{sync_record.id}"
         )
         if _has_metric_been_emitted(redis_std, metric_key):
-            task_logger.debug(
+            task_logger.info(
                 f"Skipping metric for sync record {sync_record.id} "
                 "because it has already been emitted"
             )
@@ -318,11 +330,15 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]

             if sync_speed is None:
                 task_logger.error(
-                    "Something went wrong with sync speed calculation. "
-                    f"Sync record: {sync_record.id}"
+                    f"Something went wrong with sync speed calculation. "
+                    f"Sync record: {sync_record.id}, duration: {sync_duration_mins}, "
+                    f"docs synced: {sync_record.num_docs_synced}"
                 )
                 continue

+            task_logger.info(
+                f"Calculated sync speed for record {sync_record.id}: {sync_speed} docs/min"
+            )
             metrics.append(
                 Metric(
                     key=metric_key,
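The expanded error message implies the speed metric is documents synced per minute of sync duration. A minimal hypothetical sketch of that calculation follows; the actual helper is not part of this diff and may differ.

# Hypothetical sketch of the docs-per-minute calculation implied by the log
# fields above; the real helper in the file is not shown in this diff.
def calculate_sync_speed(num_docs_synced: int, sync_duration_mins: float) -> float | None:
    if sync_duration_mins <= 0:
        # Signals "something went wrong" and triggers the task_logger.error branch.
        return None
    return num_docs_synced / sync_duration_mins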
@@ -341,7 +357,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
             f":{sync_record.entity_id}:{sync_record.id}"
         )
         if _has_metric_been_emitted(redis_std, start_latency_key):
-            task_logger.debug(
+            task_logger.info(
                 f"Skipping start latency metric for sync record {sync_record.id} "
                 "because it has already been emitted"
             )
@@ -359,7 +375,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
             )
         else:
             # Skip other sync types
-            task_logger.debug(
+            task_logger.info(
                 f"Skipping sync record {sync_record.id} "
                 f"with type {sync_record.sync_type} "
                 f"and id {sync_record.entity_id} "
@@ -378,12 +394,15 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
             start_latency = (
                 sync_record.sync_start_time - entity.time_last_modified_by_user
             ).total_seconds()
+            task_logger.info(
+                f"Calculated start latency for sync record {sync_record.id}: {start_latency} seconds"
+            )
             if start_latency < 0:
                 task_logger.error(
                     f"Start latency is negative for sync record {sync_record.id} "
-                    f"with type {sync_record.sync_type} and id {sync_record.entity_id}."
-                    "This is likely because the entity was updated between the time the "
-                    "time the sync finished and this job ran. Skipping."
+                    f"with type {sync_record.sync_type} and id {sync_record.entity_id}. "
+                    f"Sync start time: {sync_record.sync_start_time}, "
+                    f"Entity last modified: {entity.time_last_modified_by_user}"
                 )
                 continue

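A worked example of the negative-latency case this branch guards against, using illustrative timestamps: if the entity is modified after the sync starts, the difference comes out negative and the record is logged with both timestamps and skipped.

# Illustrative values only; shows why start_latency can be negative.
from datetime import datetime, timezone

sync_start_time = datetime(2025, 1, 23, 12, 0, 0, tzinfo=timezone.utc)
time_last_modified_by_user = datetime(2025, 1, 23, 12, 5, 0, tzinfo=timezone.utc)

start_latency = (sync_start_time - time_last_modified_by_user).total_seconds()
print(start_latency)  # -300.0 -> hits the error branch above and is skipped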
|
@@ -35,12 +35,16 @@ class LongTermLogger:
     def _cleanup_old_files(self, category_path: Path) -> None:
         try:
             files = sorted(
-                [f for f in category_path.glob("*.json") if f.is_file()],
+                [f for f in category_path.glob("*.json")],
                 key=lambda x: x.stat().st_mtime,  # Sort by modification time
                 reverse=True,
             )

             # Delete oldest files that exceed the limit
             for file in files[self.max_files_per_category :]:
+                if not file.is_file():
+                    logger.debug(f"File already deleted: {file}")
+                    continue
                 try:
                     file.unlink()
                 except Exception as e:
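This change moves the existence check from the glob comprehension into the deletion loop, so a file removed by another process between listing and deletion is skipped at delete time rather than filtered out up front. A standalone sketch of the same pattern, with `cleanup_old_json_files` and `max_files` as simplified stand-ins for the class attributes:

# Standalone sketch of the cleanup pattern above; names are simplified and the
# class wiring, logger, and max_files_per_category come from the real file.
from pathlib import Path


def cleanup_old_json_files(category_path: Path, max_files: int) -> None:
    files = sorted(
        category_path.glob("*.json"),
        key=lambda p: p.stat().st_mtime,  # newest first
        reverse=True,
    )
    for file in files[max_files:]:
        if not file.is_file():
            # Another process may have deleted it between glob() and now.
            continue
        try:
            file.unlink()
        except Exception:
            # Best-effort cleanup: a single failure should not abort the rest.
            pass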
|
@@ -72,6 +72,19 @@ def run_jobs() -> None:
         "--queues=connector_indexing",
     ]

+    cmd_worker_monitoring = [
+        "celery",
+        "-A",
+        "onyx.background.celery.versioned_apps.monitoring",
+        "worker",
+        "--pool=threads",
+        "--concurrency=1",
+        "--prefetch-multiplier=1",
+        "--loglevel=INFO",
+        "--hostname=monitoring@%n",
+        "--queues=monitoring",
+    ]
+
     cmd_beat = [
         "celery",
         "-A",
@@ -97,6 +110,13 @@ def run_jobs() -> None:
         cmd_worker_indexing, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
     )

+    worker_monitoring_process = subprocess.Popen(
+        cmd_worker_monitoring,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+    )
+
     beat_process = subprocess.Popen(
         cmd_beat, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
     )
@@ -114,18 +134,23 @@ def run_jobs() -> None:
     worker_indexing_thread = threading.Thread(
         target=monitor_process, args=("INDEX", worker_indexing_process)
     )
+    worker_monitoring_thread = threading.Thread(
+        target=monitor_process, args=("MONITORING", worker_monitoring_process)
+    )
     beat_thread = threading.Thread(target=monitor_process, args=("BEAT", beat_process))

     worker_primary_thread.start()
     worker_light_thread.start()
     worker_heavy_thread.start()
     worker_indexing_thread.start()
+    worker_monitoring_thread.start()
     beat_thread.start()

     worker_primary_thread.join()
     worker_light_thread.join()
     worker_heavy_thread.join()
     worker_indexing_thread.join()
+    worker_monitoring_thread.join()
     beat_thread.join()

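The new monitoring worker is wired into the same `monitor_process` helper as the other workers in the dev script. Its body is not shown in this diff; a hypothetical sketch of what such a helper typically does, namely tag and relay the worker's combined output, might look like:

# Hypothetical illustration only: monitor_process is referenced but not shown
# in this diff, so the real implementation may differ.
import subprocess


def monitor_process(name: str, process: subprocess.Popen) -> None:
    assert process.stdout is not None  # Popen was created with stdout=PIPE
    for line in process.stdout:
        print(f"[{name}] {line.rstrip()}")
    process.wait()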
|