Indexing latency check fix (#3747)

* add logs + update dev script

* update config

* remove prints

* temporarily turn off

* va

* update

* fix

* finalize monitoring updates

* update
pablonyx 2025-01-23 09:14:26 -08:00 committed by GitHub
parent 1613a8ba4f
commit ccb16b7484
3 changed files with 63 additions and 15 deletions
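
The diff below adds logging around the connector start-latency and sync-speed metrics, scopes recent index attempts per search settings, and wires a dedicated Celery monitoring worker into the dev script. For reference, a minimal, self-contained sketch of the start-latency calculation the new log line reports; the helper name below is made up for illustration, and the real logic lives in _build_connector_start_latency_metric:

from datetime import datetime, timedelta, timezone


def start_latency_seconds(time_started: datetime, desired_start_time: datetime) -> float:
    # Latency = when the index attempt actually started minus when it was scheduled to start.
    return (time_started - desired_start_time).total_seconds()


desired = datetime(2025, 1, 23, 9, 0, tzinfo=timezone.utc)
started = desired + timedelta(seconds=42)
print(f"Start latency: {start_latency_seconds(started, desired):.2f}s")  # Start latency: 42.00s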


@@ -32,6 +32,7 @@ from onyx.db.models import DocumentSet
 from onyx.db.models import IndexAttempt
 from onyx.db.models import SyncRecord
 from onyx.db.models import UserGroup
+from onyx.db.search_settings import get_active_search_settings
 from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_pool import redis_lock_dump
 from onyx.utils.telemetry import optional_telemetry
@@ -184,6 +185,10 @@ def _build_connector_start_latency_metric(
     start_latency = (recent_attempt.time_started - desired_start_time).total_seconds()
 
+    task_logger.info(
+        f"Start latency for index attempt {recent_attempt.id}: {start_latency:.2f}s "
+        f"(desired: {desired_start_time}, actual: {recent_attempt.time_started})"
+    )
 
     return Metric(
         key=metric_key,
         name="connector_start_latency",
@@ -217,6 +222,9 @@ def _build_run_success_metrics(
             IndexingStatus.FAILED,
             IndexingStatus.CANCELED,
         ]:
+            task_logger.info(
+                f"Adding run success metric for index attempt {attempt.id} with status {attempt.status}"
+            )
             metrics.append(
                 Metric(
                     key=metric_key,
@@ -237,25 +245,29 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
     # Get all connector credential pairs
     cc_pairs = db_session.scalars(select(ConnectorCredentialPair)).all()
+    active_search_settings = get_active_search_settings(db_session)
+
     metrics = []
 
-    for cc_pair in cc_pairs:
-        # Get all attempts in the last hour
+    for cc_pair, search_settings in zip(cc_pairs, active_search_settings):
         recent_attempts = (
             db_session.query(IndexAttempt)
             .filter(
                 IndexAttempt.connector_credential_pair_id == cc_pair.id,
-                IndexAttempt.time_created >= one_hour_ago,
+                IndexAttempt.search_settings_id == search_settings.id,
             )
             .order_by(IndexAttempt.time_created.desc())
+            .limit(2)
             .all()
         )
-        most_recent_attempt = recent_attempts[0] if recent_attempts else None
+
+        if not recent_attempts:
+            continue
+
+        most_recent_attempt = recent_attempts[0]
         second_most_recent_attempt = (
             recent_attempts[1] if len(recent_attempts) > 1 else None
         )
 
-        # if no metric to emit, skip
-        if most_recent_attempt is None:
+        if one_hour_ago > most_recent_attempt.time_created:
             continue
 
         # Connector start latency
@@ -298,7 +310,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
             f"{sync_record.entity_id}:{sync_record.id}"
         )
         if _has_metric_been_emitted(redis_std, metric_key):
-            task_logger.debug(
+            task_logger.info(
                 f"Skipping metric for sync record {sync_record.id} "
                 "because it has already been emitted"
             )
@ -318,11 +330,15 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
if sync_speed is None: if sync_speed is None:
task_logger.error( task_logger.error(
"Something went wrong with sync speed calculation. " f"Something went wrong with sync speed calculation. "
f"Sync record: {sync_record.id}" f"Sync record: {sync_record.id}, duration: {sync_duration_mins}, "
f"docs synced: {sync_record.num_docs_synced}"
) )
continue continue
task_logger.info(
f"Calculated sync speed for record {sync_record.id}: {sync_speed} docs/min"
)
metrics.append( metrics.append(
Metric( Metric(
key=metric_key, key=metric_key,
@@ -341,7 +357,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
             f":{sync_record.entity_id}:{sync_record.id}"
         )
         if _has_metric_been_emitted(redis_std, start_latency_key):
-            task_logger.debug(
+            task_logger.info(
                 f"Skipping start latency metric for sync record {sync_record.id} "
                 "because it has already been emitted"
             )
@@ -359,7 +375,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
             )
         else:
             # Skip other sync types
-            task_logger.debug(
+            task_logger.info(
                 f"Skipping sync record {sync_record.id} "
                 f"with type {sync_record.sync_type} "
                 f"and id {sync_record.entity_id} "
@@ -378,12 +394,15 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
         start_latency = (
             sync_record.sync_start_time - entity.time_last_modified_by_user
         ).total_seconds()
+        task_logger.info(
+            f"Calculated start latency for sync record {sync_record.id}: {start_latency} seconds"
+        )
         if start_latency < 0:
             task_logger.error(
                 f"Start latency is negative for sync record {sync_record.id} "
-                f"with type {sync_record.sync_type} and id {sync_record.entity_id}."
-                "This is likely because the entity was updated between the time the "
-                "time the sync finished and this job ran. Skipping."
+                f"with type {sync_record.sync_type} and id {sync_record.entity_id}. "
+                f"Sync start time: {sync_record.sync_start_time}, "
+                f"Entity last modified: {entity.time_last_modified_by_user}"
             )
             continue

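For context on the sync-speed log lines above, a rough sketch of a docs-per-minute calculation consistent with the enriched error message and the new info line; compute_sync_speed is a hypothetical stand-in, not the project's actual helper:

from typing import Optional


def compute_sync_speed(num_docs_synced: int, sync_duration_mins: float) -> Optional[float]:
    # Hypothetical sketch: docs synced per minute, or None when the duration is
    # unusable, which is the case the enriched error log now reports in detail.
    if sync_duration_mins <= 0:
        return None
    return num_docs_synced / sync_duration_mins


print(compute_sync_speed(1200, 10))  # 120.0
print(compute_sync_speed(1200, 0))   # None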

@@ -35,12 +35,16 @@ class LongTermLogger:
     def _cleanup_old_files(self, category_path: Path) -> None:
         try:
             files = sorted(
-                [f for f in category_path.glob("*.json") if f.is_file()],
+                [f for f in category_path.glob("*.json")],
                 key=lambda x: x.stat().st_mtime,  # Sort by modification time
                 reverse=True,
             )
 
             # Delete oldest files that exceed the limit
             for file in files[self.max_files_per_category :]:
+                if not file.is_file():
+                    logger.debug(f"File already deleted: {file}")
+                    continue
+
                 try:
                     file.unlink()
                 except Exception as e:

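The LongTermLogger change drops the is_file() filter from the initial glob and instead guards each deletion, so a file that disappears between listing and deletion is skipped instead of raising. A standalone sketch of that cleanup pattern, assuming a made-up MAX_FILES limit in place of max_files_per_category:

import logging
from pathlib import Path

logger = logging.getLogger(__name__)

MAX_FILES = 100  # assumed limit, standing in for max_files_per_category


def cleanup_old_json(category_path: Path, max_files: int = MAX_FILES) -> None:
    # Newest first, so everything beyond max_files is a deletion candidate.
    files = sorted(
        category_path.glob("*.json"),
        key=lambda p: p.stat().st_mtime,
        reverse=True,
    )
    for file in files[max_files:]:
        if not file.is_file():
            logger.debug(f"File already deleted: {file}")
            continue
        try:
            file.unlink()
        except OSError as e:
            logger.warning(f"Failed to delete {file}: {e}")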

@@ -72,6 +72,19 @@ def run_jobs() -> None:
         "--queues=connector_indexing",
     ]
 
+    cmd_worker_monitoring = [
+        "celery",
+        "-A",
+        "onyx.background.celery.versioned_apps.monitoring",
+        "worker",
+        "--pool=threads",
+        "--concurrency=1",
+        "--prefetch-multiplier=1",
+        "--loglevel=INFO",
+        "--hostname=monitoring@%n",
+        "--queues=monitoring",
+    ]
+
     cmd_beat = [
         "celery",
         "-A",
@@ -97,6 +110,13 @@ def run_jobs() -> None:
         cmd_worker_indexing, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
     )
 
+    worker_monitoring_process = subprocess.Popen(
+        cmd_worker_monitoring,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+    )
+
     beat_process = subprocess.Popen(
         cmd_beat, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
     )
@@ -114,18 +134,23 @@ def run_jobs() -> None:
     worker_indexing_thread = threading.Thread(
         target=monitor_process, args=("INDEX", worker_indexing_process)
     )
+    worker_monitoring_thread = threading.Thread(
+        target=monitor_process, args=("MONITORING", worker_monitoring_process)
+    )
     beat_thread = threading.Thread(target=monitor_process, args=("BEAT", beat_process))
 
     worker_primary_thread.start()
     worker_light_thread.start()
     worker_heavy_thread.start()
     worker_indexing_thread.start()
+    worker_monitoring_thread.start()
     beat_thread.start()
 
     worker_primary_thread.join()
     worker_light_thread.join()
     worker_heavy_thread.join()
     worker_indexing_thread.join()
+    worker_monitoring_thread.join()
     beat_thread.join()
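
The dev script launches the new monitoring worker the same way as the existing workers: spawn the Celery process, then stream its output from a watcher thread. A trimmed, runnable sketch of that pattern using the command from the diff; monitor_process here is a simplified stand-in for the script's helper of the same name:

import subprocess
import threading


def monitor_process(name: str, process: subprocess.Popen) -> None:
    # Simplified stand-in: stream the worker's combined stdout/stderr with a prefix.
    assert process.stdout is not None
    for line in process.stdout:
        print(f"{name}: {line}", end="")


cmd_worker_monitoring = [
    "celery",
    "-A",
    "onyx.background.celery.versioned_apps.monitoring",
    "worker",
    "--pool=threads",
    "--concurrency=1",
    "--prefetch-multiplier=1",
    "--loglevel=INFO",
    "--hostname=monitoring@%n",
    "--queues=monitoring",
]

worker_monitoring_process = subprocess.Popen(
    cmd_worker_monitoring, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
worker_monitoring_thread = threading.Thread(
    target=monitor_process, args=("MONITORING", worker_monitoring_process)
)
worker_monitoring_thread.start()
worker_monitoring_thread.join()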