Sync status improvements (#3782)

* minor improvments / clarity

* additional comment for clarity

* typing

* quick updates to monitoring

* connector deletion

* quick nit

* fix typing

* update values

* quick nit

* functioning

* improvements to monitoring

* update

* minutes -> seconds
This commit is contained in:
pablonyx
2025-01-26 09:35:26 -08:00
committed by GitHub
parent d8a17a7238
commit 70795a4047
6 changed files with 355 additions and 164 deletions

View File

@@ -227,7 +227,7 @@ if not MULTI_TENANT:
{ {
"name": "monitor-background-processes", "name": "monitor-background-processes",
"task": OnyxCeleryTask.MONITOR_BACKGROUND_PROCESSES, "task": OnyxCeleryTask.MONITOR_BACKGROUND_PROCESSES,
"schedule": timedelta(minutes=5), "schedule": timedelta(minutes=15),
"options": { "options": {
"priority": OnyxCeleryPriority.LOW, "priority": OnyxCeleryPriority.LOW,
"expires": BEAT_EXPIRES_DEFAULT, "expires": BEAT_EXPIRES_DEFAULT,

View File

@@ -139,13 +139,6 @@ def try_generate_document_cc_pair_cleanup_tasks(
submitted=datetime.now(timezone.utc), submitted=datetime.now(timezone.utc),
) )
# create before setting fence to avoid race condition where the monitoring
# task updates the sync record before it is created
insert_sync_record(
db_session=db_session,
entity_id=cc_pair_id,
sync_type=SyncType.CONNECTOR_DELETION,
)
redis_connector.delete.set_fence(fence_payload) redis_connector.delete.set_fence(fence_payload)
try: try:
@@ -184,6 +177,13 @@ def try_generate_document_cc_pair_cleanup_tasks(
) )
if tasks_generated is None: if tasks_generated is None:
raise ValueError("RedisConnectorDeletion.generate_tasks returned None") raise ValueError("RedisConnectorDeletion.generate_tasks returned None")
insert_sync_record(
db_session=db_session,
entity_id=cc_pair_id,
sync_type=SyncType.CONNECTOR_DELETION,
)
except TaskDependencyError: except TaskDependencyError:
redis_connector.delete.set_fence(None) redis_connector.delete.set_fence(None)
raise raise

View File

@@ -4,6 +4,7 @@ from collections.abc import Callable
from datetime import timedelta from datetime import timedelta
from itertools import islice from itertools import islice
from typing import Any from typing import Any
from typing import Literal
from celery import shared_task from celery import shared_task
from celery import Task from celery import Task
@@ -26,6 +27,7 @@ from onyx.db.engine import get_all_tenant_ids
from onyx.db.engine import get_db_current_time from onyx.db.engine import get_db_current_time
from onyx.db.engine import get_session_with_tenant from onyx.db.engine import get_session_with_tenant
from onyx.db.enums import IndexingStatus from onyx.db.enums import IndexingStatus
from onyx.db.enums import SyncStatus
from onyx.db.enums import SyncType from onyx.db.enums import SyncType
from onyx.db.models import ConnectorCredentialPair from onyx.db.models import ConnectorCredentialPair
from onyx.db.models import DocumentSet from onyx.db.models import DocumentSet
@@ -38,6 +40,7 @@ from onyx.redis.redis_pool import redis_lock_dump
from onyx.utils.telemetry import optional_telemetry from onyx.utils.telemetry import optional_telemetry
from onyx.utils.telemetry import RecordType from onyx.utils.telemetry import RecordType
_MONITORING_SOFT_TIME_LIMIT = 60 * 5 # 5 minutes _MONITORING_SOFT_TIME_LIMIT = 60 * 5 # 5 minutes
_MONITORING_TIME_LIMIT = _MONITORING_SOFT_TIME_LIMIT + 60 # 6 minutes _MONITORING_TIME_LIMIT = _MONITORING_SOFT_TIME_LIMIT + 60 # 6 minutes
@@ -49,6 +52,12 @@ _CONNECTOR_INDEX_ATTEMPT_RUN_SUCCESS_KEY_FMT = (
"monitoring_connector_index_attempt_run_success:{cc_pair_id}:{index_attempt_id}" "monitoring_connector_index_attempt_run_success:{cc_pair_id}:{index_attempt_id}"
) )
_FINAL_METRIC_KEY_FMT = "sync_final_metrics:{sync_type}:{entity_id}:{sync_record_id}"
_SYNC_START_LATENCY_KEY_FMT = (
"sync_start_latency:{sync_type}:{entity_id}:{sync_record_id}"
)
def _mark_metric_as_emitted(redis_std: Redis, key: str) -> None: def _mark_metric_as_emitted(redis_std: Redis, key: str) -> None:
"""Mark a metric as having been emitted by setting a Redis key with expiration""" """Mark a metric as having been emitted by setting a Redis key with expiration"""
@@ -111,6 +120,7 @@ class Metric(BaseModel):
}.items() }.items()
if v is not None if v is not None
} }
task_logger.info(f"Emitting metric: {data}")
optional_telemetry( optional_telemetry(
record_type=RecordType.METRIC, record_type=RecordType.METRIC,
data=data, data=data,
@@ -189,66 +199,129 @@ def _build_connector_start_latency_metric(
f"Start latency for index attempt {recent_attempt.id}: {start_latency:.2f}s " f"Start latency for index attempt {recent_attempt.id}: {start_latency:.2f}s "
f"(desired: {desired_start_time}, actual: {recent_attempt.time_started})" f"(desired: {desired_start_time}, actual: {recent_attempt.time_started})"
) )
job_id = build_job_id("connector", str(cc_pair.id), str(recent_attempt.id))
return Metric( return Metric(
key=metric_key, key=metric_key,
name="connector_start_latency", name="connector_start_latency",
value=start_latency, value=start_latency,
tags={}, tags={
"job_id": job_id,
"connector_id": str(cc_pair.connector.id),
"source": str(cc_pair.connector.source),
},
) )
def _build_run_success_metrics( def _build_connector_final_metrics(
cc_pair: ConnectorCredentialPair, cc_pair: ConnectorCredentialPair,
recent_attempts: list[IndexAttempt], recent_attempts: list[IndexAttempt],
redis_std: Redis, redis_std: Redis,
) -> list[Metric]: ) -> list[Metric]:
"""
Final metrics for connector index attempts:
- Boolean success/fail metric
- If success, emit:
* duration (seconds)
* doc_count
"""
metrics = [] metrics = []
for attempt in recent_attempts: for attempt in recent_attempts:
metric_key = _CONNECTOR_INDEX_ATTEMPT_RUN_SUCCESS_KEY_FMT.format( metric_key = _CONNECTOR_INDEX_ATTEMPT_RUN_SUCCESS_KEY_FMT.format(
cc_pair_id=cc_pair.id, cc_pair_id=cc_pair.id,
index_attempt_id=attempt.id, index_attempt_id=attempt.id,
) )
if _has_metric_been_emitted(redis_std, metric_key): if _has_metric_been_emitted(redis_std, metric_key):
task_logger.info( task_logger.info(
f"Skipping metric for connector {cc_pair.connector.id} " f"Skipping final metrics for connector {cc_pair.connector.id} "
f"index attempt {attempt.id} because it has already been " f"index attempt {attempt.id}, already emitted."
"emitted"
) )
continue continue
if attempt.status in [ # We only emit final metrics if the attempt is in a terminal state
if attempt.status not in [
IndexingStatus.SUCCESS, IndexingStatus.SUCCESS,
IndexingStatus.FAILED, IndexingStatus.FAILED,
IndexingStatus.CANCELED, IndexingStatus.CANCELED,
]: ]:
task_logger.info( # Not finished; skip
f"Adding run success metric for index attempt {attempt.id} with status {attempt.status}" continue
)
job_id = build_job_id("connector", str(cc_pair.id), str(attempt.id))
success = attempt.status == IndexingStatus.SUCCESS
metrics.append( metrics.append(
Metric( Metric(
key=metric_key, key=metric_key, # We'll mark the same key for any final metrics
name="connector_run_succeeded", name="connector_run_succeeded",
value=attempt.status == IndexingStatus.SUCCESS, value=success,
tags={"source": str(cc_pair.connector.source)}, tags={
"job_id": job_id,
"connector_id": str(cc_pair.connector.id),
"source": str(cc_pair.connector.source),
"status": attempt.status.value,
},
) )
) )
if success:
# Make sure we have valid time_started
if attempt.time_started and attempt.time_updated:
duration_seconds = (
attempt.time_updated - attempt.time_started
).total_seconds()
metrics.append(
Metric(
key=None, # No need for a new key, or you can reuse the same if you prefer
name="connector_index_duration_seconds",
value=duration_seconds,
tags={
"job_id": job_id,
"connector_id": str(cc_pair.connector.id),
"source": str(cc_pair.connector.source),
},
)
)
else:
task_logger.error(
f"Index attempt {attempt.id} succeeded but has missing time "
f"(time_started={attempt.time_started}, time_updated={attempt.time_updated})."
)
# For doc counts, choose whichever field is more relevant
doc_count = attempt.total_docs_indexed or 0
metrics.append(
Metric(
key=None,
name="connector_index_doc_count",
value=doc_count,
tags={
"job_id": job_id,
"connector_id": str(cc_pair.connector.id),
"source": str(cc_pair.connector.source),
},
)
)
_mark_metric_as_emitted(redis_std, metric_key)
return metrics return metrics
def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Metric]: def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
"""Collect metrics about connector runs from the past hour""" """Collect metrics about connector runs from the past hour"""
# NOTE: use get_db_current_time since the IndexAttempt times are set based on DB time
one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1) one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)
# Get all connector credential pairs # Get all connector credential pairs
cc_pairs = db_session.scalars(select(ConnectorCredentialPair)).all() cc_pairs = db_session.scalars(select(ConnectorCredentialPair)).all()
# Might be more than one search setting, or just one
active_search_settings = get_active_search_settings(db_session) active_search_settings = get_active_search_settings(db_session)
metrics = [] metrics = []
for cc_pair, search_settings in zip(cc_pairs, active_search_settings): # If you want to process each cc_pair against each search setting:
for cc_pair in cc_pairs:
for search_settings in active_search_settings:
recent_attempts = ( recent_attempts = (
db_session.query(IndexAttempt) db_session.query(IndexAttempt)
.filter( .filter(
@@ -259,6 +332,7 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Me
.limit(2) .limit(2)
.all() .all()
) )
if not recent_attempts: if not recent_attempts:
continue continue
@@ -274,95 +348,150 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Me
start_latency_metric = _build_connector_start_latency_metric( start_latency_metric = _build_connector_start_latency_metric(
cc_pair, most_recent_attempt, second_most_recent_attempt, redis_std cc_pair, most_recent_attempt, second_most_recent_attempt, redis_std
) )
if start_latency_metric: if start_latency_metric:
metrics.append(start_latency_metric) metrics.append(start_latency_metric)
# Connector run success/failure # Connector run success/failure
run_success_metrics = _build_run_success_metrics( final_metrics = _build_connector_final_metrics(
cc_pair, recent_attempts, redis_std cc_pair, recent_attempts, redis_std
) )
metrics.extend(run_success_metrics) metrics.extend(final_metrics)
return metrics return metrics
def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]: def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
"""Collect metrics about document set and group syncing speed""" """
# NOTE: use get_db_current_time since the SyncRecord times are set based on DB time Collect metrics for document set and group syncing:
- Success/failure status
- Start latency (always)
- Duration & doc count (only if success)
- Throughput (docs/min) (only if success)
"""
one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1) one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)
# Get all sync records from the last hour # Get all sync records that ended in the last hour
recent_sync_records = db_session.scalars( recent_sync_records = db_session.scalars(
select(SyncRecord) select(SyncRecord)
.where(SyncRecord.sync_start_time >= one_hour_ago) .where(SyncRecord.sync_end_time.isnot(None))
.order_by(SyncRecord.sync_start_time.desc()) .where(SyncRecord.sync_end_time >= one_hour_ago)
.order_by(SyncRecord.sync_end_time.desc())
).all() ).all()
task_logger.info(
f"Collecting sync metrics for {len(recent_sync_records)} sync records"
)
metrics = [] metrics = []
for sync_record in recent_sync_records: for sync_record in recent_sync_records:
# Skip if no end time (sync still in progress) # Build a job_id for correlation
if not sync_record.sync_end_time: job_id = build_job_id("sync_record", str(sync_record.id))
continue
# Check if we already emitted a metric for this sync record # Emit a SUCCESS/FAIL boolean metric
metric_key = ( # Use a single Redis key to avoid re-emitting final metrics
f"sync_speed:{sync_record.sync_type}:" final_metric_key = _FINAL_METRIC_KEY_FMT.format(
f"{sync_record.entity_id}:{sync_record.id}" sync_type=sync_record.sync_type,
entity_id=sync_record.entity_id,
sync_record_id=sync_record.id,
) )
if _has_metric_been_emitted(redis_std, metric_key): if not _has_metric_been_emitted(redis_std, final_metric_key):
task_logger.info( # Evaluate success
f"Skipping metric for sync record {sync_record.id} " sync_succeeded = sync_record.sync_status == SyncStatus.SUCCESS
"because it has already been emitted"
)
continue
# Calculate sync duration in minutes
sync_duration_mins = (
sync_record.sync_end_time - sync_record.sync_start_time
).total_seconds() / 60.0
# Calculate sync speed (docs/min) - avoid division by zero
sync_speed = (
sync_record.num_docs_synced / sync_duration_mins
if sync_duration_mins > 0
else None
)
if sync_speed is None:
task_logger.error(
f"Something went wrong with sync speed calculation. "
f"Sync record: {sync_record.id}, duration: {sync_duration_mins}, "
f"docs synced: {sync_record.num_docs_synced}"
)
continue
task_logger.info(
f"Calculated sync speed for record {sync_record.id}: {sync_speed} docs/min"
)
metrics.append( metrics.append(
Metric( Metric(
key=metric_key, key=final_metric_key,
name="sync_speed_docs_per_min", name="sync_run_succeeded",
value=sync_speed, value=sync_succeeded,
tags={ tags={
"job_id": job_id,
"sync_type": str(sync_record.sync_type), "sync_type": str(sync_record.sync_type),
"status": str(sync_record.sync_status), "status": str(sync_record.sync_status),
}, },
) )
) )
# Add sync start latency metric # If successful, emit additional metrics
start_latency_key = ( if sync_succeeded:
f"sync_start_latency:{sync_record.sync_type}" if sync_record.sync_end_time and sync_record.sync_start_time:
f":{sync_record.entity_id}:{sync_record.id}" duration_seconds = (
sync_record.sync_end_time - sync_record.sync_start_time
).total_seconds()
else:
task_logger.error(
f"Invalid times for sync record {sync_record.id}: "
f"start={sync_record.sync_start_time}, end={sync_record.sync_end_time}"
) )
if _has_metric_been_emitted(redis_std, start_latency_key): duration_seconds = None
task_logger.info(
f"Skipping start latency metric for sync record {sync_record.id} "
"because it has already been emitted"
)
continue
doc_count = sync_record.num_docs_synced or 0
sync_speed = None
if duration_seconds and duration_seconds > 0:
duration_mins = duration_seconds / 60.0
sync_speed = (
doc_count / duration_mins if duration_mins > 0 else None
)
# Emit duration, doc count, speed
if duration_seconds is not None:
metrics.append(
Metric(
key=None,
name="sync_duration_seconds",
value=duration_seconds,
tags={
"job_id": job_id,
"sync_type": str(sync_record.sync_type),
},
)
)
else:
task_logger.error(
f"Invalid sync record {sync_record.id} with no duration"
)
metrics.append(
Metric(
key=None,
name="sync_doc_count",
value=doc_count,
tags={
"job_id": job_id,
"sync_type": str(sync_record.sync_type),
},
)
)
if sync_speed is not None:
metrics.append(
Metric(
key=None,
name="sync_speed_docs_per_min",
value=sync_speed,
tags={
"job_id": job_id,
"sync_type": str(sync_record.sync_type),
},
)
)
else:
task_logger.error(
f"Invalid sync record {sync_record.id} with no duration"
)
# Mark final metrics as emitted so we don't re-emit
_mark_metric_as_emitted(redis_std, final_metric_key)
# Emit start latency
start_latency_key = _SYNC_START_LATENCY_KEY_FMT.format(
sync_type=sync_record.sync_type,
entity_id=sync_record.entity_id,
sync_record_id=sync_record.id,
)
if not _has_metric_been_emitted(redis_std, start_latency_key):
# Get the entity's last update time based on sync type # Get the entity's last update time based on sync type
entity: DocumentSet | UserGroup | None = None entity: DocumentSet | UserGroup | None = None
if sync_record.sync_type == SyncType.DOCUMENT_SET: if sync_record.sync_type == SyncType.DOCUMENT_SET:
@@ -374,35 +503,29 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
select(UserGroup).where(UserGroup.id == sync_record.entity_id) select(UserGroup).where(UserGroup.id == sync_record.entity_id)
) )
else: else:
# Skip other sync types
task_logger.info( task_logger.info(
f"Skipping sync record {sync_record.id} " f"Skipping sync record {sync_record.id} of type {sync_record.sync_type}."
f"with type {sync_record.sync_type} "
f"and id {sync_record.entity_id} "
"because it is not a document set or user group"
) )
continue continue
if entity is None: if entity is None:
task_logger.error( task_logger.error(
f"Could not find entity for sync record {sync_record.id} " f"Could not find entity for sync record {sync_record.id} "
f"with type {sync_record.sync_type} and id {sync_record.entity_id}" f"(type={sync_record.sync_type}, id={sync_record.entity_id})."
) )
continue continue
# Calculate start latency in seconds # Calculate start latency in seconds:
# (actual sync start) - (last modified time)
if entity.time_last_modified_by_user and sync_record.sync_start_time:
start_latency = ( start_latency = (
sync_record.sync_start_time - entity.time_last_modified_by_user sync_record.sync_start_time - entity.time_last_modified_by_user
).total_seconds() ).total_seconds()
task_logger.info(
f"Calculated start latency for sync record {sync_record.id}: {start_latency} seconds"
)
if start_latency < 0: if start_latency < 0:
task_logger.error( task_logger.error(
f"Start latency is negative for sync record {sync_record.id} " f"Negative start latency for sync record {sync_record.id} "
f"with type {sync_record.sync_type} and id {sync_record.entity_id}. " f"(start={sync_record.sync_start_time}, entity_modified={entity.time_last_modified_by_user})"
f"Sync start time: {sync_record.sync_start_time}, "
f"Entity last modified: {entity.time_last_modified_by_user}"
) )
continue continue
@@ -412,14 +535,32 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
name="sync_start_latency_seconds", name="sync_start_latency_seconds",
value=start_latency, value=start_latency,
tags={ tags={
"job_id": job_id,
"sync_type": str(sync_record.sync_type), "sync_type": str(sync_record.sync_type),
}, },
) )
) )
_mark_metric_as_emitted(redis_std, start_latency_key)
return metrics return metrics
def build_job_id(
job_type: Literal["connector", "sync_record"],
primary_id: str,
secondary_id: str | None = None,
) -> str:
if job_type == "connector":
if secondary_id is None:
raise ValueError(
"secondary_id (attempt_id) is required for connector job_type"
)
return f"connector:{primary_id}:attempt:{secondary_id}"
elif job_type == "sync_record":
return f"sync_record:{primary_id}"
@shared_task( @shared_task(
name=OnyxCeleryTask.MONITOR_BACKGROUND_PROCESSES, name=OnyxCeleryTask.MONITOR_BACKGROUND_PROCESSES,
soft_time_limit=_MONITORING_SOFT_TIME_LIMIT, soft_time_limit=_MONITORING_SOFT_TIME_LIMIT,
@@ -459,6 +600,7 @@ def monitor_background_processes(self: Task, *, tenant_id: str | None) -> None:
lambda: _collect_connector_metrics(db_session, redis_std), lambda: _collect_connector_metrics(db_session, redis_std),
lambda: _collect_sync_metrics(db_session, redis_std), lambda: _collect_sync_metrics(db_session, redis_std),
] ]
# Collect and log each metric # Collect and log each metric
with get_session_with_tenant(tenant_id) as db_session: with get_session_with_tenant(tenant_id) as db_session:
for metric_fn in metric_functions: for metric_fn in metric_functions:

View File

@@ -8,20 +8,64 @@ from sqlalchemy.orm import Session
from onyx.db.enums import SyncStatus from onyx.db.enums import SyncStatus
from onyx.db.enums import SyncType from onyx.db.enums import SyncType
from onyx.db.models import SyncRecord from onyx.db.models import SyncRecord
from onyx.setup import setup_logger
logger = setup_logger()
def insert_sync_record( def insert_sync_record(
db_session: Session, db_session: Session,
entity_id: int | None, entity_id: int,
sync_type: SyncType, sync_type: SyncType,
) -> SyncRecord: ) -> SyncRecord:
"""Insert a new sync record into the database. """Insert a new sync record into the database, cancelling any existing in-progress records.
Args: Args:
db_session: The database session to use db_session: The database session to use
entity_id: The ID of the entity being synced (document set ID, user group ID, etc.) entity_id: The ID of the entity being synced (document set ID, user group ID, etc.)
sync_type: The type of sync operation sync_type: The type of sync operation
""" """
# If an existing in-progress sync record exists, mark as cancelled
existing_in_progress_sync_record = fetch_latest_sync_record(
db_session, entity_id, sync_type, sync_status=SyncStatus.IN_PROGRESS
)
if existing_in_progress_sync_record is not None:
logger.info(
f"Cancelling existing in-progress sync record {existing_in_progress_sync_record.id} "
f"for entity_id={entity_id} sync_type={sync_type}"
)
mark_sync_records_as_cancelled(db_session, entity_id, sync_type)
return _create_sync_record(db_session, entity_id, sync_type)
def mark_sync_records_as_cancelled(
db_session: Session,
entity_id: int | None,
sync_type: SyncType,
) -> None:
stmt = (
update(SyncRecord)
.where(
and_(
SyncRecord.entity_id == entity_id,
SyncRecord.sync_type == sync_type,
SyncRecord.sync_status == SyncStatus.IN_PROGRESS,
)
)
.values(sync_status=SyncStatus.CANCELED)
)
db_session.execute(stmt)
db_session.commit()
def _create_sync_record(
db_session: Session,
entity_id: int | None,
sync_type: SyncType,
) -> SyncRecord:
"""Create and insert a new sync record into the database."""
sync_record = SyncRecord( sync_record = SyncRecord(
entity_id=entity_id, entity_id=entity_id,
sync_type=sync_type, sync_type=sync_type,
@@ -39,6 +83,7 @@ def fetch_latest_sync_record(
db_session: Session, db_session: Session,
entity_id: int, entity_id: int,
sync_type: SyncType, sync_type: SyncType,
sync_status: SyncStatus | None = None,
) -> SyncRecord | None: ) -> SyncRecord | None:
"""Fetch the most recent sync record for a given entity ID and status. """Fetch the most recent sync record for a given entity ID and status.
@@ -59,6 +104,9 @@ def fetch_latest_sync_record(
.limit(1) .limit(1)
) )
if sync_status is not None:
stmt = stmt.where(SyncRecord.sync_status == sync_status)
result = db_session.execute(stmt) result = db_session.execute(stmt)
return result.scalar_one_or_none() return result.scalar_one_or_none()

View File

@@ -123,6 +123,7 @@ def optional_telemetry(
headers={"Content-Type": "application/json"}, headers={"Content-Type": "application/json"},
json=payload, json=payload,
) )
except Exception: except Exception:
# This way it silences all thread level logging as well # This way it silences all thread level logging as well
pass pass

View File