Sync status improvements (#3782)
* minor improvements / clarity
* additional comment for clarity
* typing
* quick updates to monitoring
* connector deletion
* quick nit
* fix typing
* update values
* quick nit
* functioning
* improvements to monitoring
* update
* minutes -> seconds
@@ -227,7 +227,7 @@ if not MULTI_TENANT:
     {
         "name": "monitor-background-processes",
         "task": OnyxCeleryTask.MONITOR_BACKGROUND_PROCESSES,
-        "schedule": timedelta(minutes=5),
+        "schedule": timedelta(minutes=15),
         "options": {
             "priority": OnyxCeleryPriority.LOW,
             "expires": BEAT_EXPIRES_DEFAULT,
@@ -139,13 +139,6 @@ def try_generate_document_cc_pair_cleanup_tasks(
         submitted=datetime.now(timezone.utc),
     )
 
-    # create before setting fence to avoid race condition where the monitoring
-    # task updates the sync record before it is created
-    insert_sync_record(
-        db_session=db_session,
-        entity_id=cc_pair_id,
-        sync_type=SyncType.CONNECTOR_DELETION,
-    )
     redis_connector.delete.set_fence(fence_payload)
 
     try:
@@ -184,6 +177,13 @@ def try_generate_document_cc_pair_cleanup_tasks(
         )
         if tasks_generated is None:
             raise ValueError("RedisConnectorDeletion.generate_tasks returned None")
+
+        insert_sync_record(
+            db_session=db_session,
+            entity_id=cc_pair_id,
+            sync_type=SyncType.CONNECTOR_DELETION,
+        )
+
     except TaskDependencyError:
         redis_connector.delete.set_fence(None)
         raise
@@ -4,6 +4,7 @@ from collections.abc import Callable
 from datetime import timedelta
 from itertools import islice
 from typing import Any
+from typing import Literal
 
 from celery import shared_task
 from celery import Task
@@ -26,6 +27,7 @@ from onyx.db.engine import get_all_tenant_ids
 from onyx.db.engine import get_db_current_time
 from onyx.db.engine import get_session_with_tenant
 from onyx.db.enums import IndexingStatus
+from onyx.db.enums import SyncStatus
 from onyx.db.enums import SyncType
 from onyx.db.models import ConnectorCredentialPair
 from onyx.db.models import DocumentSet
@@ -38,6 +40,7 @@ from onyx.redis.redis_pool import redis_lock_dump
 from onyx.utils.telemetry import optional_telemetry
 from onyx.utils.telemetry import RecordType
 
+
 _MONITORING_SOFT_TIME_LIMIT = 60 * 5  # 5 minutes
 _MONITORING_TIME_LIMIT = _MONITORING_SOFT_TIME_LIMIT + 60  # 6 minutes
 
@@ -49,6 +52,12 @@ _CONNECTOR_INDEX_ATTEMPT_RUN_SUCCESS_KEY_FMT = (
     "monitoring_connector_index_attempt_run_success:{cc_pair_id}:{index_attempt_id}"
 )
 
+_FINAL_METRIC_KEY_FMT = "sync_final_metrics:{sync_type}:{entity_id}:{sync_record_id}"
+
+_SYNC_START_LATENCY_KEY_FMT = (
+    "sync_start_latency:{sync_type}:{entity_id}:{sync_record_id}"
+)
+
 
 def _mark_metric_as_emitted(redis_std: Redis, key: str) -> None:
     """Mark a metric as having been emitted by setting a Redis key with expiration"""
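Note: the two key formats added above feed the emit-once guard used throughout this task. A minimal sketch of how such helpers typically look, assuming redis-py and a TTL constant of my own naming; the real `_mark_metric_as_emitted` / `_has_metric_been_emitted` bodies are not part of this hunk:

```python
from redis import Redis

# Assumed TTL for illustration; the actual constant and value live elsewhere in this module.
_METRIC_EMITTED_TTL_SECONDS = 24 * 60 * 60


def _has_metric_been_emitted(redis_std: Redis, key: str) -> bool:
    # A metric counts as emitted while its marker key still exists.
    return bool(redis_std.exists(key))


def _mark_metric_as_emitted(redis_std: Redis, key: str) -> None:
    # Set a marker key with an expiration so the metric can eventually be re-emitted.
    redis_std.set(key, "1", ex=_METRIC_EMITTED_TTL_SECONDS)
```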
@@ -111,6 +120,7 @@ class Metric(BaseModel):
             }.items()
             if v is not None
         }
+        task_logger.info(f"Emitting metric: {data}")
         optional_telemetry(
             record_type=RecordType.METRIC,
             data=data,
@@ -189,66 +199,129 @@ def _build_connector_start_latency_metric(
         f"Start latency for index attempt {recent_attempt.id}: {start_latency:.2f}s "
         f"(desired: {desired_start_time}, actual: {recent_attempt.time_started})"
     )
 
+    job_id = build_job_id("connector", str(cc_pair.id), str(recent_attempt.id))
+
     return Metric(
         key=metric_key,
         name="connector_start_latency",
         value=start_latency,
-        tags={},
+        tags={
+            "job_id": job_id,
+            "connector_id": str(cc_pair.connector.id),
+            "source": str(cc_pair.connector.source),
+        },
     )
 
 
-def _build_run_success_metrics(
+def _build_connector_final_metrics(
     cc_pair: ConnectorCredentialPair,
     recent_attempts: list[IndexAttempt],
     redis_std: Redis,
 ) -> list[Metric]:
+    """
+    Final metrics for connector index attempts:
+      - Boolean success/fail metric
+      - If success, emit:
+          * duration (seconds)
+          * doc_count
+    """
     metrics = []
     for attempt in recent_attempts:
         metric_key = _CONNECTOR_INDEX_ATTEMPT_RUN_SUCCESS_KEY_FMT.format(
             cc_pair_id=cc_pair.id,
             index_attempt_id=attempt.id,
         )
 
         if _has_metric_been_emitted(redis_std, metric_key):
             task_logger.info(
-                f"Skipping metric for connector {cc_pair.connector.id} "
-                f"index attempt {attempt.id} because it has already been "
-                "emitted"
+                f"Skipping final metrics for connector {cc_pair.connector.id} "
+                f"index attempt {attempt.id}, already emitted."
             )
             continue
 
-        if attempt.status in [
+        # We only emit final metrics if the attempt is in a terminal state
+        if attempt.status not in [
             IndexingStatus.SUCCESS,
             IndexingStatus.FAILED,
             IndexingStatus.CANCELED,
         ]:
-            task_logger.info(
-                f"Adding run success metric for index attempt {attempt.id} with status {attempt.status}"
-            )
+            # Not finished; skip
+            continue
+
+        job_id = build_job_id("connector", str(cc_pair.id), str(attempt.id))
+        success = attempt.status == IndexingStatus.SUCCESS
         metrics.append(
             Metric(
-                key=metric_key,
+                key=metric_key,  # We'll mark the same key for any final metrics
                 name="connector_run_succeeded",
-                value=attempt.status == IndexingStatus.SUCCESS,
-                tags={"source": str(cc_pair.connector.source)},
+                value=success,
+                tags={
+                    "job_id": job_id,
+                    "connector_id": str(cc_pair.connector.id),
+                    "source": str(cc_pair.connector.source),
+                    "status": attempt.status.value,
+                },
             )
         )
 
+        if success:
+            # Make sure we have valid time_started
+            if attempt.time_started and attempt.time_updated:
+                duration_seconds = (
+                    attempt.time_updated - attempt.time_started
+                ).total_seconds()
+                metrics.append(
+                    Metric(
+                        key=None,  # No need for a new key, or you can reuse the same if you prefer
+                        name="connector_index_duration_seconds",
+                        value=duration_seconds,
+                        tags={
+                            "job_id": job_id,
+                            "connector_id": str(cc_pair.connector.id),
+                            "source": str(cc_pair.connector.source),
+                        },
+                    )
+                )
+            else:
+                task_logger.error(
+                    f"Index attempt {attempt.id} succeeded but has missing time "
+                    f"(time_started={attempt.time_started}, time_updated={attempt.time_updated})."
+                )
+
+            # For doc counts, choose whichever field is more relevant
+            doc_count = attempt.total_docs_indexed or 0
+            metrics.append(
+                Metric(
+                    key=None,
+                    name="connector_index_doc_count",
+                    value=doc_count,
+                    tags={
+                        "job_id": job_id,
+                        "connector_id": str(cc_pair.connector.id),
+                        "source": str(cc_pair.connector.source),
+                    },
+                )
+            )
+
+        _mark_metric_as_emitted(redis_std, metric_key)
+
     return metrics
 
 
 def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
     """Collect metrics about connector runs from the past hour"""
-    # NOTE: use get_db_current_time since the IndexAttempt times are set based on DB time
     one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)
 
     # Get all connector credential pairs
     cc_pairs = db_session.scalars(select(ConnectorCredentialPair)).all()
+    # Might be more than one search setting, or just one
     active_search_settings = get_active_search_settings(db_session)
 
     metrics = []
 
-    for cc_pair, search_settings in zip(cc_pairs, active_search_settings):
+    # If you want to process each cc_pair against each search setting:
+    for cc_pair in cc_pairs:
+        for search_settings in active_search_settings:
             recent_attempts = (
                 db_session.query(IndexAttempt)
                 .filter(
@@ -259,6 +332,7 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
                 .limit(2)
                 .all()
             )
+
             if not recent_attempts:
                 continue
 
@@ -274,95 +348,150 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
             start_latency_metric = _build_connector_start_latency_metric(
                 cc_pair, most_recent_attempt, second_most_recent_attempt, redis_std
             )
 
             if start_latency_metric:
                 metrics.append(start_latency_metric)
 
             # Connector run success/failure
-            run_success_metrics = _build_run_success_metrics(
+            final_metrics = _build_connector_final_metrics(
                 cc_pair, recent_attempts, redis_std
             )
-            metrics.extend(run_success_metrics)
+            metrics.extend(final_metrics)
 
     return metrics
 
 
 def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
-    """Collect metrics about document set and group syncing speed"""
-    # NOTE: use get_db_current_time since the SyncRecord times are set based on DB time
+    """
+    Collect metrics for document set and group syncing:
+      - Success/failure status
+      - Start latency (always)
+      - Duration & doc count (only if success)
+      - Throughput (docs/min) (only if success)
+    """
     one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)
 
-    # Get all sync records from the last hour
+    # Get all sync records that ended in the last hour
    recent_sync_records = db_session.scalars(
         select(SyncRecord)
-        .where(SyncRecord.sync_start_time >= one_hour_ago)
-        .order_by(SyncRecord.sync_start_time.desc())
+        .where(SyncRecord.sync_end_time.isnot(None))
+        .where(SyncRecord.sync_end_time >= one_hour_ago)
+        .order_by(SyncRecord.sync_end_time.desc())
     ).all()
 
+    task_logger.info(
+        f"Collecting sync metrics for {len(recent_sync_records)} sync records"
+    )
+
     metrics = []
 
     for sync_record in recent_sync_records:
-        # Skip if no end time (sync still in progress)
-        if not sync_record.sync_end_time:
-            continue
-
-        # Check if we already emitted a metric for this sync record
-        metric_key = (
-            f"sync_speed:{sync_record.sync_type}:"
-            f"{sync_record.entity_id}:{sync_record.id}"
+        # Build a job_id for correlation
+        job_id = build_job_id("sync_record", str(sync_record.id))
+
+        # Emit a SUCCESS/FAIL boolean metric
+        # Use a single Redis key to avoid re-emitting final metrics
+        final_metric_key = _FINAL_METRIC_KEY_FMT.format(
+            sync_type=sync_record.sync_type,
+            entity_id=sync_record.entity_id,
+            sync_record_id=sync_record.id,
         )
-        if _has_metric_been_emitted(redis_std, metric_key):
-            task_logger.info(
-                f"Skipping metric for sync record {sync_record.id} "
-                "because it has already been emitted"
-            )
-            continue
-
-        # Calculate sync duration in minutes
-        sync_duration_mins = (
-            sync_record.sync_end_time - sync_record.sync_start_time
-        ).total_seconds() / 60.0
-
-        # Calculate sync speed (docs/min) - avoid division by zero
-        sync_speed = (
-            sync_record.num_docs_synced / sync_duration_mins
-            if sync_duration_mins > 0
-            else None
-        )
-
-        if sync_speed is None:
-            task_logger.error(
-                f"Something went wrong with sync speed calculation. "
-                f"Sync record: {sync_record.id}, duration: {sync_duration_mins}, "
-                f"docs synced: {sync_record.num_docs_synced}"
-            )
-            continue
-
-        task_logger.info(
-            f"Calculated sync speed for record {sync_record.id}: {sync_speed} docs/min"
-        )
-        metrics.append(
-            Metric(
-                key=metric_key,
-                name="sync_speed_docs_per_min",
-                value=sync_speed,
-                tags={
-                    "sync_type": str(sync_record.sync_type),
-                    "status": str(sync_record.sync_status),
-                },
-            )
-        )
-
-        # Add sync start latency metric
-        start_latency_key = (
-            f"sync_start_latency:{sync_record.sync_type}"
-            f":{sync_record.entity_id}:{sync_record.id}"
-        )
-        if _has_metric_been_emitted(redis_std, start_latency_key):
-            task_logger.info(
-                f"Skipping start latency metric for sync record {sync_record.id} "
-                "because it has already been emitted"
-            )
-            continue
-
+        if not _has_metric_been_emitted(redis_std, final_metric_key):
+            # Evaluate success
+            sync_succeeded = sync_record.sync_status == SyncStatus.SUCCESS
+
+            metrics.append(
+                Metric(
+                    key=final_metric_key,
+                    name="sync_run_succeeded",
+                    value=sync_succeeded,
+                    tags={
+                        "job_id": job_id,
+                        "sync_type": str(sync_record.sync_type),
+                        "status": str(sync_record.sync_status),
+                    },
+                )
+            )
+
+            # If successful, emit additional metrics
+            if sync_succeeded:
+                if sync_record.sync_end_time and sync_record.sync_start_time:
+                    duration_seconds = (
+                        sync_record.sync_end_time - sync_record.sync_start_time
+                    ).total_seconds()
+                else:
+                    task_logger.error(
+                        f"Invalid times for sync record {sync_record.id}: "
+                        f"start={sync_record.sync_start_time}, end={sync_record.sync_end_time}"
+                    )
+                    duration_seconds = None
+
+                doc_count = sync_record.num_docs_synced or 0
+
+                sync_speed = None
+                if duration_seconds and duration_seconds > 0:
+                    duration_mins = duration_seconds / 60.0
+                    sync_speed = (
+                        doc_count / duration_mins if duration_mins > 0 else None
+                    )
+
+                # Emit duration, doc count, speed
+                if duration_seconds is not None:
+                    metrics.append(
+                        Metric(
+                            key=None,
+                            name="sync_duration_seconds",
+                            value=duration_seconds,
+                            tags={
+                                "job_id": job_id,
+                                "sync_type": str(sync_record.sync_type),
+                            },
+                        )
+                    )
+                else:
+                    task_logger.error(
+                        f"Invalid sync record {sync_record.id} with no duration"
+                    )
+
+                metrics.append(
+                    Metric(
+                        key=None,
+                        name="sync_doc_count",
+                        value=doc_count,
+                        tags={
+                            "job_id": job_id,
+                            "sync_type": str(sync_record.sync_type),
+                        },
+                    )
+                )
+
+                if sync_speed is not None:
+                    metrics.append(
+                        Metric(
+                            key=None,
+                            name="sync_speed_docs_per_min",
+                            value=sync_speed,
+                            tags={
+                                "job_id": job_id,
+                                "sync_type": str(sync_record.sync_type),
+                            },
+                        )
+                    )
+                else:
+                    task_logger.error(
+                        f"Invalid sync record {sync_record.id} with no duration"
+                    )
+
+            # Mark final metrics as emitted so we don't re-emit
+            _mark_metric_as_emitted(redis_std, final_metric_key)
+
+        # Emit start latency
+        start_latency_key = _SYNC_START_LATENCY_KEY_FMT.format(
+            sync_type=sync_record.sync_type,
+            entity_id=sync_record.entity_id,
+            sync_record_id=sync_record.id,
+        )
+        if not _has_metric_been_emitted(redis_std, start_latency_key):
             # Get the entity's last update time based on sync type
             entity: DocumentSet | UserGroup | None = None
             if sync_record.sync_type == SyncType.DOCUMENT_SET:
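Note: a worked example of the throughput math introduced above, with illustrative numbers only:

```python
# Illustrative values, not taken from any real sync record.
duration_seconds = 300.0  # a 5-minute sync
doc_count = 1200

duration_mins = duration_seconds / 60.0                                 # 5.0
sync_speed = doc_count / duration_mins if duration_mins > 0 else None   # 240.0 docs/min
```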
@@ -374,35 +503,29 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
                     select(UserGroup).where(UserGroup.id == sync_record.entity_id)
                 )
             else:
-                # Skip other sync types
                 task_logger.info(
-                    f"Skipping sync record {sync_record.id} "
-                    f"with type {sync_record.sync_type} "
-                    f"and id {sync_record.entity_id} "
-                    "because it is not a document set or user group"
+                    f"Skipping sync record {sync_record.id} of type {sync_record.sync_type}."
                 )
                 continue
 
             if entity is None:
                 task_logger.error(
                     f"Could not find entity for sync record {sync_record.id} "
-                    f"with type {sync_record.sync_type} and id {sync_record.entity_id}"
+                    f"(type={sync_record.sync_type}, id={sync_record.entity_id})."
                 )
                 continue
 
-            # Calculate start latency in seconds
-            start_latency = (
-                sync_record.sync_start_time - entity.time_last_modified_by_user
-            ).total_seconds()
-            task_logger.info(
-                f"Calculated start latency for sync record {sync_record.id}: {start_latency} seconds"
-            )
-            if start_latency < 0:
-                task_logger.error(
-                    f"Start latency is negative for sync record {sync_record.id} "
-                    f"with type {sync_record.sync_type} and id {sync_record.entity_id}. "
-                    f"Sync start time: {sync_record.sync_start_time}, "
-                    f"Entity last modified: {entity.time_last_modified_by_user}"
-                )
-                continue
+            # Calculate start latency in seconds:
+            # (actual sync start) - (last modified time)
+            if entity.time_last_modified_by_user and sync_record.sync_start_time:
+                start_latency = (
+                    sync_record.sync_start_time - entity.time_last_modified_by_user
+                ).total_seconds()
+                if start_latency < 0:
+                    task_logger.error(
+                        f"Negative start latency for sync record {sync_record.id} "
+                        f"(start={sync_record.sync_start_time}, entity_modified={entity.time_last_modified_by_user})"
+                    )
+                    continue
 
@@ -412,14 +535,32 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
                         name="sync_start_latency_seconds",
                         value=start_latency,
                         tags={
+                            "job_id": job_id,
                             "sync_type": str(sync_record.sync_type),
                         },
                     )
                 )
 
+                _mark_metric_as_emitted(redis_std, start_latency_key)
+
     return metrics
 
 
+def build_job_id(
+    job_type: Literal["connector", "sync_record"],
+    primary_id: str,
+    secondary_id: str | None = None,
+) -> str:
+    if job_type == "connector":
+        if secondary_id is None:
+            raise ValueError(
+                "secondary_id (attempt_id) is required for connector job_type"
+            )
+        return f"connector:{primary_id}:attempt:{secondary_id}"
+    elif job_type == "sync_record":
+        return f"sync_record:{primary_id}"
+
+
 @shared_task(
     name=OnyxCeleryTask.MONITOR_BACKGROUND_PROCESSES,
     soft_time_limit=_MONITORING_SOFT_TIME_LIMIT,
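Note: `build_job_id` gives every metric a stable `job_id` tag tying it back to a specific index attempt or sync record. A usage sketch with illustrative IDs, assuming the helper exactly as defined in the hunk above:

```python
# IDs below are made up for illustration.
build_job_id("connector", "42", "1337")  # -> "connector:42:attempt:1337"
build_job_id("sync_record", "7")         # -> "sync_record:7"
build_job_id("connector", "42")          # raises ValueError: secondary_id (attempt_id) is required
```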
@@ -459,6 +600,7 @@ def monitor_background_processes(self: Task, *, tenant_id: str | None) -> None:
             lambda: _collect_connector_metrics(db_session, redis_std),
             lambda: _collect_sync_metrics(db_session, redis_std),
         ]
+
         # Collect and log each metric
         with get_session_with_tenant(tenant_id) as db_session:
             for metric_fn in metric_functions:
@@ -8,20 +8,64 @@ from sqlalchemy.orm import Session
 from onyx.db.enums import SyncStatus
 from onyx.db.enums import SyncType
 from onyx.db.models import SyncRecord
+from onyx.setup import setup_logger
+
+logger = setup_logger()
 
 
 def insert_sync_record(
     db_session: Session,
-    entity_id: int | None,
+    entity_id: int,
     sync_type: SyncType,
 ) -> SyncRecord:
-    """Insert a new sync record into the database.
+    """Insert a new sync record into the database, cancelling any existing in-progress records.
 
     Args:
         db_session: The database session to use
         entity_id: The ID of the entity being synced (document set ID, user group ID, etc.)
         sync_type: The type of sync operation
     """
+    # If an existing in-progress sync record exists, mark as cancelled
+    existing_in_progress_sync_record = fetch_latest_sync_record(
+        db_session, entity_id, sync_type, sync_status=SyncStatus.IN_PROGRESS
+    )
+
+    if existing_in_progress_sync_record is not None:
+        logger.info(
+            f"Cancelling existing in-progress sync record {existing_in_progress_sync_record.id} "
+            f"for entity_id={entity_id} sync_type={sync_type}"
+        )
+        mark_sync_records_as_cancelled(db_session, entity_id, sync_type)
+
+    return _create_sync_record(db_session, entity_id, sync_type)
+
+
+def mark_sync_records_as_cancelled(
+    db_session: Session,
+    entity_id: int | None,
+    sync_type: SyncType,
+) -> None:
+    stmt = (
+        update(SyncRecord)
+        .where(
+            and_(
+                SyncRecord.entity_id == entity_id,
+                SyncRecord.sync_type == sync_type,
+                SyncRecord.sync_status == SyncStatus.IN_PROGRESS,
+            )
+        )
+        .values(sync_status=SyncStatus.CANCELED)
+    )
+    db_session.execute(stmt)
+    db_session.commit()
+
+
+def _create_sync_record(
+    db_session: Session,
+    entity_id: int | None,
+    sync_type: SyncType,
+) -> SyncRecord:
+    """Create and insert a new sync record into the database."""
     sync_record = SyncRecord(
         entity_id=entity_id,
         sync_type=sync_type,
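Note: a usage sketch of the reworked `insert_sync_record`, mirroring the call added in the connector-deletion task earlier in this diff. The wrapper function name and the module path in the import are assumptions for illustration, not part of the change:

```python
from sqlalchemy.orm import Session

from onyx.db.enums import SyncType
# Module path assumed for illustration.
from onyx.db.sync_record import insert_sync_record


def record_deletion_sync(db_session: Session, cc_pair_id: int) -> None:
    # Hypothetical caller: any record for (cc_pair_id, CONNECTOR_DELETION) still marked
    # IN_PROGRESS is flipped to CANCELED first, then a fresh record is created and returned.
    insert_sync_record(
        db_session=db_session,
        entity_id=cc_pair_id,
        sync_type=SyncType.CONNECTOR_DELETION,
    )
```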
@@ -39,6 +83,7 @@ def fetch_latest_sync_record(
     db_session: Session,
     entity_id: int,
     sync_type: SyncType,
+    sync_status: SyncStatus | None = None,
 ) -> SyncRecord | None:
     """Fetch the most recent sync record for a given entity ID and status.
 
@@ -59,6 +104,9 @@ def fetch_latest_sync_record(
         .limit(1)
     )
 
+    if sync_status is not None:
+        stmt = stmt.where(SyncRecord.sync_status == sync_status)
+
     result = db_session.execute(stmt)
     return result.scalar_one_or_none()
 
@@ -123,6 +123,7 @@ def optional_telemetry(
                 headers={"Content-Type": "application/json"},
                 json=payload,
             )
+
         except Exception:
             # This way it silences all thread level logging as well
             pass