From 945d684afd981e178fa2ed23ef59e00d53fc9080 Mon Sep 17 00:00:00 2001 From: Multica Eve Date: Thu, 4 Jun 2026 11:21:51 +0800 Subject: [PATCH] MUL-2965 fix(metrics): harden business sampler quality (#3738) * fix(metrics): harden business sampler quality (MUL-2965) Co-authored-by: multica-agent * fix(metrics): alert on sampler acquire failures Co-authored-by: multica-agent --------- Co-authored-by: Eve Co-authored-by: multica-agent --- .../multica/templates/prometheusrule.yaml | 43 +++++++++++++++++++ deploy/helm/multica/values.yaml | 14 ++++++ server/internal/metrics/business_sampler.go | 12 +++--- .../metrics/business_sampler_queries.go | 28 ++++++++---- .../internal/metrics/business_sampler_test.go | 16 ++++--- ...sk_queue_running_started_at_index.down.sql | 1 + ...task_queue_running_started_at_index.up.sql | 11 +++++ ..._agent_runtime_last_seen_at_index.down.sql | 1 + ...15_agent_runtime_last_seen_at_index.up.sql | 8 ++++ 9 files changed, 113 insertions(+), 21 deletions(-) create mode 100644 deploy/helm/multica/templates/prometheusrule.yaml create mode 100644 server/migrations/114_agent_task_queue_running_started_at_index.down.sql create mode 100644 server/migrations/114_agent_task_queue_running_started_at_index.up.sql create mode 100644 server/migrations/115_agent_runtime_last_seen_at_index.down.sql create mode 100644 server/migrations/115_agent_runtime_last_seen_at_index.up.sql diff --git a/deploy/helm/multica/templates/prometheusrule.yaml b/deploy/helm/multica/templates/prometheusrule.yaml new file mode 100644 index 000000000..6b4640e76 --- /dev/null +++ b/deploy/helm/multica/templates/prometheusrule.yaml @@ -0,0 +1,43 @@ +{{- if .Values.monitoring.prometheusRule.enabled }} +{{- $samplerQueryNames := "active_users|active_workspaces|task_queued|task_running|task_stuck|runtime_online|runtime_heartbeat_age|workspace_total" -}} +{{- $samplerErrorNames := printf "acquire|%s" $samplerQueryNames -}} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "multica.backend.fullname" . }}-sampler + labels: + {{- include "multica.labels" . | nindent 4 }} + app.kubernetes.io/component: backend + {{- with .Values.monitoring.prometheusRule.additionalLabels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + - name: multica-business-sampler + rules: + - alert: MulticaBusinessSamplerQueryErrors + expr: | + sum by (name) ( + rate(multica_business_sampler_query_errors_total{name=~"{{ $samplerErrorNames }}"}[10m]) + ) > 0 + for: {{ .Values.monitoring.prometheusRule.samplerQueryErrorsFor | quote }} + labels: + severity: {{ .Values.monitoring.prometheusRule.severity | quote }} + annotations: + summary: "Business sampler query errors" + description: "Business sampler query {{ "{{ $labels.name }}" }} is returning errors. Metrics may go stale if the query keeps timing out." + - alert: MulticaBusinessSamplerQueryLatencyHigh + expr: | + histogram_quantile( + 0.95, + sum by (le, name) ( + rate(multica_business_sampler_query_seconds_bucket{name=~"{{ $samplerQueryNames }}"}[10m]) + ) + ) > 0.3 + for: {{ .Values.monitoring.prometheusRule.samplerQueryLatencyFor | quote }} + labels: + severity: {{ .Values.monitoring.prometheusRule.severity | quote }} + annotations: + summary: "Business sampler query p95 is high" + description: "Business sampler query {{ "{{ $labels.name }}" }} p95 is above 300ms, leaving little headroom before the 500ms statement_timeout." +{{- end }} diff --git a/deploy/helm/multica/values.yaml b/deploy/helm/multica/values.yaml index 1d3f35ca9..5e3fa5fb9 100644 --- a/deploy/helm/multica/values.yaml +++ b/deploy/helm/multica/values.yaml @@ -160,3 +160,17 @@ ingress: # tls: # - hosts: [multica.dev.lan, api.multica.dev.lan] # secretName: multica-tls + +# ----------------------------------------------------------------------------- +# Monitoring +# ----------------------------------------------------------------------------- +monitoring: + prometheusRule: + # Requires the Prometheus Operator CRD (monitoring.coreos.com/v1). Kept + # disabled by default so minimal self-host installs without that CRD still + # render and install. + enabled: false + additionalLabels: {} + samplerQueryErrorsFor: 5m + samplerQueryLatencyFor: 10m + severity: warning diff --git a/server/internal/metrics/business_sampler.go b/server/internal/metrics/business_sampler.go index 97bb3386f..77109d9ca 100644 --- a/server/internal/metrics/business_sampler.go +++ b/server/internal/metrics/business_sampler.go @@ -29,12 +29,12 @@ const ( defaultSamplerQueryTimeout = 500 * time.Millisecond samplerRowLimit = 100 - // Active-user / active-workspace windows. The label set is fixed — - // new windows must be added explicitly here so cardinality stays - // bounded under all input. + // Active-user / active-workspace DB windows. Keep this DB-sampled path + // to the short window only: PR2's counters do not carry user/workspace + // IDs, so PromQL cannot derive distinct 1h/24h active estimates without + // adding high-cardinality labels. Long-window actives need a separate + // counter-derived aggregation, not expanding this sampler over history. windowFiveMinutes = "5m" - windowOneHour = "1h" - windowOneDay = "24h" // Runtime is considered online if its last_seen_at is within this // many seconds of `now()`. 60s matches the daemon heartbeat cadence @@ -54,8 +54,6 @@ var samplerWindows = []struct { d time.Duration }{ {windowFiveMinutes, 5 * time.Minute}, - {windowOneHour, time.Hour}, - {windowOneDay, 24 * time.Hour}, } // BusinessSamplerOptions configures the BusinessSamplerCollector. A nil diff --git a/server/internal/metrics/business_sampler_queries.go b/server/internal/metrics/business_sampler_queries.go index 76baa2847..9e3f6b927 100644 --- a/server/internal/metrics/business_sampler_queries.go +++ b/server/internal/metrics/business_sampler_queries.go @@ -19,9 +19,10 @@ import ( // queryActiveUsers fills snap.activeUsers with a count of distinct user_ids // that have either sent a chat message or had an agent task created for one -// of their issues inside each rolling window. We deliberately skip Postgres -// `now() - interval '$1'` substitution and instead pass a typed interval as -// a bind parameter so the caller cannot inject SQL via the window string. +// of their issues inside the short rolling window. We deliberately skip +// Postgres `now() - interval '$1'` substitution and instead pass a typed +// interval as a bind parameter so the caller cannot inject SQL via the +// window string. // // Note on cardinality: there is intentionally NO `LIMIT` on the inner // distinct subquery — capping it would silently truncate the COUNT to 100 @@ -62,9 +63,9 @@ SELECT count(DISTINCT user_id) FROM ( } // queryActiveWorkspaces counts distinct workspaces with chat or task -// activity in each window. Mirrors queryActiveUsers and intentionally has -// no inner LIMIT for the same reason: a LIMIT inside the distinct subquery -// would truncate the COUNT and report a wrong value. +// activity in the short window. Mirrors queryActiveUsers and intentionally +// has no inner LIMIT for the same reason: a LIMIT inside the distinct +// subquery would truncate the COUNT and report a wrong value. func (c *BusinessSamplerCollector) queryActiveWorkspaces( ctx context.Context, tx pgx.Tx, snap *samplerSnapshot, ) error { @@ -140,7 +141,19 @@ LIMIT 100 func (c *BusinessSamplerCollector) queryTaskRunning( ctx context.Context, tx pgx.Tx, snap *samplerSnapshot, ) error { + // Keep dispatched and running in separate UNION ALL branches so + // Postgres can match the existing dispatched partial index and the + // running-only partial index from migration 114 independently. const stmt = ` +WITH in_flight AS ( + SELECT chat_session_id, autopilot_run_id, issue_id, runtime_id + FROM agent_task_queue + WHERE status = 'dispatched' + UNION ALL + SELECT chat_session_id, autopilot_run_id, issue_id, runtime_id + FROM agent_task_queue + WHERE status = 'running' +) SELECT CASE WHEN atq.chat_session_id IS NOT NULL THEN 'chat' @@ -150,9 +163,8 @@ SELECT END AS source, COALESCE(ar.runtime_mode, 'unknown') AS runtime_mode, count(*) AS n -FROM agent_task_queue atq +FROM in_flight atq LEFT JOIN agent_runtime ar ON ar.id = atq.runtime_id -WHERE atq.status IN ('dispatched', 'running') GROUP BY 1, 2 LIMIT 100 ` diff --git a/server/internal/metrics/business_sampler_test.go b/server/internal/metrics/business_sampler_test.go index 9728b8ff4..6d3e4b9dd 100644 --- a/server/internal/metrics/business_sampler_test.go +++ b/server/internal/metrics/business_sampler_test.go @@ -42,11 +42,7 @@ func newTestSampler(t *testing.T, refresh func(ctx context.Context, now time.Tim func filledSnapshot(now time.Time) *samplerSnapshot { snap := newSamplerSnapshot(now) snap.activeUsers[windowFiveMinutes] = 7 - snap.activeUsers[windowOneHour] = 42 - snap.activeUsers[windowOneDay] = 100 snap.activeWorkspaces[windowFiveMinutes] = 3 - snap.activeWorkspaces[windowOneHour] = 11 - snap.activeWorkspaces[windowOneDay] = 25 snap.taskQueued["chat"] = 5 snap.taskQueued["issue"] = 2 @@ -110,8 +106,6 @@ func TestBusinessSamplerCollectorEmitsExpectedMetrics(t *testing.T) { wantSubstrings := []string{ `multica_active_users{window="5m"} 7`, - `multica_active_users{window="1h"} 42`, - `multica_active_users{window="24h"} 100`, `multica_active_workspaces{window="5m"} 3`, `multica_agent_task_queued{source="chat"} 5`, `multica_agent_task_queued{source="issue"} 2`, @@ -132,6 +126,16 @@ func TestBusinessSamplerCollectorEmitsExpectedMetrics(t *testing.T) { t.Errorf("metrics body missing %q\nbody:\n%s", want, body) } } + for _, removed := range []string{ + `multica_active_users{window="1h"}`, + `multica_active_users{window="24h"}`, + `multica_active_workspaces{window="1h"}`, + `multica_active_workspaces{window="24h"}`, + } { + if strings.Contains(body, removed) { + t.Errorf("metrics body still exposes removed long DB window %q\nbody:\n%s", removed, body) + } + } } // TestBusinessSamplerSelfIntrospectionHistogramIsExposed observes a value diff --git a/server/migrations/114_agent_task_queue_running_started_at_index.down.sql b/server/migrations/114_agent_task_queue_running_started_at_index.down.sql new file mode 100644 index 000000000..ef42ac85d --- /dev/null +++ b/server/migrations/114_agent_task_queue_running_started_at_index.down.sql @@ -0,0 +1 @@ +DROP INDEX CONCURRENTLY IF EXISTS idx_agent_task_queue_running_started_at; diff --git a/server/migrations/114_agent_task_queue_running_started_at_index.up.sql b/server/migrations/114_agent_task_queue_running_started_at_index.up.sql new file mode 100644 index 000000000..cc5f185c8 --- /dev/null +++ b/server/migrations/114_agent_task_queue_running_started_at_index.up.sql @@ -0,0 +1,11 @@ +-- Covers sampler reads over running tasks: +-- * task_stuck: status = 'running' AND started_at < now() - interval '30 minutes' +-- * task_running: the running half of the in-flight task count +-- +-- agent_task_queue is hot, so this must stay in its own single-statement +-- migration and must use CONCURRENTLY. The migration runner executes files +-- outside an explicit transaction; keeping CONCURRENTLY isolated avoids +-- Postgres' implicit transaction block for multi-statement query strings. +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_agent_task_queue_running_started_at + ON agent_task_queue (started_at) + WHERE status = 'running'; diff --git a/server/migrations/115_agent_runtime_last_seen_at_index.down.sql b/server/migrations/115_agent_runtime_last_seen_at_index.down.sql new file mode 100644 index 000000000..e470c24a9 --- /dev/null +++ b/server/migrations/115_agent_runtime_last_seen_at_index.down.sql @@ -0,0 +1 @@ +DROP INDEX CONCURRENTLY IF EXISTS idx_agent_runtime_last_seen_at; diff --git a/server/migrations/115_agent_runtime_last_seen_at_index.up.sql b/server/migrations/115_agent_runtime_last_seen_at_index.up.sql new file mode 100644 index 000000000..c470007e8 --- /dev/null +++ b/server/migrations/115_agent_runtime_last_seen_at_index.up.sql @@ -0,0 +1,8 @@ +-- Covers sampler reads over runtime heartbeats: +-- * runtime_online: last_seen_at > now() - online window +-- * runtime_heartbeat_age: last_seen_at > now() - 15 minutes ORDER BY last_seen_at DESC +-- +-- agent_runtime heartbeat writes are frequent, so this must stay in its own +-- single-statement migration and must use CONCURRENTLY. +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_agent_runtime_last_seen_at + ON agent_runtime (last_seen_at);