From 9a5d8a52f3ffcfcd41eefbfb172e852b5bca9b8d Mon Sep 17 00:00:00 2001 From: Bohan Jiang <52446949+Bohan-J@users.noreply.github.com> Date: Thu, 21 May 2026 16:26:42 +0800 Subject: [PATCH] fix(timezone): harden hourly-rollup rollout against straight-through migrate MUL-2488 (#2998) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(timezone): harden hourly-rollup rollout against straight-through migrate MUL-2488 PR #2968 introduced the new task_usage_hourly rollup but assumed operators would stop migrate between 102 and 103 to run the one-shot cmd/backfill_task_usage_hourly. Two pieces made that unsafe in practice: 1. The Dockerfile only shipped server / multica / migrate, so a deployed container has no backfill binary to run between phases. 2. cmd/migrate has no per-version stop, and entrypoint.sh runs `migrate up` to the latest version, so 103 silently drops the legacy daily rollups even when nobody ran the backfill — leaving usage dashboards at zero despite source data being intact in task_usage. Changes: - Build cmd/backfill_task_usage_hourly into the runtime image alongside the other binaries so operators can `docker exec` the backfill instead of needing a source checkout. - Add a fail-closed plpgsql guard at the top of migration 103 that aborts the migration when task_usage has rows but task_usage_hourly is empty. Fresh databases (no task_usage rows) are exempt because the new triggers from 102 will populate the hourly table on the first event. Already-applied databases are unaffected — schema_migrations tracks by version only, so 103 is not re-run. Co-authored-by: multica-agent * fix(timezone): use watermark coverage for hourly-rollup guard The previous check only required `task_usage_hourly` to be non-empty, which an interrupted backfill or a manual `rollup_task_usage_hourly_window` call both satisfy. The completion signal we actually trust is `task_usage_hourly_rollup_state.watermark_at` — backfill only stamps it to `now() - 5 min` after every monthly slice succeeded, and the cron worker only advances it on a real tick. Default after migration 101 is `1970-01-01`, so an unrun or partial backfill is trivially detected. Also corrects the comment about fresh-install behavior: the triggers in 102 only enqueue dirty keys for agent_task_queue / issue / task_usage DELETE — they do not write hourly rows. INSERT/UPDATE flows through the `updated_at` watermark window of `rollup_task_usage_hourly()`, which only runs once the operator registers it as a pg_cron job. Co-authored-by: multica-agent --------- Co-authored-by: multica-agent --- Dockerfile | 2 + .../103_drop_legacy_daily_rollups.up.sql | 75 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/Dockerfile b/Dockerfile index cbb9f2306..8735c3312 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ ARG COMMIT=unknown RUN cd server && CGO_ENABLED=0 go build -ldflags "-s -w -X main.version=${VERSION} -X main.commit=${COMMIT}" -o bin/server ./cmd/server RUN cd server && CGO_ENABLED=0 go build -ldflags "-s -w -X main.version=${VERSION} -X main.commit=${COMMIT}" -o bin/multica ./cmd/multica RUN cd server && CGO_ENABLED=0 go build -ldflags "-s -w" -o bin/migrate ./cmd/migrate +RUN cd server && CGO_ENABLED=0 go build -ldflags "-s -w" -o bin/backfill_task_usage_hourly ./cmd/backfill_task_usage_hourly # --- Runtime stage --- FROM alpine:3.21 @@ -29,6 +30,7 @@ WORKDIR /app COPY --from=builder /src/server/bin/server . COPY --from=builder /src/server/bin/multica . COPY --from=builder /src/server/bin/migrate . +COPY --from=builder /src/server/bin/backfill_task_usage_hourly . COPY server/migrations/ ./migrations/ COPY docker/entrypoint.sh . RUN sed -i 's/\r$//' entrypoint.sh && chmod +x entrypoint.sh diff --git a/server/migrations/103_drop_legacy_daily_rollups.up.sql b/server/migrations/103_drop_legacy_daily_rollups.up.sql index ad41579e1..f3263234d 100644 --- a/server/migrations/103_drop_legacy_daily_rollups.up.sql +++ b/server/migrations/103_drop_legacy_daily_rollups.up.sql @@ -20,6 +20,81 @@ -- so the migration still succeeds on instances without pg_cron at all -- (same guard pattern as migration 076). +-- --------------------------------------------------------------------------- +-- Fail-closed guard: refuse to drop the legacy daily pipelines unless the +-- hourly rollup has actually been seeded across the existing `task_usage` +-- history. `migrate up` runs the migration set straight through (no +-- per-version stop), so a self-host operator who skips the documented +-- backfill step would otherwise silently land in a state where dashboards +-- show zeros (see SELF-HOST UPGRADE ORDER in +-- cmd/backfill_task_usage_hourly/main.go and +-- docs/timezone-architecture-rfc.md §6 / §7.1). Failing loud here is the +-- only thing that turns an undetected outage into a clear migration error. +-- +-- The completion signal we trust is task_usage_hourly_rollup_state.watermark_at: +-- +-- * cmd/backfill_task_usage_hourly stamps it to `now() - 5 minutes` only +-- after every monthly slice succeeded (server/cmd/backfill_task_usage_hourly/main.go). +-- * rollup_task_usage_hourly() (the pg_cron entry) advances it on every +-- tick (102_task_usage_hourly_pipeline.up.sql). +-- * The default after migration 101 is `1970-01-01`, so an unrun or +-- interrupted backfill is trivially detected. +-- +-- A non-empty `task_usage_hourly` is NOT a safe proxy for "backfill done": +-- the triggers in 102 only enqueue dirty keys on agent_task_queue / +-- issue / task_usage DELETE — they do not write hourly rows themselves, +-- and they fire only on writes since 102 landed. A backfill that +-- crashed mid-run, or a manual `rollup_task_usage_hourly_window` call, +-- both leave the hourly table non-empty but with a stale watermark and +-- partial history; the legacy rollups would still be the only complete +-- read path. +-- +-- A fresh database (no rows in task_usage) is exempt — there is no +-- history to backfill, and rollup_task_usage_hourly() (registered as a +-- pg_cron job by the operator) will populate task_usage_hourly from the +-- first event forward via its updated_at watermark window. +-- --------------------------------------------------------------------------- + +DO $$ +DECLARE + v_watermark TIMESTAMPTZ; + v_max_event TIMESTAMPTZ; + v_lag INTERVAL := INTERVAL '1 hour'; +BEGIN + IF NOT EXISTS (SELECT 1 FROM task_usage LIMIT 1) THEN + RETURN; + END IF; + + SELECT watermark_at INTO v_watermark + FROM task_usage_hourly_rollup_state + WHERE id = 1; + + IF v_watermark IS NULL THEN + RAISE EXCEPTION + 'refusing to drop legacy daily rollups: task_usage_hourly_rollup_state row is missing — apply migrations 100-102 first, then run cmd/backfill_task_usage_hourly'; + END IF; + + SELECT MAX(COALESCE(updated_at, created_at)) INTO v_max_event FROM task_usage; + + -- A successful cmd/backfill_task_usage_hourly stamps watermark_at to + -- `now() - 5 min`; once pg_cron is registered the worker keeps it + -- within a similar window. If the watermark trails the latest event + -- by more than v_lag, one of these went wrong: + -- * the backfill was never run (watermark stuck at 1970-01-01), + -- * the backfill was interrupted before stampWatermark ran, + -- * someone hand-seeded task_usage_hourly without stamping, + -- * pg_cron has been off long enough to drift past the cap. + -- In every case, dropping the legacy rollups now would remove the + -- only read path for buckets the hourly pipeline has not proven it + -- owns yet. + IF v_watermark < v_max_event - v_lag THEN + RAISE EXCEPTION + 'refusing to drop legacy daily rollups: task_usage_hourly_rollup_state.watermark_at (%) trails task_usage latest event (%) by more than % — backfill is incomplete or pg_cron is not running. Run cmd/backfill_task_usage_hourly (and let pg_cron catch up) before re-running migrate (see SELF-HOST UPGRADE ORDER in cmd/backfill_task_usage_hourly/main.go).', + v_watermark, v_max_event, v_lag; + END IF; +END +$$; + -- --------------------------------------------------------------------------- -- Unschedule the legacy pg_cron jobs first (no-op when pg_cron is absent -- or the job was never registered).