fix(scheduler): floor next_run_at advance at plan_time to survive clock skew

Addresses review feedback on the next_run_at write-back (MUL-3749): - The post-dispatch advance computed the value from time.Now() alone. The handler is entered only after DB time judged the plan due, so if this app instance's clock lags the DB clock at a period boundary, time.Now() could recompute the slot that just fired and next_run_at would not advance — the original staleness bug, at the boundary. Extract advancedNextRun, which anchors at max(now, plan_time) via NextOccurrenceAfterUTC so the written value is always strictly after the fired plan_time while still tracking the local clock in the normal case. - Add scheduler-layer tests asserting the written value is strictly after plan_time across skew / on-slot / normal cases. The previous service-layer test only exercised the helper with an explicit after, not this path. - Sync the stale ListSchedulableAutopilotTriggers comment: the scheduler now writes last_fired_at via AdvanceTriggerNextRun (sqlc regenerated). MUL-3749 Co-authored-by: multica-agent <github@multica.ai>
fix(scheduler): advance autopilot next_run_at after each scheduled dispatch
2026-06-27 01:19:26 +02:00 · 2026-06-26 18:31:32 +08:00 · 2026-06-26 17:38:45 +08:00
6 changed files with 148 additions and 14 deletions
--- a/server/internal/scheduler/jobs_autopilot.go
+++ b/server/internal/scheduler/jobs_autopilot.go
@@ -361,10 +361,29 @@ func autopilotHandler(
 			return HandlerResult{}, fmt.Errorf("dispatch for plan: %w", err)
 		}

-		// Bump the display-only last_fired_at so the trigger UI shows
-		// the most recent fire time. Errors are not fatal — the
-		// canonical record is autopilot_run.created_at.
-		_ = queries.TouchAutopilotTriggerFiredAt(ctx, trigger.ID)
+		// Advance the display-only next_run_at to the upcoming slot and
+		// bump last_fired_at in the same write, so the trigger UI stops
+		// showing a "next run" that has already fired (MUL-3749). The
+		// value stays on the app local clock — consistent with the trigger
+		// create/update display path — but is floored at the plan_time
+		// that just fired (see advancedNextRun), so a lagging app clock can
+		// never recompute the slot we just dispatched. Errors are not
+		// fatal: the canonical record is autopilot_run.created_at and the
+		// next dispatch refreshes the value regardless; on a cron/timezone
+		// parse failure we fall back to the last_fired_at-only bump so the
+		// fire is still recorded.
+		tz := DefaultAutopilotScheduleTimezone
+		if trigger.Timezone.Valid && trigger.Timezone.String != "" {
+			tz = trigger.Timezone.String
+		}
+		if next, ok := advancedNextRun(trigger.CronExpression.String, tz, in.PlanTime, time.Now()); ok {
+			_ = queries.AdvanceTriggerNextRun(ctx, db.AdvanceTriggerNextRunParams{
+				ID:        trigger.ID,
+				NextRunAt: pgtype.Timestamptz{Time: next, Valid: true},
+			})
+		} else {
+			_ = queries.TouchAutopilotTriggerFiredAt(ctx, trigger.ID)
+		}

 		return HandlerResult{
 			RowsAffected: 1,
@@ -376,6 +395,28 @@ func autopilotHandler(
 	}
 }

+// advancedNextRun computes the display-only next_run_at value to write
+// after a schedule trigger fires at planTime. It evaluates the cron on the
+// app local clock (`now`, normally time.Now()) but anchors at
+// max(now, planTime), so the result is always strictly after the slot that
+// just fired — even when this app instance's clock lags the DB clock that
+// judged the plan due. Without that floor a sub-second-late local clock
+// could recompute the same slot and the next_run_at staleness bug
+// (MUL-3749) would reappear at the top-of-period boundary. Returns
+// ok=false when the cron/timezone fail to parse, signalling the caller to
+// fall back to a last_fired_at-only bump.
+func advancedNextRun(cronExpr, timezone string, planTime, now time.Time) (time.Time, bool) {
+	anchor := now
+	if planTime.After(anchor) {
+		anchor = planTime
+	}
+	next, err := service.NextOccurrenceAfterUTC(cronExpr, timezone, anchor)
+	if err != nil {
+		return time.Time{}, false
+	}
+	return next, true
+}
+
 // parseScopeUUID converts a scope.ID string back into pgtype.UUID.
 // We accept both the standard hyphenated form and the unhyphenated
 // raw 32-hex form to keep the contract forgiving.
--- a/server/internal/scheduler/jobs_autopilot_test.go
+++ b/server/internal/scheduler/jobs_autopilot_test.go
@@ -0,0 +1,58 @@
+package scheduler
+
+import (
+	"testing"
+	"time"
+)
+
+// TestAdvancedNextRunStrictlyAfterPlanTime is the regression guard for
+// MUL-3749's boundary case: the post-dispatch next_run_at write-back must
+// land on the slot AFTER the one that just fired, even when this app
+// instance's local clock lags the DB clock that judged the plan due.
+// Anchoring naively on time.Now() alone could recompute the just-fired
+// slot; advancedNextRun floors the anchor at plan_time to prevent it.
+// Uses the reported scenario: hourly cron in America/New_York, fired slot
+// 03:00 EDT (07:00 UTC), next slot 04:00 EDT (08:00 UTC).
+func TestAdvancedNextRunStrictlyAfterPlanTime(t *testing.T) {
+	const cron = "0 * * * *"
+	const tz = "America/New_York"
+	planTime := time.Date(2026, 6, 26, 7, 0, 0, 0, time.UTC)
+	want := time.Date(2026, 6, 26, 8, 0, 0, 0, time.UTC)
+
+	cases := []struct {
+		name string
+		now  time.Time
+	}{
+		{"app clock lags the fired slot (skew)", planTime.Add(-90 * time.Second)},
+		{"app clock exactly on the fired slot", planTime},
+		{"app clock just after the fired slot (normal)", planTime.Add(5 * time.Second)},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got, ok := advancedNextRun(cron, tz, planTime, tc.now)
+			if !ok {
+				t.Fatal("expected ok=true for a valid cron/timezone")
+			}
+			if !got.Equal(want) {
+				t.Fatalf("got %s, want %s", got.Format(time.RFC3339), want.Format(time.RFC3339))
+			}
+			if !got.After(planTime) {
+				t.Fatalf("next_run_at %s must be strictly after the fired plan_time %s",
+					got.Format(time.RFC3339), planTime.Format(time.RFC3339))
+			}
+		})
+	}
+}
+
+// TestAdvancedNextRunInvalidInputsSignalFallback verifies the helper
+// reports ok=false (so the handler falls back to the last_fired_at-only
+// bump) when the cron or timezone cannot be parsed.
+func TestAdvancedNextRunInvalidInputsSignalFallback(t *testing.T) {
+	planTime := time.Date(2026, 6, 26, 7, 0, 0, 0, time.UTC)
+	if _, ok := advancedNextRun("not a cron", "UTC", planTime, planTime); ok {
+		t.Fatal("expected ok=false for an invalid cron expression")
+	}
+	if _, ok := advancedNextRun("0 * * * *", "Mars/Olympus", planTime, planTime); ok {
+		t.Fatal("expected ok=false for an invalid timezone")
+	}
+}
--- a/server/internal/service/cron.go
+++ b/server/internal/service/cron.go
@@ -60,15 +60,22 @@ func NextOccurrencesUTC(cronExpr, timezone string, after, until time.Time) ([]ti
 	return out, nil
 }

-// ComputeNextRun is the legacy wrapper used by the trigger create/update
-// handlers and the failure monitor. It evaluates the cron at the app's
-// local now() — kept for the display-only autopilot_trigger.next_run_at
-// column so we do not have to thread DB time through every UI write
-// path in this same change. Scheduling decisions MUST go through
-// NextOccurrencesUTC against DB time instead.
+// ComputeNextRun evaluates the cron at the app's local now() and backs
+// the display-only autopilot_trigger.next_run_at column for the trigger
+// create/update handlers and the failure monitor. Using the local clock
+// for this display value is deliberate: app/DB clock skew under NTP is far
+// below the column's minute-level granularity, so threading DB time
+// through these UI write paths would buy no user-visible accuracy.
 //
-// MUL-3551: this function is on its way out; new callers should use
-// NextOccurrenceAfterUTC with a db_now() input instead.
+// The scheduler's post-dispatch advance keeps next_run_at on the same
+// local clock but calls NextOccurrenceAfterUTC anchored at
+// max(now, plan_time) rather than this helper, so a lagging app clock can
+// never re-point the column at the slot that just fired (see
+// scheduler.advancedNextRun / autopilotHandler, MUL-3749).
+//
+// Scheduling decisions are a separate concern and MUST go through
+// NextOccurrencesUTC / NextOccurrenceAfterUTC against DB time instead:
+// dispatch correctness across clock-skewed app instances depends on it.
 func ComputeNextRun(cronExpr, timezone string) (time.Time, error) {
 	return NextOccurrenceAfterUTC(cronExpr, timezone, time.Now())
 }
--- a/server/internal/service/cron_test.go
+++ b/server/internal/service/cron_test.go
@@ -134,3 +134,29 @@ func TestNextOccurrenceAfterUTCIgnoresWallClock(t *testing.T) {
 		t.Fatalf("got %s, want %s", got.Format(time.RFC3339), want.Format(time.RFC3339))
 	}
 }
+
+// TestNextOccurrenceAdvancesPastFiredSlot locks in the property the
+// scheduler's next_run_at write-back relies on (MUL-3749): once a
+// recurring trigger fires at a slot, the next computed occurrence is the
+// FOLLOWING slot, strictly after the one that just fired — never the same
+// instant. Regression guard for the bug where next_run_at froze at a past
+// slot and the list rendered it as "53m ago". Uses the exact reported
+// scenario: hourly cron in America/New_York, slot 03:00 EDT (07:00 UTC).
+func TestNextOccurrenceAdvancesPastFiredSlot(t *testing.T) {
+	const cron = "0 * * * *" // every hour, on the hour
+	const tz = "America/New_York"
+
+	fired := time.Date(2026, 6, 26, 7, 0, 0, 0, time.UTC) // 03:00 EDT
+	want := time.Date(2026, 6, 26, 8, 0, 0, 0, time.UTC)  // 04:00 EDT
+
+	got, err := NextOccurrenceAfterUTC(cron, tz, fired)
+	if err != nil {
+		t.Fatalf("NextOccurrenceAfterUTC: %v", err)
+	}
+	if !got.Equal(want) {
+		t.Fatalf("got %s, want %s", got.Format(time.RFC3339), want.Format(time.RFC3339))
+	}
+	if !got.After(fired) {
+		t.Fatalf("next occurrence %s must be strictly after the fired slot %s", got, fired)
+	}
+}
--- a/server/pkg/db/generated/autopilot.sql.go
+++ b/server/pkg/db/generated/autopilot.sql.go
@@ -837,7 +837,8 @@ type ListSchedulableAutopilotTriggersRow struct {
 // last_fired_at is read so the planner hook can anchor cold-start
 // enumeration on the most recent successful fire (set by either the
 // legacy goroutine before the new scheduler took over, or the new
-// scheduler's own TouchAutopilotTriggerFiredAt call). Without it,
+// scheduler's own post-dispatch advance — AdvanceTriggerNextRun, falling
+// back to TouchAutopilotTriggerFiredAt on a cron parse error). Without it,
 // a trigger that was created days ago and fired by the legacy code
 // looks like a brand-new trigger to the new scheduler on first tick
 // and the half-open `(created_at, now]` enumeration replays the most
--- a/server/pkg/db/queries/autopilot.sql
+++ b/server/pkg/db/queries/autopilot.sql
@@ -301,7 +301,8 @@ RETURNING *;
 -- last_fired_at is read so the planner hook can anchor cold-start
 -- enumeration on the most recent successful fire (set by either the
 -- legacy goroutine before the new scheduler took over, or the new
-- scheduler's own TouchAutopilotTriggerFiredAt call). Without it,
+-- scheduler's own post-dispatch advance — AdvanceTriggerNextRun, falling
+-- back to TouchAutopilotTriggerFiredAt on a cron parse error). Without it,
 -- a trigger that was created days ago and fired by the legacy code
 -- looks like a brand-new trigger to the new scheduler on first tick
 -- and the half-open `(created_at, now]` enumeration replays the most