Compare commits

...

2 Commits

Author SHA1 Message Date
J
e4448f146f fix(scheduler): floor next_run_at advance at plan_time to survive clock skew
Addresses review feedback on the next_run_at write-back (MUL-3749):

- The post-dispatch advance computed the value from time.Now() alone. The
  handler is entered only after DB time judged the plan due, so if this app
  instance's clock lags the DB clock at a period boundary, time.Now() could
  recompute the slot that just fired and next_run_at would not advance —
  the original staleness bug, at the boundary. Extract advancedNextRun,
  which anchors at max(now, plan_time) via NextOccurrenceAfterUTC so the
  written value is always strictly after the fired plan_time while still
  tracking the local clock in the normal case.
- Add scheduler-layer tests asserting the written value is strictly after
  plan_time across skew / on-slot / normal cases. The previous service-layer
  test only exercised the helper with an explicit after, not this path.
- Sync the stale ListSchedulableAutopilotTriggers comment: the scheduler
  now writes last_fired_at via AdvanceTriggerNextRun (sqlc regenerated).

MUL-3749

Co-authored-by: multica-agent <github@multica.ai>
2026-06-26 18:31:32 +08:00
J
05a4ee013d fix(scheduler): advance autopilot next_run_at after each scheduled dispatch
The display-only autopilot_trigger.next_run_at column was written only on
trigger create/update and never advanced afterward, so for a recurring
schedule it froze at a past slot and the list rendered it as a 'next run'
in the past (e.g. '53m ago'). The intended AdvanceTriggerNextRun query was
dead code with zero callers.

Wire it up at the scheduler's existing post-dispatch seam (replacing the
last_fired_at-only TouchAutopilotTriggerFiredAt bump, which AdvanceTrigger-
NextRun already supersets). The advanced value is computed on the app local
clock via ComputeNextRun — the same path create/update use — so the whole
next_run_at display column is owned by one clock and stays consistent;
scheduling itself is untouched and still runs off DB time via
NextOccurrencesUTC. On a cron/timezone parse failure we fall back to the
last_fired_at-only bump.

Adds a deterministic regression test for the reported scenario (hourly
cron in America/New_York) and documents the local-clock ownership on
ComputeNextRun.

MUL-3749

Co-authored-by: multica-agent <github@multica.ai>
2026-06-26 17:38:45 +08:00
6 changed files with 148 additions and 14 deletions

View File

@@ -361,10 +361,29 @@ func autopilotHandler(
return HandlerResult{}, fmt.Errorf("dispatch for plan: %w", err)
}
// Bump the display-only last_fired_at so the trigger UI shows
// the most recent fire time. Errors are not fatal — the
// canonical record is autopilot_run.created_at.
_ = queries.TouchAutopilotTriggerFiredAt(ctx, trigger.ID)
// Advance the display-only next_run_at to the upcoming slot and
// bump last_fired_at in the same write, so the trigger UI stops
// showing a "next run" that has already fired (MUL-3749). The
// value stays on the app local clock — consistent with the trigger
// create/update display path — but is floored at the plan_time
// that just fired (see advancedNextRun), so a lagging app clock can
// never recompute the slot we just dispatched. Errors are not
// fatal: the canonical record is autopilot_run.created_at and the
// next dispatch refreshes the value regardless; on a cron/timezone
// parse failure we fall back to the last_fired_at-only bump so the
// fire is still recorded.
tz := DefaultAutopilotScheduleTimezone
if trigger.Timezone.Valid && trigger.Timezone.String != "" {
tz = trigger.Timezone.String
}
if next, ok := advancedNextRun(trigger.CronExpression.String, tz, in.PlanTime, time.Now()); ok {
_ = queries.AdvanceTriggerNextRun(ctx, db.AdvanceTriggerNextRunParams{
ID: trigger.ID,
NextRunAt: pgtype.Timestamptz{Time: next, Valid: true},
})
} else {
_ = queries.TouchAutopilotTriggerFiredAt(ctx, trigger.ID)
}
return HandlerResult{
RowsAffected: 1,
@@ -376,6 +395,28 @@ func autopilotHandler(
}
}
// advancedNextRun computes the display-only next_run_at value to write
// after a schedule trigger fires at planTime. It evaluates the cron on the
// app local clock (`now`, normally time.Now()) but anchors at
// max(now, planTime), so the result is always strictly after the slot that
// just fired — even when this app instance's clock lags the DB clock that
// judged the plan due. Without that floor a sub-second-late local clock
// could recompute the same slot and the next_run_at staleness bug
// (MUL-3749) would reappear at the top-of-period boundary. Returns
// ok=false when the cron/timezone fail to parse, signalling the caller to
// fall back to a last_fired_at-only bump.
func advancedNextRun(cronExpr, timezone string, planTime, now time.Time) (time.Time, bool) {
anchor := now
if planTime.After(anchor) {
anchor = planTime
}
next, err := service.NextOccurrenceAfterUTC(cronExpr, timezone, anchor)
if err != nil {
return time.Time{}, false
}
return next, true
}
// parseScopeUUID converts a scope.ID string back into pgtype.UUID.
// We accept both the standard hyphenated form and the unhyphenated
// raw 32-hex form to keep the contract forgiving.

View File

@@ -0,0 +1,58 @@
package scheduler
import (
"testing"
"time"
)
// TestAdvancedNextRunStrictlyAfterPlanTime is the regression guard for
// MUL-3749's boundary case: the post-dispatch next_run_at write-back must
// land on the slot AFTER the one that just fired, even when this app
// instance's local clock lags the DB clock that judged the plan due.
// Anchoring naively on time.Now() alone could recompute the just-fired
// slot; advancedNextRun floors the anchor at plan_time to prevent it.
// Uses the reported scenario: hourly cron in America/New_York, fired slot
// 03:00 EDT (07:00 UTC), next slot 04:00 EDT (08:00 UTC).
func TestAdvancedNextRunStrictlyAfterPlanTime(t *testing.T) {
const cron = "0 * * * *"
const tz = "America/New_York"
planTime := time.Date(2026, 6, 26, 7, 0, 0, 0, time.UTC)
want := time.Date(2026, 6, 26, 8, 0, 0, 0, time.UTC)
cases := []struct {
name string
now time.Time
}{
{"app clock lags the fired slot (skew)", planTime.Add(-90 * time.Second)},
{"app clock exactly on the fired slot", planTime},
{"app clock just after the fired slot (normal)", planTime.Add(5 * time.Second)},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got, ok := advancedNextRun(cron, tz, planTime, tc.now)
if !ok {
t.Fatal("expected ok=true for a valid cron/timezone")
}
if !got.Equal(want) {
t.Fatalf("got %s, want %s", got.Format(time.RFC3339), want.Format(time.RFC3339))
}
if !got.After(planTime) {
t.Fatalf("next_run_at %s must be strictly after the fired plan_time %s",
got.Format(time.RFC3339), planTime.Format(time.RFC3339))
}
})
}
}
// TestAdvancedNextRunInvalidInputsSignalFallback verifies the helper
// reports ok=false (so the handler falls back to the last_fired_at-only
// bump) when the cron or timezone cannot be parsed.
func TestAdvancedNextRunInvalidInputsSignalFallback(t *testing.T) {
planTime := time.Date(2026, 6, 26, 7, 0, 0, 0, time.UTC)
if _, ok := advancedNextRun("not a cron", "UTC", planTime, planTime); ok {
t.Fatal("expected ok=false for an invalid cron expression")
}
if _, ok := advancedNextRun("0 * * * *", "Mars/Olympus", planTime, planTime); ok {
t.Fatal("expected ok=false for an invalid timezone")
}
}

View File

@@ -60,15 +60,22 @@ func NextOccurrencesUTC(cronExpr, timezone string, after, until time.Time) ([]ti
return out, nil
}
// ComputeNextRun is the legacy wrapper used by the trigger create/update
// handlers and the failure monitor. It evaluates the cron at the app's
// local now() — kept for the display-only autopilot_trigger.next_run_at
// column so we do not have to thread DB time through every UI write
// path in this same change. Scheduling decisions MUST go through
// NextOccurrencesUTC against DB time instead.
// ComputeNextRun evaluates the cron at the app's local now() and backs
// the display-only autopilot_trigger.next_run_at column for the trigger
// create/update handlers and the failure monitor. Using the local clock
// for this display value is deliberate: app/DB clock skew under NTP is far
// below the column's minute-level granularity, so threading DB time
// through these UI write paths would buy no user-visible accuracy.
//
// MUL-3551: this function is on its way out; new callers should use
// NextOccurrenceAfterUTC with a db_now() input instead.
// The scheduler's post-dispatch advance keeps next_run_at on the same
// local clock but calls NextOccurrenceAfterUTC anchored at
// max(now, plan_time) rather than this helper, so a lagging app clock can
// never re-point the column at the slot that just fired (see
// scheduler.advancedNextRun / autopilotHandler, MUL-3749).
//
// Scheduling decisions are a separate concern and MUST go through
// NextOccurrencesUTC / NextOccurrenceAfterUTC against DB time instead:
// dispatch correctness across clock-skewed app instances depends on it.
func ComputeNextRun(cronExpr, timezone string) (time.Time, error) {
return NextOccurrenceAfterUTC(cronExpr, timezone, time.Now())
}

View File

@@ -134,3 +134,29 @@ func TestNextOccurrenceAfterUTCIgnoresWallClock(t *testing.T) {
t.Fatalf("got %s, want %s", got.Format(time.RFC3339), want.Format(time.RFC3339))
}
}
// TestNextOccurrenceAdvancesPastFiredSlot locks in the property the
// scheduler's next_run_at write-back relies on (MUL-3749): once a
// recurring trigger fires at a slot, the next computed occurrence is the
// FOLLOWING slot, strictly after the one that just fired — never the same
// instant. Regression guard for the bug where next_run_at froze at a past
// slot and the list rendered it as "53m ago". Uses the exact reported
// scenario: hourly cron in America/New_York, slot 03:00 EDT (07:00 UTC).
func TestNextOccurrenceAdvancesPastFiredSlot(t *testing.T) {
const cron = "0 * * * *" // every hour, on the hour
const tz = "America/New_York"
fired := time.Date(2026, 6, 26, 7, 0, 0, 0, time.UTC) // 03:00 EDT
want := time.Date(2026, 6, 26, 8, 0, 0, 0, time.UTC) // 04:00 EDT
got, err := NextOccurrenceAfterUTC(cron, tz, fired)
if err != nil {
t.Fatalf("NextOccurrenceAfterUTC: %v", err)
}
if !got.Equal(want) {
t.Fatalf("got %s, want %s", got.Format(time.RFC3339), want.Format(time.RFC3339))
}
if !got.After(fired) {
t.Fatalf("next occurrence %s must be strictly after the fired slot %s", got, fired)
}
}

View File

@@ -837,7 +837,8 @@ type ListSchedulableAutopilotTriggersRow struct {
// last_fired_at is read so the planner hook can anchor cold-start
// enumeration on the most recent successful fire (set by either the
// legacy goroutine before the new scheduler took over, or the new
// scheduler's own TouchAutopilotTriggerFiredAt call). Without it,
// scheduler's own post-dispatch advance — AdvanceTriggerNextRun, falling
// back to TouchAutopilotTriggerFiredAt on a cron parse error). Without it,
// a trigger that was created days ago and fired by the legacy code
// looks like a brand-new trigger to the new scheduler on first tick
// and the half-open `(created_at, now]` enumeration replays the most

View File

@@ -301,7 +301,8 @@ RETURNING *;
-- last_fired_at is read so the planner hook can anchor cold-start
-- enumeration on the most recent successful fire (set by either the
-- legacy goroutine before the new scheduler took over, or the new
-- scheduler's own TouchAutopilotTriggerFiredAt call). Without it,
-- scheduler's own post-dispatch advance — AdvanceTriggerNextRun, falling
-- back to TouchAutopilotTriggerFiredAt on a cron parse error). Without it,
-- a trigger that was created days ago and fired by the legacy code
-- looks like a brand-new trigger to the new scheduler on first tick
-- and the half-open `(created_at, now]` enumeration replays the most