Compare commits

...

2 Commits

Author SHA1 Message Date
J
2778967fb8 docs(daemon): fix GetAutopilotRunGCCheck comment — completed_at is not a TTL anchor
The endpoint comment still claimed the daemon uses completed_at as the TTL
anchor for terminal runs. GC now decides purely on terminal status (the
workdir is never reused, so a terminal run is reclaimed on sight);
completed_at is returned for the API contract / diagnostics only. Addresses
the review nit on #4287.

Co-authored-by: multica-agent <github@multica.ai>
2026-06-18 11:17:29 +08:00
J
d38cf891b7 fix(daemon): reclaim autopilot_run workdir on terminal status (MUL-3403)
Autopilot run workdirs are never reused — there is no PriorWorkDir path
that hands a later run the same directory, so every run gets a fresh one.
Yet GC waited the full GCTTL (default 24h) before reclaiming a terminal
run's dir. Combined with one fresh dir per run, high-frequency autopilots
piled up hundreds of stale dirs (508 dirs / 22GB in the field report).

Drop the TTL gate so a terminal run (completed/failed/skipped/
issue_created) is reclaimed immediately, mirroring gcDecisionQuickCreate.
Existing safety constraints are untouched: active-env-root short-circuit,
404 -> orphanByMTime, non-404 error -> skip, and the local_directory
override all still apply.

Co-authored-by: multica-agent <github@multica.ai>
2026-06-18 11:08:08 +08:00
4 changed files with 58 additions and 25 deletions

View File

@@ -391,9 +391,10 @@ func (c *Client) GetChatSessionGCCheck(ctx context.Context, sessionID string) (*
}
// AutopilotRunGCStatus carries the status of an autopilot run. CompletedAt
-// is the run's terminal timestamp (zero for non-terminal runs); the GC loop
-// uses it as the TTL anchor instead of UpdatedAt because autopilot_run rows
-// have no updated_at column.
+// is the run's terminal timestamp (zero for non-terminal runs). The GC loop
+// reclaims a terminal run's never-reused workdir as soon as it sees the
+// terminal status, so it no longer gates on CompletedAt; the field is kept for
+// the API response contract and diagnostics.
type AutopilotRunGCStatus struct {
Status string `json:"status"`
CompletedAt time.Time `json:"completed_at"`

View File

@@ -366,24 +366,24 @@ func (d *Daemon) gcDecisionAutopilotRun(ctx context.Context, taskDir string, met
// dead weight from here on.
// Non-terminal: pending, running. Skip until they reach a terminal state
// rather than trying to bound them by mtime — long autopilots are real.
+//
+// An autopilot run's workdir is never reused: unlike issue/chat tasks there
+// is no PriorWorkDir path that hands a later run the same directory, so every
+// run gets a fresh one. Whatever the run produced already lives server-side
+// (and an issue_created run handed its work to an issue task that owns its own
+// envRoot). So the moment the run reaches a terminal state the directory is
+// dead weight and we reclaim it immediately, without waiting out GCTTL — the
+// same reasoning gcDecisionQuickCreate applies to quick-create dirs. The
+// active-env-root short-circuit in shouldCleanTaskDir still protects a run
+// that is mid-flight, so this can't pull the rug from under live work.
if isAutopilotRunTerminal(status.Status) {
-anchor := status.CompletedAt
-if anchor.IsZero() {
-// Defensive: terminal status without completed_at means the
-// run finished but the column wasn't stamped (older code path).
-// Fall back to the meta's CompletedAt so we still GC eventually.
-anchor = meta.CompletedAt
-}
-if !anchor.IsZero() && time.Since(anchor) > d.cfg.GCTTL {
-d.logger.Info("gc: eligible for cleanup",
-"dir", filepath.Base(taskDir),
-"kind", "autopilot_run",
-"autopilot_run", meta.AutopilotRunID,
-"status", status.Status,
-"completed_at", anchor.Format(time.RFC3339),
-)
-return gcActionClean
-}
+d.logger.Info("gc: eligible for cleanup",
+"dir", filepath.Base(taskDir),
+"kind", "autopilot_run",
+"autopilot_run", meta.AutopilotRunID,
+"status", status.Status,
+)
+return gcActionClean
}
return gcActionSkip
}

View File

@@ -968,13 +968,44 @@ func TestShouldCleanTaskDir_KindDispatch(t *testing.T) {
want: gcActionSkip,
},
{
-name: "autopilot completed within TTL — skip",
+name: "autopilot pending — skip",
meta: &execenv.GCMeta{Kind: execenv.GCKindAutopilotRun, AutopilotRunID: runID, WorkspaceID: "ws"},
servers: []serverResp{{
path: "/api/daemon/autopilot-runs/" + runID + "/gc-check",
+body: map[string]any{"status": "pending"},
+}},
+want: gcActionSkip,
+},
+{
+// The directory is never reused, so a terminal run is reclaimed on
+// sight — the recent completed_at no longer buys it a 24h reprieve.
+name: "autopilot completed within TTL — clean immediately (no 24h gate)",
+meta: &execenv.GCMeta{Kind: execenv.GCKindAutopilotRun, AutopilotRunID: runID, WorkspaceID: "ws"},
+servers: []serverResp{{
+path: "/api/daemon/autopilot-runs/" + runID + "/gc-check",
body: map[string]any{"status": "completed", "completed_at": withinTTL},
}},
-want: gcActionSkip,
+want: gcActionClean,
},
+{
+// Terminal status with no completed_at stamp at all still cleans —
+// GC keys purely on the terminal status, not on any timestamp.
+name: "autopilot skipped with no completed_at — clean",
+meta: &execenv.GCMeta{Kind: execenv.GCKindAutopilotRun, AutopilotRunID: runID, WorkspaceID: "ws"},
+servers: []serverResp{{
+path: "/api/daemon/autopilot-runs/" + runID + "/gc-check",
+body: map[string]any{"status": "skipped"},
+}},
+want: gcActionClean,
+},
+{
+name: "autopilot failed — clean",
+meta: &execenv.GCMeta{Kind: execenv.GCKindAutopilotRun, AutopilotRunID: runID, WorkspaceID: "ws"},
+servers: []serverResp{{
+path: "/api/daemon/autopilot-runs/" + runID + "/gc-check",
+body: map[string]any{"status": "failed"},
+}},
+want: gcActionClean,
+},
// ---- quick-create -------------------------------------------------

View File

@@ -2498,9 +2498,10 @@ func (h *Handler) GetChatSessionGCCheck(w http.ResponseWriter, r *http.Request)
}
// GetAutopilotRunGCCheck returns the status and completed_at of an autopilot
-// run for the daemon GC loop. autopilot_run has no updated_at column; the
-// daemon uses completed_at as the TTL anchor for terminal runs, and treats
-// non-terminal status as a skip signal regardless of timestamp.
+// run for the daemon GC loop. The daemon decides purely on terminal status:
+// an autopilot run's workdir is never reused, so a terminal run is reclaimed on
+// sight while non-terminal status is a skip signal — completed_at is returned
+// for the API contract and diagnostics, not as a TTL anchor.
//
// Workspace ownership is resolved via the parent autopilot row.
func (h *Handler) GetAutopilotRunGCCheck(w http.ResponseWriter, r *http.Request) {