From 04e82bda1fe78cc80915032d572eb7d58339cd28 Mon Sep 17 00:00:00 2001 From: J Date: Mon, 8 Jun 2026 13:10:56 +0800 Subject: [PATCH] fix(runtime): pause autopilots inside the runtime-delete teardown transaction DeleteAgentRuntime paused autopilots for the runtime's archived agents just outside the teardown transaction, so a pause that succeeded before a later delete failed (and rolled back) left autopilots paused while the runtime survived. Move ListArchivedAgentIDsByRuntime + PauseAutopilotsByAgentAssignees inside the tx via qtx and treat a pause error as a hard failure, matching ArchiveAgentsAndDeleteRuntime. Co-authored-by: multica-agent --- server/internal/handler/runtime.go | 40 ++++++++++++++++-------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/server/internal/handler/runtime.go b/server/internal/handler/runtime.go index bfb19f00d..f3e07dab6 100644 --- a/server/internal/handler/runtime.go +++ b/server/internal/handler/runtime.go @@ -587,25 +587,6 @@ func (h *Handler) DeleteAgentRuntime(w http.ResponseWriter, r *http.Request) { return } - // Pause autopilots pointing at the archived agents BEFORE we delete - // them. Migration 096 dropped the autopilot.assignee_id agent FK, so a - // hard-delete here would otherwise leave dangling rows that subsequent - // scheduler ticks would skip with "assignee agent no longer exists" — - // quiet, but burning a run record every tick until an operator notices. - // Pausing makes the breakage visible in the autopilot list so the owner - // can re-point or delete the row instead. - archivedAgentIDs, err := h.Queries.ListArchivedAgentIDsByRuntime(r.Context(), rt.ID) - if err != nil { - writeError(w, http.StatusInternalServerError, "failed to enumerate archived agents") - return - } - if len(archivedAgentIDs) > 0 { - if err := h.Queries.PauseAutopilotsByAgentAssignees(r.Context(), archivedAgentIDs); err != nil { - slog.Warn("pause autopilots for archived agents failed", - "runtime_id", uuidToString(rt.ID), "error", err) - } - } - tx, err := h.TxStarter.Begin(r.Context()) if err != nil { writeError(w, http.StatusInternalServerError, "failed to delete runtime") @@ -614,6 +595,27 @@ func (h *Handler) DeleteAgentRuntime(w http.ResponseWriter, r *http.Request) { defer tx.Rollback(r.Context()) qtx := h.Queries.WithTx(tx) + // Pause autopilots pointing at the archived agents BEFORE we delete + // them. Migration 096 dropped the autopilot.assignee_id agent FK, so a + // hard-delete here would otherwise leave dangling rows that subsequent + // scheduler ticks would skip with "assignee agent no longer exists" — + // quiet, but burning a run record every tick until an operator notices. + // Pausing makes the breakage visible in the autopilot list so the owner + // can re-point or delete the row instead. This runs inside the teardown + // transaction so a pause that lands but is followed by a failed delete + // rolls back with everything else, matching ArchiveAgentsAndDeleteRuntime. + archivedAgentIDs, err := qtx.ListArchivedAgentIDsByRuntime(r.Context(), rt.ID) + if err != nil { + writeError(w, http.StatusInternalServerError, "failed to enumerate archived agents") + return + } + if len(archivedAgentIDs) > 0 { + if err := qtx.PauseAutopilotsByAgentAssignees(r.Context(), archivedAgentIDs); err != nil { + writeError(w, http.StatusInternalServerError, "failed to pause autopilots") + return + } + } + // Remove archived squads whose leader is an archived agent on this runtime // so the RESTRICT FK on squad.leader_id won't block the subsequent agent // deletion. Active squads are handled by the 409 guard above instead.