fix(runtime): pause autopilots inside the runtime-delete teardown transaction

DeleteAgentRuntime paused autopilots for the runtime's archived agents
just outside the teardown transaction. If the pause succeeded but a later
delete failed and the transaction rolled back, the autopilots stayed
paused while the runtime survived. Move ListArchivedAgentIDsByRuntime and
PauseAutopilotsByAgentAssignees inside the tx (via qtx) and treat a pause
error as a hard failure, matching ArchiveAgentsAndDeleteRuntime.

Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
J
2026-06-08 13:10:56 +08:00
parent 8abdc77961
commit 04e82bda1f

View File

@@ -587,25 +587,6 @@ func (h *Handler) DeleteAgentRuntime(w http.ResponseWriter, r *http.Request) {
return
}
// Pause autopilots pointing at the archived agents BEFORE we delete
// them. Migration 096 dropped the autopilot.assignee_id agent FK, so a
// hard-delete here would otherwise leave dangling rows that subsequent
// scheduler ticks would skip with "assignee agent no longer exists" —
// quiet, but burning a run record every tick until an operator notices.
// Pausing makes the breakage visible in the autopilot list so the owner
// can re-point or delete the row instead.
archivedAgentIDs, err := h.Queries.ListArchivedAgentIDsByRuntime(r.Context(), rt.ID)
if err != nil {
writeError(w, http.StatusInternalServerError, "failed to enumerate archived agents")
return
}
if len(archivedAgentIDs) > 0 {
if err := h.Queries.PauseAutopilotsByAgentAssignees(r.Context(), archivedAgentIDs); err != nil {
slog.Warn("pause autopilots for archived agents failed",
"runtime_id", uuidToString(rt.ID), "error", err)
}
}
tx, err := h.TxStarter.Begin(r.Context())
if err != nil {
writeError(w, http.StatusInternalServerError, "failed to delete runtime")
@@ -614,6 +595,27 @@ func (h *Handler) DeleteAgentRuntime(w http.ResponseWriter, r *http.Request) {
defer tx.Rollback(r.Context())
qtx := h.Queries.WithTx(tx)
// Pause autopilots pointing at the archived agents BEFORE we delete
// them. Migration 096 dropped the autopilot.assignee_id agent FK, so a
// hard-delete here would otherwise leave dangling rows that subsequent
// scheduler ticks would skip with "assignee agent no longer exists" —
// quiet, but burning a run record every tick until an operator notices.
// Pausing makes the breakage visible in the autopilot list so the owner
// can re-point or delete the row instead. This runs inside the teardown
// transaction so a pause that lands but is followed by a failed delete
// rolls back with everything else, matching ArchiveAgentsAndDeleteRuntime.
archivedAgentIDs, err := qtx.ListArchivedAgentIDsByRuntime(r.Context(), rt.ID)
if err != nil {
writeError(w, http.StatusInternalServerError, "failed to enumerate archived agents")
return
}
if len(archivedAgentIDs) > 0 {
if err := qtx.PauseAutopilotsByAgentAssignees(r.Context(), archivedAgentIDs); err != nil {
writeError(w, http.StatusInternalServerError, "failed to pause autopilots")
return
}
}
// Remove archived squads whose leader is an archived agent on this runtime
// so the RESTRICT FK on squad.leader_id won't block the subsequent agent
// deletion. Active squads are handled by the 409 guard above instead.