mirror of
https://github.com/multica-ai/multica.git
synced 2026-06-17 11:48:42 +02:00
fix(runtime): pause autopilots inside the runtime-delete teardown transaction
DeleteAgentRuntime paused autopilots for the runtime's archived agents just outside the teardown transaction, so a pause that succeeded before a later delete failed (and rolled back) left autopilots paused while the runtime survived. Move ListArchivedAgentIDsByRuntime + PauseAutopilotsByAgentAssignees inside the tx via qtx and treat a pause error as a hard failure, matching ArchiveAgentsAndDeleteRuntime. Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
@@ -587,25 +587,6 @@ func (h *Handler) DeleteAgentRuntime(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
// Pause autopilots pointing at the archived agents BEFORE we delete
|
||||
// them. Migration 096 dropped the autopilot.assignee_id agent FK, so a
|
||||
// hard-delete here would otherwise leave dangling rows that subsequent
|
||||
// scheduler ticks would skip with "assignee agent no longer exists" —
|
||||
// quiet, but burning a run record every tick until an operator notices.
|
||||
// Pausing makes the breakage visible in the autopilot list so the owner
|
||||
// can re-point or delete the row instead.
|
||||
archivedAgentIDs, err := h.Queries.ListArchivedAgentIDsByRuntime(r.Context(), rt.ID)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "failed to enumerate archived agents")
|
||||
return
|
||||
}
|
||||
if len(archivedAgentIDs) > 0 {
|
||||
if err := h.Queries.PauseAutopilotsByAgentAssignees(r.Context(), archivedAgentIDs); err != nil {
|
||||
slog.Warn("pause autopilots for archived agents failed",
|
||||
"runtime_id", uuidToString(rt.ID), "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
tx, err := h.TxStarter.Begin(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "failed to delete runtime")
|
||||
@@ -614,6 +595,27 @@ func (h *Handler) DeleteAgentRuntime(w http.ResponseWriter, r *http.Request) {
|
||||
defer tx.Rollback(r.Context())
|
||||
qtx := h.Queries.WithTx(tx)
|
||||
|
||||
// Pause autopilots pointing at the archived agents BEFORE we delete
|
||||
// them. Migration 096 dropped the autopilot.assignee_id agent FK, so a
|
||||
// hard-delete here would otherwise leave dangling rows that subsequent
|
||||
// scheduler ticks would skip with "assignee agent no longer exists" —
|
||||
// quiet, but burning a run record every tick until an operator notices.
|
||||
// Pausing makes the breakage visible in the autopilot list so the owner
|
||||
// can re-point or delete the row instead. This runs inside the teardown
|
||||
// transaction so a pause that lands but is followed by a failed delete
|
||||
// rolls back with everything else, matching ArchiveAgentsAndDeleteRuntime.
|
||||
archivedAgentIDs, err := qtx.ListArchivedAgentIDsByRuntime(r.Context(), rt.ID)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "failed to enumerate archived agents")
|
||||
return
|
||||
}
|
||||
if len(archivedAgentIDs) > 0 {
|
||||
if err := qtx.PauseAutopilotsByAgentAssignees(r.Context(), archivedAgentIDs); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "failed to pause autopilots")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Remove archived squads whose leader is an archived agent on this runtime
|
||||
// so the RESTRICT FK on squad.leader_id won't block the subsequent agent
|
||||
// deletion. Active squads are handled by the 409 guard above instead.
|
||||
|
||||
Reference in New Issue
Block a user