Compare commits

...

2 Commits

Author SHA1 Message Date
J
034feea8f7 fix(daemon): honor task-deleted signal in post-runTask completion guard
The final pre-completion check in handleTask only looked for
status == "cancelled" and ignored errors. After PR #2107 added a 404
task-deleted cancellation path to the in-flight watcher, this trailing
guard fell out of sync — if the task was deleted between the watcher's
last poll and runTask returning, handleTask would still try to call
CompleteTask and only learn about the deletion via the 404 from that
callback.

Reuse shouldInterruptAgent so the same truth table (cancelled OR
404 task-not-found, but NOT transient errors) drives both polling and
the final guard.

Co-authored-by: multica-agent <github@multica.ai>
2026-05-06 15:50:07 +08:00
J
e3b4bc07f9 fix(server): return 500 for transient DB errors in daemon task lookup
requireDaemonTaskAccess used to turn any GetAgentTask error into
404 "task not found", including transient DB connection / pool errors.
Combined with PR #2107 — which added 404+"task not found" as a daemon
cancellation trigger — that means a single DB hiccup could kill an
in-flight agent run.

Distinguish pgx.ErrNoRows (real "task gone", 404) from other errors
(transient, 500 + warn log) using the existing isNotFound helper.

Tests cover both paths via the mockDB pattern already used by
TestFindOrCreateUserGating.

Co-authored-by: multica-agent <github@multica.ai>
2026-05-06 15:49:59 +08:00
3 changed files with 62 additions and 6 deletions

View File

@@ -1463,11 +1463,12 @@ func (d *Daemon) handleTask(ctx context.Context, task Task, slot int) {
_ = d.client.ReportProgress(ctx, task.ID, "Finishing task", 2, 2)
// Check if the task was cancelled while it was running (e.g. issue
// was reassigned). If so, skip reporting results — the server already
// moved the task to 'cancelled' so complete/fail would fail anyway.
if status, err := d.client.GetTaskStatus(ctx, task.ID); err == nil && status == "cancelled" {
taskLog.Info("task cancelled during execution, discarding result")
// Final pre-completion check: if the server already moved the task to
// "cancelled" or deleted the row outright, skip reporting — the
// complete/fail callbacks would fail anyway. Reuse shouldInterruptAgent
// so this guard honors the same signals as the in-flight watcher.
if status, err := d.client.GetTaskStatus(ctx, task.ID); shouldInterruptAgent(status, err) {
taskLog.Info("task cancelled during execution, discarding result", "status", status, "error", err)
return
}

View File

@@ -78,7 +78,15 @@ func (h *Handler) requireDaemonTaskAccess(w http.ResponseWriter, r *http.Request
}
task, err := h.Queries.GetAgentTask(r.Context(), taskUUID)
if err != nil {
writeError(w, http.StatusNotFound, "task not found")
// Only treat pgx.ErrNoRows as a real "task gone" signal — daemon
// uses this 404 to interrupt the running agent, so a transient DB
// error must not be reported as a deletion.
if isNotFound(err) {
writeError(w, http.StatusNotFound, "task not found")
return db.AgentTaskQueue{}, false
}
slog.Warn("get agent task failed", "task_id", taskID, "error", err)
writeError(w, http.StatusInternalServerError, "failed to load task")
return db.AgentTaskQueue{}, false
}

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"strings"
@@ -11,7 +12,9 @@ import (
"time"
"github.com/go-chi/chi/v5"
"github.com/jackc/pgx/v5"
"github.com/multica-ai/multica/server/internal/middleware"
db "github.com/multica-ai/multica/server/pkg/db/generated"
)
// slowProbeLocalSkillListStore wraps a LocalSkillListStore but blocks inside
@@ -323,6 +326,50 @@ func TestGetTaskStatus_WithDaemonToken_CrossWorkspace(t *testing.T) {
}
}
// TestGetTaskStatus_TransientDBError_Returns500 verifies that a transient DB
// error from GetAgentTask is reported as 500 rather than 404. The daemon
// uses 404+"task not found" as a hard cancel signal; a transient lookup
// failure must therefore not be smuggled into that body, otherwise a single
// DB hiccup would kill an in-flight agent.
func TestGetTaskStatus_TransientDBError_Returns500(t *testing.T) {
h := &Handler{}
h.Queries = db.New(&mockDB{getUserErr: errors.New("connection reset by peer")})
w := httptest.NewRecorder()
req := newDaemonTokenRequest("GET", "/api/daemon/tasks/00000000-0000-0000-0000-000000000001/status", nil,
"00000000-0000-0000-0000-000000000000", "test-daemon")
req = withURLParam(req, "taskId", "00000000-0000-0000-0000-000000000001")
h.GetTaskStatus(w, req)
if w.Code != http.StatusInternalServerError {
t.Fatalf("transient DB error: expected 500, got %d: %s", w.Code, w.Body.String())
}
if strings.Contains(w.Body.String(), "task not found") {
t.Fatalf("transient DB error must not surface as %q (daemon treats that as a deletion): %s", "task not found", w.Body.String())
}
}
// TestGetTaskStatus_ErrNoRows_Returns404 verifies that an actually-missing
// task row still returns the 404+"task not found" body the daemon relies on
// to interrupt the running agent.
func TestGetTaskStatus_ErrNoRows_Returns404(t *testing.T) {
h := &Handler{}
h.Queries = db.New(&mockDB{getUserErr: pgx.ErrNoRows})
w := httptest.NewRecorder()
req := newDaemonTokenRequest("GET", "/api/daemon/tasks/00000000-0000-0000-0000-000000000001/status", nil,
"00000000-0000-0000-0000-000000000000", "test-daemon")
req = withURLParam(req, "taskId", "00000000-0000-0000-0000-000000000001")
h.GetTaskStatus(w, req)
if w.Code != http.StatusNotFound {
t.Fatalf("ErrNoRows: expected 404, got %d: %s", w.Code, w.Body.String())
}
if !strings.Contains(w.Body.String(), "task not found") {
t.Fatalf("ErrNoRows: expected body to contain %q, got %s", "task not found", w.Body.String())
}
}
func TestGetIssueGCCheck_WithDaemonToken_CrossWorkspace(t *testing.T) {
if testHandler == nil {
t.Skip("database not available")