Files
multica/server/internal/handler/runtime_models.go
Bohan Jiang f628e48775 refactor(server): error-returning ParseUUID to prevent silent data loss
* refactor(server): make ParseUUID error-returning to prevent silent data loss (MUL-1410)

util.ParseUUID previously swallowed errors and returned a zero pgtype.UUID
on invalid input. When this zero UUID reached a write query (DELETE/UPDATE),
the SQL matched zero rows and the handler returned 2xx success — producing
silent data corruption. #1661 (DeleteIssue with identifier-style ID) was the
visible symptom; PR #1680 patched that one site, this commit closes the
class of bug.

Changes:

- util.ParseUUID now returns (pgtype.UUID, error). Add util.MustParseUUID
  for trusted round-trips that should panic on invalid input.
- handler/handler.go: parseUUID wrapper now calls MustParseUUID — any
  unguarded user-input string reaching it surfaces as a recovered panic
  (chi middleware.Recoverer → 500) instead of silently corrupting data.
  Add parseUUIDOrBadRequest(w, s, fieldName) for handler entry points.
- Convert every Queries.Delete*/Update* call site reachable from raw user
  input (autopilot, comment, project, skill, skill_file, label, pin,
  attachment, feedback, issue assignee, daemon runtime, workspace) to
  validate UUIDs explicitly with parseUUIDOrBadRequest, returning 400 on
  invalid input. Where a resolved entity.ID is already in scope, write
  queries now use it directly instead of re-parsing the URL string.
- Update getWorkspaceMember + loadIssueForUser to handle invalid UUIDs
  gracefully (404/400 instead of panic).
- Update util/middleware/cmd-level callers (subscriber_listeners,
  notification_listeners, activity_listeners, scope_authorizer,
  middleware/workspace) to use the error-returning API.
- Add server/internal/util/pgx_test.go covering valid/invalid input and
  the MustParseUUID panic contract.
- Add TestDeleteIssueByIdentifier + TestDeleteIssueRejectsInvalidUUID
  regression tests in handler_test.go (the original #1661 bug + the
  invalid-input case).
- Document the handler UUID parsing convention in CLAUDE.md so the rule
  is enforceable in future PR review.

* fix(server): address GPT-Boy review of #1748

P1 fixes from PR #1748 review:

1. Migrate remaining request-boundary UUIDs to parseUUIDOrBadRequest so
   malformed input returns 400 instead of panic/500. This was missing on:
   - issue.go: workspace_id in CreateIssue/ChildIssueProgress/ListIssues/
     SearchIssues/BatchUpdateIssues/BatchDeleteIssues; project_id /
     parent_issue_id / lead_id / assignee_id / assignee_ids / creator_id
     filters; batch issue_ids and assignee/parent/project fields in
     BatchUpdateIssues (skip on bad input via util.ParseUUID, matching
     the existing per-row continue semantics).
   - project.go: project id + workspace_id in GetProject/UpdateProject/
     DeleteProject; lead_id in CreateProject/UpdateProject;
     workspace_id in ListProjects + SearchProjects.
   - handler.go: resolveActor now uses util.ParseUUID for X-Agent-ID /
     X-Task-ID headers; invalid UUID falls back to "member" (matches
     pre-existing semantics) instead of panicking.
   - issue.go: validateAssigneePair returns 400 on invalid workspace_id
     instead of panicking.

2. Fix issue:deleted WS event payloads to emit uuidToString(issue.ID)
   instead of the raw URL string. After an identifier-path delete
   ("MUL-7"), the previous payload would have leaked the identifier to
   subscribers, leaving stale entries in frontend caches that key by
   UUID. Updated DeleteIssue (issue.go:1341) and BatchDeleteIssues
   (issue.go:1641). The slog "issue deleted" log line also now records
   the resolved UUID so logs match the WS payload.

3. Extend TestDeleteIssueByIdentifier to subscribe to the bus and
   assert issue:deleted.payload.issue_id is the resolved UUID, not
   the identifier.

* fix(server): validate remaining reviewed UUID inputs

* fix(server): validate remaining handler UUID inputs

* fix(server): finish request boundary UUID audit

* fix(server): validate remaining request body UUIDs

* fix(server): validate runtime path UUIDs

* fix(server): validate remaining audit UUID inputs

---------

Co-authored-by: Eve <eve@multica.ai>
2026-04-28 14:50:28 +08:00

266 lines
8.4 KiB
Go

package handler
import (
"encoding/json"
"log/slog"
"net/http"
"sync"
"time"
"github.com/go-chi/chi/v5"
)
// ---------------------------------------------------------------------------
// In-memory model-list request store
// ---------------------------------------------------------------------------
//
// The server cannot call the daemon directly (the daemon is behind the user's
// NAT and only polls the server). So "list models for this runtime" uses a
// pending-request pattern: server creates a pending request, daemon pops it
// on the next heartbeat, executes locally, and reports the result back.
// ModelListStatus is the lifecycle state of a model list request, serialized
// as a plain string.
type ModelListStatus string

// Lifecycle states of a model list request.
const (
	// ModelListPending is the state a request is created in.
	ModelListPending = ModelListStatus("pending")
	// ModelListRunning is set once PopPending has handed the request out.
	ModelListRunning = ModelListStatus("running")
	// ModelListCompleted is set when a result has been recorded.
	ModelListCompleted = ModelListStatus("completed")
	// ModelListFailed is set when a failure has been recorded.
	ModelListFailed = ModelListStatus("failed")
	// ModelListTimeout is set when a pending or running request exceeded
	// its time limit.
	ModelListTimeout = ModelListStatus("timeout")
)
// ModelListRequest represents a pending or completed model list request.
// It is the record kept in ModelListStore and the JSON body the handlers in
// this file return to the caller.
//
// Supported is false when the provider ignores per-agent model
// selection entirely (currently: hermes). The UI uses this to
// disable its dropdown rather than silently accepting a value the
// backend will drop.
type ModelListRequest struct {
// ID is the store key of this request, assigned by Create.
ID string `json:"id"`
// RuntimeID is the runtime the request targets; PopPending matches on it.
RuntimeID string `json:"runtime_id"`
// Status is the current lifecycle state.
Status ModelListStatus `json:"status"`
// Models is set by Complete; it is omitted from the JSON when empty.
Models []ModelEntry `json:"models,omitempty"`
// Supported starts as true in Create and is overwritten by Complete.
Supported bool `json:"supported"`
// Error is set by Fail or by a timeout; it is omitted from the JSON when empty.
Error string `json:"error,omitempty"`
// CreatedAt is the creation time; the pending timeout and the store's
// garbage collection are measured from it.
CreatedAt time.Time `json:"created_at"`
// UpdatedAt is the time of the last state change; the running timeout is
// measured from it.
UpdatedAt time.Time `json:"updated_at"`
}
// ModelEntry mirrors agent.Model for the wire. `Default` tags the
// model the runtime advertises as its preferred pick (e.g. Claude
// Code's shipped default, or hermes' currentModelId) so the UI can
// badge it — don't drop it when marshalling.
type ModelEntry struct {
// ID is always present in the JSON output.
ID string `json:"id"`
// Label is always present in the JSON output.
Label string `json:"label"`
// Provider is omitted from the JSON when empty.
Provider string `json:"provider,omitempty"`
// Default is omitted from the JSON when false.
Default bool `json:"default,omitempty"`
}
// Time limits enforced by applyModelListTimeout.
const (
	// modelListPendingTimeout is how long, measured from creation, a request
	// may stay pending before the UI is told "daemon didn't pick this up".
	modelListPendingTimeout time.Duration = 30 * time.Second

	// modelListRunningTimeout is how long, measured from the moment the
	// request was claimed, it may stay running before the UI is told "daemon
	// picked this up but never reported a result". This matters when the
	// heartbeat response carrying `pending_model_list` is lost in transit
	// (e.g. HTTP client timeout after PopPending already mutated store
	// state): without this limit the UI would keep polling a record stuck in
	// `running` until the 2-minute memory GC removes it.
	modelListRunningTimeout time.Duration = 60 * time.Second
)
// ModelListStore is a mutex-guarded in-memory store of model list requests,
// keyed by request ID. To bound memory use, entries created more than 2 min
// ago are removed whenever Create runs (there is no background sweeper); the
// UI polls /requests/:id until status is terminal.
type ModelListStore struct {
// mu guards requests; the store's methods also hold it while they modify
// a stored request.
mu sync.Mutex
// requests maps a request ID to its record.
requests map[string]*ModelListRequest
}
// NewModelListStore returns an empty store whose request map is already
// allocated, so it can be used immediately.
func NewModelListStore() *ModelListStore {
	store := new(ModelListStore)
	store.requests = map[string]*ModelListRequest{}
	return store
}
// Create registers a new pending request for runtimeID and returns it.
//
// Before inserting, it removes every entry created more than two minutes ago
// so the map cannot grow without bound; this is the only place entries are
// removed. The new request starts as ModelListPending with Supported set to
// true and with CreatedAt equal to UpdatedAt.
//
// The returned pointer is the stored record itself, not a copy.
func (s *ModelListStore) Create(runtimeID string) *ModelListRequest {
	// retention is how long an entry is kept, measured from its creation.
	const retention = 2 * time.Minute

	s.mu.Lock()
	defer s.mu.Unlock()

	// Read the clock once: the sweep is judged against a single instant and
	// a fresh request gets identical CreatedAt/UpdatedAt values instead of
	// two readings a few nanoseconds apart.
	now := time.Now()

	// Garbage-collect stale entries so the map can't grow unbounded.
	for id, old := range s.requests {
		if now.Sub(old.CreatedAt) > retention {
			delete(s.requests, id)
		}
	}

	req := &ModelListRequest{
		ID:        randomID(),
		RuntimeID: runtimeID,
		Status:    ModelListPending,
		// Default to true; the daemon overrides this in the report
		// for providers that don't support per-agent model selection.
		Supported: true,
		CreatedAt: now,
		UpdatedAt: now,
	}
	s.requests[req.ID] = req
	return req
}
// Get returns a snapshot of the request stored under id, or nil when there is
// no such request.
//
// Before the snapshot is taken the stored record is passed through
// applyModelListTimeout, so a request that has been pending or running for
// too long is reported (and remembered) as timed out.
//
// The result is a copy rather than the stored record: callers serialize it
// after this method has released the mutex, and reading the live record at
// that point would race with Complete, Fail and PopPending, which modify it
// under the lock. The copy is shallow; the Models slice is shared with the
// stored record, which the store replaces as a whole and does not modify in
// place.
func (s *ModelListStore) Get(id string) *ModelListRequest {
	s.mu.Lock()
	defer s.mu.Unlock()

	req, ok := s.requests[id]
	if !ok {
		return nil
	}
	applyModelListTimeout(req, time.Now())

	snapshot := *req
	return &snapshot
}
// applyModelListTimeout moves req to ModelListTimeout when it has stayed in a
// non-terminal state for longer than that state allows, judged at instant now.
//
//   - A pending request is measured from CreatedAt against
//     modelListPendingTimeout ("daemon never picked this up").
//   - A running request is measured from UpdatedAt against
//     modelListRunningTimeout ("daemon picked it up but the result report was
//     lost").
//
// Requests in any other state, and requests still within their limit, are left
// untouched. On a transition Error is set to a fixed message and UpdatedAt to
// now. The function does no locking of its own.
func applyModelListTimeout(req *ModelListRequest, now time.Time) {
	var (
		since   time.Time
		limit   time.Duration
		message string
	)

	switch req.Status {
	case ModelListPending:
		since, limit = req.CreatedAt, modelListPendingTimeout
		message = "daemon did not respond within 30 seconds"
	case ModelListRunning:
		since, limit = req.UpdatedAt, modelListRunningTimeout
		message = "daemon did not finish within 60 seconds"
	default:
		// Terminal states never time out.
		return
	}

	if now.Sub(since) <= limit {
		return
	}

	req.Status = ModelListTimeout
	req.Error = message
	req.UpdatedAt = now
}
// PopPending claims the oldest pending request for runtimeID: it marks the
// request running, stamps UpdatedAt, and returns it. It returns nil when the
// runtime has no pending request.
//
// Pending requests that have already exceeded modelListPendingTimeout are
// transitioned to timeout here and are not handed out. Without this, a request
// that nobody polled after it expired would still be dispatched, making the
// daemon do work for a caller that has already been told (or will be told)
// the request timed out.
//
// The returned pointer is the stored record itself, not a copy.
func (s *ModelListStore) PopPending(runtimeID string) *ModelListRequest {
	s.mu.Lock()
	defer s.mu.Unlock()

	// One clock reading for both the expiry check and the claim timestamp.
	now := time.Now()

	var oldest *ModelListRequest
	for _, req := range s.requests {
		if req.RuntimeID != runtimeID || req.Status != ModelListPending {
			continue
		}
		// Expire before selecting so an overdue request is never claimed.
		applyModelListTimeout(req, now)
		if req.Status != ModelListPending {
			continue
		}
		if oldest == nil || req.CreatedAt.Before(oldest.CreatedAt) {
			oldest = req
		}
	}

	if oldest != nil {
		oldest.Status = ModelListRunning
		oldest.UpdatedAt = now
	}
	return oldest
}
// Complete records a successful result for the request stored under id: the
// status becomes ModelListCompleted, Models and Supported are replaced with
// the given values, and UpdatedAt is refreshed. An unknown id is ignored.
func (s *ModelListStore) Complete(id string, models []ModelEntry, supported bool) {
	s.mu.Lock()
	defer s.mu.Unlock()

	target, found := s.requests[id]
	if !found {
		return
	}
	target.Status, target.Models, target.Supported = ModelListCompleted, models, supported
	target.UpdatedAt = time.Now()
}
// Fail records a failure for the request stored under id: the status becomes
// ModelListFailed, Error is replaced with errMsg, and UpdatedAt is refreshed.
// An unknown id is ignored.
func (s *ModelListStore) Fail(id string, errMsg string) {
	s.mu.Lock()
	defer s.mu.Unlock()

	target, found := s.requests[id]
	if !found {
		return
	}
	target.Status, target.Error = ModelListFailed, errMsg
	target.UpdatedAt = time.Now()
}
// ---------------------------------------------------------------------------
// Handlers
// ---------------------------------------------------------------------------
// InitiateListModels creates a pending model list request for the runtime
// named by the "runtimeId" URL parameter and responds with it as JSON (200).
// It is called by the frontend; the daemon picks the request up on its next
// heartbeat.
//
// Responses written directly by this handler:
//   - 404 "runtime not found" when the runtime lookup returns an error.
//   - 503 "runtime is offline" when the runtime's status is not "online".
//
// The handler also stops without creating a request when
// parseUUIDOrBadRequest or requireWorkspaceMember report failure; those
// helpers receive the ResponseWriter and presumably write the error response
// themselves (defined elsewhere — confirm).
func (h *Handler) InitiateListModels(w http.ResponseWriter, r *http.Request) {
	runtimeUUID, valid := parseUUIDOrBadRequest(w, chi.URLParam(r, "runtimeId"), "runtime_id")
	if !valid {
		return
	}

	agentRuntime, lookupErr := h.Queries.GetAgentRuntime(r.Context(), runtimeUUID)
	if lookupErr != nil {
		writeError(w, http.StatusNotFound, "runtime not found")
		return
	}

	// The membership check is passed the same "runtime not found" message
	// that is used for a failed lookup above.
	workspaceID := uuidToString(agentRuntime.WorkspaceID)
	if _, isMember := h.requireWorkspaceMember(w, r, workspaceID, "runtime not found"); !isMember {
		return
	}

	if agentRuntime.Status != "online" {
		writeError(w, http.StatusServiceUnavailable, "runtime is offline")
		return
	}

	// Key the request by the ID from the database row, not the raw URL text.
	pending := h.ModelListStore.Create(uuidToString(agentRuntime.ID))
	writeJSON(w, http.StatusOK, pending)
}
// GetModelListRequest responds with the current state of the model list
// request named by the "requestId" URL parameter: 200 with the request as
// JSON, or 404 "request not found" when the store has no such request.
//
// NOTE(review): no workspace-membership check is visible in this handler —
// confirm the route is guarded elsewhere.
func (h *Handler) GetModelListRequest(w http.ResponseWriter, r *http.Request) {
	if found := h.ModelListStore.Get(chi.URLParam(r, "requestId")); found != nil {
		writeJSON(w, http.StatusOK, found)
		return
	}
	writeError(w, http.StatusNotFound, "request not found")
}
// ReportModelListResult receives the outcome of a model list request from the
// daemon and records it in the store.
//
// The runtime named by the "runtimeId" URL parameter must pass
// requireDaemonRuntimeAccess. The JSON body carries status, models, supported
// and error; a body that fails to decode yields 400 "invalid request body".
// A status of "completed" records the models; any other status records a
// failure with the supplied error text. The response is 200 {"status":"ok"},
// including when the store has no request with the given ID.
//
// NOTE(review): the request ID is not checked against the runtime ID here —
// confirm whether a daemon should be limited to requests of its own runtime.
func (h *Handler) ReportModelListResult(w http.ResponseWriter, r *http.Request) {
	runtimeID := chi.URLParam(r, "runtimeId")
	if _, allowed := h.requireDaemonRuntimeAccess(w, r, runtimeID); !allowed {
		return
	}
	requestID := chi.URLParam(r, "requestId")

	type reportPayload struct {
		Status    string       `json:"status"` // "completed" or "failed"
		Models    []ModelEntry `json:"models"`
		Supported *bool        `json:"supported"`
		Error     string       `json:"error"`
	}
	var report reportPayload
	if decodeErr := json.NewDecoder(r.Body).Decode(&report); decodeErr != nil {
		writeError(w, http.StatusBadRequest, "invalid request body")
		return
	}

	switch report.Status {
	case "completed":
		// Older daemons may omit `supported`; treat a missing value as true
		// to keep the UI usable while they haven't been redeployed yet.
		supported := report.Supported == nil || *report.Supported
		h.ModelListStore.Complete(requestID, report.Models, supported)
	default:
		h.ModelListStore.Fail(requestID, report.Error)
	}

	slog.Debug("model list report", "runtime_id", runtimeID, "request_id", requestID, "status", report.Status, "count", len(report.Models))
	writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
}