mirror of
https://github.com/multica-ai/multica.git
synced 2026-07-05 21:39:54 +02:00
* fix(server): persist ModelListStore across replicas via Redis The model picker uses a pending-request pattern: the frontend POSTs to create a request, the daemon pops it on its next heartbeat, runs agent.ListModels locally, and reports back. Until now the store was a plain in-memory map per Handler instance. That works for self-hosted single-instance deploys but fails in any multi-replica environment (Multica Cloud). Each replica has its own map, so: POST /runtimes/:id/models → request stored in replica A GET /runtimes/:id/models/<requestId> → polls land on B/C → 404 daemon heartbeat → only A sees PendingModelList POST .../<requestId>/result → daemon's report has to land on A Success probability ~1/N². The visible symptom is "No models available" in the picker for every provider, even those (Claude/Codex) whose catalog is statically populated end-to-end. Same shape of bug, same Redis-backed fix as multica-ai/multica#1557 did for LocalSkillListStore / LocalSkillImportStore. Reuse the operational playbook (namespaced keys, ZSET-backed pending queue, atomic ZREM+SET-running via the shared Lua script) so we don't introduce a second concurrency model for the same primitive. Changes: - Convert ModelListStore from struct to interface with context-aware methods. Add HasPending for cheap heartbeat-side probing. - InMemoryModelListStore — single-node fallback, used when REDIS_URL is unset (self-hosted dev / tests). - RedisModelListStore — multi-node implementation using the same key layout and Lua atomic claim as RedisLocalSkillListStore. - Use RunStartedAt (not UpdatedAt) as the running-timeout reference point, matching the local-skill stores so subsequent UpdatedAt bumps don't reset the running clock. - Heartbeat now uses the probe-then-pop pattern for the model queue (matching local-skills) so a slow Redis can't stall every connected daemon. Extends heartbeatMetrics + slow-log with probe_model_ms / pop_model_ms / probe_model_timed_out for parity. - Wire the Redis backend in NewRouterWithOptions when rdb != nil. - Tests for both backends. Redis tests gate on REDIS_TEST_URL so laptop runs without Redis still pass; CI provides it. Co-authored-by: multica-agent <github@multica.ai> * fix(server): persist RunStartedAt + retry model report on transient failures Two follow-ups from PR #2022 review: 1. RedisModelListStore was dropping ModelListRequest.RunStartedAt on persistence — the field is tagged json:"-" so it doesn't leak into the HTTP response, which made plain json.Marshal(req) silently discard it. Across-node readers saw RunStartedAt=nil and applyModelListTimeout's running branch became a no-op, so the 60s running-timeout escape hatch never fired. CI's TestRedisModelListStore_RunningTimeout was failing on this exact case. Fix mirrors RedisLocalSkillImportStore's envelope pattern — wrap in an internal struct that re-promotes the field. HTTP shape stays clean. Adds a no-Redis unit test that pins the round trip. 2. Daemon's handleModelList called d.client.ReportModelListResult directly and swallowed any 5xx, leaving the pending request stranded in "running" until its 60s server-side timeout — exactly the failure mode the multi-node store fix was meant to eliminate. Generalize the existing local-skill retry helper into reportRuntimeResultWithRetry (kind: model_list / local_skill_list / local_skill_import) and wire handleModelList through a new reportModelListResult helper. Renames the test-overridable var localSkillReportBackoffs → runtimeReportBackoffs to match. Co-authored-by: multica-agent <github@multica.ai> --------- Co-authored-by: multica-agent <github@multica.ai>
199 lines
6.7 KiB
Go
199 lines
6.7 KiB
Go
package handler
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
// TestModelListStore_RunningRequestTimesOut pins the escape hatch for
|
|
// requests that were claimed (PopPending → Running) but whose result was
|
|
// never reported — usually because the heartbeat response carrying the
|
|
// `pending_model_list` field was lost in transit. Before this, the only
|
|
// way out of Running was the 2-minute memory GC, which exceeded the UI
|
|
// polling window and surfaced as a silent "discovery failed" (MUL-1397).
|
|
func TestModelListStore_RunningRequestTimesOut(t *testing.T) {
|
|
ctx := context.Background()
|
|
store := NewInMemoryModelListStore()
|
|
req, err := store.Create(ctx, "runtime-xyz")
|
|
if err != nil {
|
|
t.Fatalf("create: %v", err)
|
|
}
|
|
claimed, err := store.PopPending(ctx, "runtime-xyz")
|
|
if err != nil {
|
|
t.Fatalf("pop: %v", err)
|
|
}
|
|
if claimed == nil {
|
|
t.Fatal("expected PopPending to claim the pending request")
|
|
}
|
|
if claimed.Status != ModelListRunning {
|
|
t.Fatalf("expected Running after PopPending, got %s", claimed.Status)
|
|
}
|
|
if claimed.RunStartedAt == nil {
|
|
t.Fatal("expected RunStartedAt to be set on PopPending")
|
|
}
|
|
|
|
// Age the running record past the threshold without the daemon ever
|
|
// reporting a result. Get() must flip it to Timeout so the UI can
|
|
// terminate polling instead of waiting for the retention sweep.
|
|
aged := time.Now().Add(-(modelListRunningTimeout + time.Second))
|
|
claimed.RunStartedAt = &aged
|
|
got, err := store.Get(ctx, req.ID)
|
|
if err != nil {
|
|
t.Fatalf("get: %v", err)
|
|
}
|
|
if got == nil {
|
|
t.Fatal("expected stored request")
|
|
}
|
|
if got.Status != ModelListTimeout {
|
|
t.Fatalf("expected Timeout after running threshold, got %s", got.Status)
|
|
}
|
|
if got.Error == "" {
|
|
t.Fatal("expected timeout error message")
|
|
}
|
|
}
|
|
|
|
// TestReportModelListResult_PreservesDefault guards the daemon → server
|
|
// → UI wire format for the model-discovery result. The `default` bool
|
|
// on each ModelEntry lights up the UI's "default" badge; if it gets
|
|
// dropped here (e.g. by going through a map[string]string), the badge
|
|
// silently disappears.
|
|
func TestReportModelListResult_PreservesDefault(t *testing.T) {
|
|
ctx := context.Background()
|
|
store := NewInMemoryModelListStore()
|
|
req, err := store.Create(ctx, "runtime-xyz")
|
|
if err != nil {
|
|
t.Fatalf("create: %v", err)
|
|
}
|
|
|
|
// Report a completed result with one default entry and one not.
|
|
body := map[string]any{
|
|
"status": "completed",
|
|
"supported": true,
|
|
"models": []map[string]any{
|
|
{"id": "foo-default", "label": "Foo", "provider": "p", "default": true},
|
|
{"id": "bar", "label": "Bar", "provider": "p"},
|
|
},
|
|
}
|
|
raw, _ := json.Marshal(body)
|
|
|
|
// Use the store's Complete directly — we're verifying the wire
|
|
// shape, not HTTP auth. The handler itself unmarshals into
|
|
// []ModelEntry and forwards verbatim, which is the path we care
|
|
// about here.
|
|
var parsed struct {
|
|
Models []ModelEntry `json:"models"`
|
|
}
|
|
if err := json.Unmarshal(raw, &parsed); err != nil {
|
|
t.Fatalf("unmarshal report body: %v", err)
|
|
}
|
|
if err := store.Complete(ctx, req.ID, parsed.Models, true); err != nil {
|
|
t.Fatalf("complete: %v", err)
|
|
}
|
|
|
|
got, err := store.Get(ctx, req.ID)
|
|
if err != nil {
|
|
t.Fatalf("get: %v", err)
|
|
}
|
|
if got == nil {
|
|
t.Fatal("expected stored result")
|
|
}
|
|
if len(got.Models) != 2 {
|
|
t.Fatalf("expected 2 models, got %d: %+v", len(got.Models), got.Models)
|
|
}
|
|
if !got.Models[0].Default {
|
|
t.Errorf("first model should carry Default=true, got %+v", got.Models[0])
|
|
}
|
|
if got.Models[1].Default {
|
|
t.Errorf("second model should carry Default=false, got %+v", got.Models[1])
|
|
}
|
|
|
|
// Serialise the stored request back out (what UI actually sees)
|
|
// and confirm `default: true` survives.
|
|
out, _ := json.Marshal(got)
|
|
if !bytes.Contains(out, []byte(`"default":true`)) {
|
|
t.Errorf(`expected "default":true in JSON response, got: %s`, out)
|
|
}
|
|
}
|
|
|
|
// TestReportModelListResult_DecodesJSONBodyDefault verifies the
|
|
// handler's request-body parsing accepts the `default` bool from
|
|
// the daemon POST — not just through the store API.
|
|
func TestReportModelListResult_DecodesJSONBodyDefault(t *testing.T) {
|
|
// Simulate the shape the daemon POSTs: status + models + supported
|
|
// with `default` on one entry.
|
|
payload := `{"status":"completed","supported":true,"models":[{"id":"a","label":"A","default":true},{"id":"b","label":"B"}]}`
|
|
r := httptest.NewRequest(http.MethodPost, "/api/daemon/runtimes/rt/models/req/result", bytes.NewBufferString(payload))
|
|
|
|
var body struct {
|
|
Status string `json:"status"`
|
|
Models []ModelEntry `json:"models"`
|
|
Supported *bool `json:"supported"`
|
|
}
|
|
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
|
t.Fatalf("decode: %v", err)
|
|
}
|
|
if len(body.Models) != 2 {
|
|
t.Fatalf("want 2 models, got %d", len(body.Models))
|
|
}
|
|
if !body.Models[0].Default {
|
|
t.Errorf("default flag lost on model[0]: %+v", body.Models[0])
|
|
}
|
|
}
|
|
|
|
// TestInMemoryModelListStore_HasPending pins the cheap probe used by the
|
|
// heartbeat hot path. Empty queue → false; pending request → true; after
|
|
// PopPending claims the record → false again.
|
|
func TestInMemoryModelListStore_HasPending(t *testing.T) {
|
|
ctx := context.Background()
|
|
store := NewInMemoryModelListStore()
|
|
|
|
if has, err := store.HasPending(ctx, "rt-1"); err != nil || has {
|
|
t.Fatalf("empty store should not report pending: has=%v err=%v", has, err)
|
|
}
|
|
|
|
if _, err := store.Create(ctx, "rt-1"); err != nil {
|
|
t.Fatalf("create: %v", err)
|
|
}
|
|
if has, err := store.HasPending(ctx, "rt-1"); err != nil || !has {
|
|
t.Fatalf("expected pending=true after Create: has=%v err=%v", has, err)
|
|
}
|
|
// Other runtimes don't see this runtime's queue.
|
|
if has, err := store.HasPending(ctx, "rt-2"); err != nil || has {
|
|
t.Fatalf("expected pending=false for unrelated runtime: has=%v err=%v", has, err)
|
|
}
|
|
|
|
if _, err := store.PopPending(ctx, "rt-1"); err != nil {
|
|
t.Fatalf("pop: %v", err)
|
|
}
|
|
if has, err := store.HasPending(ctx, "rt-1"); err != nil || has {
|
|
t.Fatalf("expected pending=false after PopPending: has=%v err=%v", has, err)
|
|
}
|
|
}
|
|
|
|
// TestInMemoryModelListStore_PopPendingPicksOldest documents the FIFO
|
|
// ordering so a daemon that handles one request per heartbeat doesn't
|
|
// starve early queue entries.
|
|
func TestInMemoryModelListStore_PopPendingPicksOldest(t *testing.T) {
|
|
ctx := context.Background()
|
|
store := NewInMemoryModelListStore()
|
|
|
|
first, _ := store.Create(ctx, "rt-1")
|
|
// Force a measurable gap so the FIFO comparison isn't on equal
|
|
// CreatedAt values (possible on platforms with coarse clocks).
|
|
time.Sleep(2 * time.Millisecond)
|
|
second, _ := store.Create(ctx, "rt-1")
|
|
|
|
got, err := store.PopPending(ctx, "rt-1")
|
|
if err != nil {
|
|
t.Fatalf("pop: %v", err)
|
|
}
|
|
if got == nil || got.ID != first.ID {
|
|
t.Fatalf("expected first request, got %+v (second was %s)", got, second.ID)
|
|
}
|
|
}
|