Files
multica/server/internal/handler/cloud_runtime_test.go
LinYushen c968c13c87 feat(auth): support mcn_ Cloud Node PATs verified via Fleet (#3349)
* feat(auth): support mcn_ Cloud Node PATs verified via Fleet

Adds a new token kind, mcn_ (multica cloud node), recognized in both
the regular Auth and DaemonAuth middlewares. mcn_ tokens are minted
and owned by Multica Cloud (not the local personal_access_tokens
table); the server validates them by POSTing to the Fleet's
/api/v1/pat/verify endpoint and uses the returned owner_id as
X-User-ID for downstream handlers.

Cloud is the authoritative owner of token status, so this is a
verifier-only path with no DB fallback:

  * Fleet says valid:false -> 401 (token genuinely bad)
  * Fleet unreachable / 5xx -> 503 (transient, retry)
  * No MULTICA_CLOUD_FLEET_URL configured -> 401 (fail closed)

Verification results are cached in Redis for 60s under
mul:auth:mcn:<sha256> to bound the per-request load on Fleet without
extending the revocation window beyond what the Cloud doc allows.
Negative results are NOT cached, so a freshly minted token doesn't
get locked out by a stale 'token_not_found'.

Reuses MULTICA_CLOUD_FLEET_URL (the same env the cloud-runtime proxy
already uses) so deployments don't need a second config knob.

Tests cover the happy path, every documented invalid reason, 4xx/5xx
mapping, network error, decode error, ctx cancellation, the
fail-closed valid:true-without-owner_id case, trailing-slash URL
normalization, and the Redis cache short-circuit + negative
no-cache contract. Middleware tests pin the four 401/503/200 outcomes
in both Auth and DaemonAuth.

* auth(mcn): require owner_id to map to a real local user; drop X-User-PAT plumbing

Two related changes:

1. Cloud-verified owner_id is now checked against our local users table.
   The Cloud owner_id and our users.id share the same UUID space by
   contract; a missing local user means either the row was deleted
   under an active node or something is forging owner_ids — either
   way, fail closed.

   CloudPATVerifier.Verify takes a new OwnerLookupFunc:
     - returns (true, nil)   -> success, cache + return
     - returns (false, nil)  -> ErrCloudPATInvalid (reason='owner_unknown'),
                                NOT cached (so a freshly-created user
                                doesn't get locked out for a TTL window)
     - returns (_, error)    -> ErrCloudPATUnavailable (transient,
                                middleware emits 503)

   Both Auth and DaemonAuth wire ownerLookupFor(queries), a new shared
   helper that wraps queries.GetUser, mapping pgx.ErrNoRows / unparseable
   UUIDs to (false, nil) and other errors to a real Go error.

2. Removed all X-User-PAT plumbing. Cloud now mints node-scoped mcn_
   PATs itself during /api/v1/nodes (see multica-cloud
   docs/api/node-pat.md) and ships them into the EC2 instance via SSM,
   so multica-api no longer needs to forward the caller's mul_ PAT.
   Propagating a long-lived user PAT into a remote machine widened
   the blast radius of any node compromise; that's gone now.

   Removed:
     - cloud_runtime.go: withUserPAT option, cloudRuntimeUserPAT,
       generateCloudRuntimePAT, revokeGeneratedPAT
     - cloudruntime/Request.UserPAT field + X-User-PAT header
     - X-User-PAT from CORS allowed headers
     - obsolete handler tests:
         TestCreateCloudRuntimeNodeForwardsValidatedPAT
         TestCreateCloudRuntimeNodeRejectsUnownedPAT
         TestCreateCloudRuntimeNodeRejectsExpiredPAT
         TestCreateCloudRuntimeNodeAutoGeneratesPAT
       replaced with TestCreateCloudRuntimeNodeForwardsBody
     - X-User-PAT references in packages/core/api/client.test.ts

Tests:
  * 3 new verifier-level tests (owner_unknown not cached, lookup error
    -> Unavailable, success path is cached for both fleet AND lookup)
  * 5 new owner_lookup_test.go tests (nil queries, existing user,
    missing user, malformed UUID, DB error)
  * 1 new end-to-end DaemonAuth test (cloud says valid, no local user
    -> 401)
  * Existing X-User-PAT TS assertions removed; full vitest run passes.
  * go test ./... and go vet ./... clean on the server module.
2026-05-27 14:52:03 +08:00

204 lines
5.6 KiB
Go

package handler
import (
"bytes"
"context"
"net/http"
"net/http/httptest"
"strings"
"testing"
"github.com/multica-ai/multica/server/internal/cloudruntime"
)
// fakeCloudRuntimeProxy is a hand-rolled test double that tests install
// on the handler in place of the real cloud runtime proxy. A test
// preloads the canned outcome, runs a handler, and then inspects what
// Do recorded.
type fakeCloudRuntimeProxy struct {
	// Canned behaviour, configured by the test before the handler runs.
	enabled bool                   // value reported by Enabled
	resp    *cloudruntime.Response // returned by Do when err is nil
	err     error                  // when non-nil, Do returns it instead of resp

	// Observations written by Do.
	called bool                 // set to true on every Do invocation
	req    cloudruntime.Request // copy of the request most recently passed to Do
}
// Enabled reports the canned enabled flag the test configured on the fake.
func (f *fakeCloudRuntimeProxy) Enabled() bool { return f.enabled }
// Do records that the proxy was invoked together with the request it was
// given, then replies with the canned outcome: the configured error when
// one is set (with a nil response), otherwise the configured response.
// The context is accepted but not consulted.
func (f *fakeCloudRuntimeProxy) Do(ctx context.Context, req cloudruntime.Request) (*cloudruntime.Response, error) {
	f.called, f.req = true, req
	if f.err == nil {
		return f.resp, nil
	}
	return nil, f.err
}
// useCloudRuntimeProxy installs proxy as testHandler.CloudRuntime for the
// calling test and registers a cleanup that puts the previously installed
// value back once the test finishes.
func useCloudRuntimeProxy(t *testing.T, proxy cloudRuntimeProxy) {
	t.Helper()

	original := testHandler.CloudRuntime
	// Register the restore first; it only runs at test end, so the order
	// relative to the assignment below does not matter.
	t.Cleanup(func() {
		testHandler.CloudRuntime = original
	})
	testHandler.CloudRuntime = proxy
}
// TestCreateCloudRuntimeNodeForwardsBody is the post-MUL-2671 happy
// path for CreateCloudRuntimeNode: the handler no longer reads, asks
// for, or auto-generates an mul_ PAT — Cloud now mints its own
// node-scoped mcn_ PAT during /api/v1/nodes and ships it to the EC2
// instance via SSM. Multica-api just forwards the request body and
// the caller's user_id; there is no PAT plumbing on this endpoint.
func TestCreateCloudRuntimeNodeForwardsBody(t *testing.T) {
proxy := &fakeCloudRuntimeProxy{
enabled: true,
resp: &cloudruntime.Response{
StatusCode: http.StatusCreated,
Header: http.Header{"X-Request-Id": []string{"fleet-request-id"}},
Body: []byte(`{"status":"launching"}`),
},
}
useCloudRuntimeProxy(t, proxy)
req := newRequest(http.MethodPost, "/api/cloud-runtime/nodes", map[string]any{
"instance_type": "g5.xlarge",
})
req.Header.Set("X-Request-ID", "api-request-id")
w := httptest.NewRecorder()
testHandler.CreateCloudRuntimeNode(w, req)
if w.Code != http.StatusCreated {
t.Fatalf("status = %d, body = %s", w.Code, w.Body.String())
}
if !proxy.called {
t.Fatal("cloud runtime proxy was not called")
}
if proxy.req.Method != http.MethodPost || proxy.req.Path != "/api/v1/nodes" {
t.Fatalf("proxied request = %s %s", proxy.req.Method, proxy.req.Path)
}
if proxy.req.UserID != testUserID {
t.Fatalf("proxied user id = %q", proxy.req.UserID)
}
if proxy.req.RequestID != "api-request-id" {
t.Fatalf("proxied request id = %q", proxy.req.RequestID)
}
if got := w.Header().Get("X-Request-ID"); got != "fleet-request-id" {
t.Fatalf("response request id = %q", got)
}
}
func TestCloudRuntimeDisabledReturnsUnavailable(t *testing.T) {
useCloudRuntimeProxy(t, &fakeCloudRuntimeProxy{enabled: false})
req := newRequest(http.MethodGet, "/api/cloud-runtime/nodes", nil)
w := httptest.NewRecorder()
testHandler.ListCloudRuntimeNodes(w, req)
if w.Code != http.StatusServiceUnavailable {
t.Fatalf("status = %d, body = %s", w.Code, w.Body.String())
}
}
func TestListCloudRuntimeNodesForwardsQuery(t *testing.T) {
proxy := &fakeCloudRuntimeProxy{
enabled: true,
resp: &cloudruntime.Response{
StatusCode: http.StatusOK,
Body: []byte(`[]`),
},
}
useCloudRuntimeProxy(t, proxy)
req := newRequest(http.MethodGet, "/api/cloud-runtime/nodes?limit=10&offset=20", nil)
w := httptest.NewRecorder()
testHandler.ListCloudRuntimeNodes(w, req)
if w.Code != http.StatusOK {
t.Fatalf("status = %d, body = %s", w.Code, w.Body.String())
}
if !proxy.called {
t.Fatal("cloud runtime proxy was not called")
}
if proxy.req.Method != http.MethodGet || proxy.req.Path != "/api/v1/nodes" {
t.Fatalf("proxied request = %s %s", proxy.req.Method, proxy.req.Path)
}
if got := proxy.req.Query.Encode(); got != "limit=10&offset=20" {
t.Fatalf("proxied query = %q", got)
}
}
func TestCloudRuntimeNonJSONResponseIsWrapped(t *testing.T) {
proxy := &fakeCloudRuntimeProxy{
enabled: true,
resp: &cloudruntime.Response{
StatusCode: http.StatusBadGateway,
Body: []byte("fleet failed\n"),
},
}
useCloudRuntimeProxy(t, proxy)
req := newRequest(http.MethodGet, "/api/cloud-runtime/healthz", nil)
w := httptest.NewRecorder()
testHandler.GetCloudRuntimeHealth(w, req)
if w.Code != http.StatusBadGateway {
t.Fatalf("status = %d, body = %s", w.Code, w.Body.String())
}
if ct := w.Header().Get("Content-Type"); ct != "application/json" {
t.Fatalf("content type = %q", ct)
}
if got := w.Body.String(); !strings.Contains(got, `"error":"fleet failed"`) {
t.Fatalf("body = %s", got)
}
}
func TestCloudRuntimeEmptyResponseKeepsStatus(t *testing.T) {
proxy := &fakeCloudRuntimeProxy{
enabled: true,
resp: &cloudruntime.Response{
StatusCode: http.StatusNoContent,
Body: nil,
},
}
useCloudRuntimeProxy(t, proxy)
req := newRequest(http.MethodGet, "/api/cloud-runtime/healthz", nil)
w := httptest.NewRecorder()
testHandler.GetCloudRuntimeHealth(w, req)
if w.Code != http.StatusNoContent {
t.Fatalf("status = %d, body = %s", w.Code, w.Body.String())
}
if body := w.Body.String(); body != "" {
t.Fatalf("body = %s", body)
}
}
func TestCreateCloudRuntimeNodeRejectsLargeBody(t *testing.T) {
proxy := &fakeCloudRuntimeProxy{
enabled: true,
resp: &cloudruntime.Response{
StatusCode: http.StatusCreated,
Body: []byte(`{"status":"launching"}`),
},
}
useCloudRuntimeProxy(t, proxy)
body := bytes.NewReader(bytes.Repeat([]byte("a"), maxCloudRuntimeRequestBodySize+1))
req := httptest.NewRequest(http.MethodPost, "/api/cloud-runtime/nodes", body)
req.Header.Set("Content-Type", "application/json")
req.Header.Set("X-User-ID", testUserID)
w := httptest.NewRecorder()
testHandler.CreateCloudRuntimeNode(w, req)
if w.Code != http.StatusRequestEntityTooLarge {
t.Fatalf("status = %d, body = %s", w.Code, w.Body.String())
}
if proxy.called {
t.Fatal("cloud runtime proxy should not be called")
}
}