Compare commits

...

1 Commits

Author SHA1 Message Date
Jiang Bohan
9a43fcf87e fix(daemon): migrate existing .local daemon_ids and normalize on register
PR #1070 stripped `.local` from the daemon-side hostname so CLI and
desktop daemons stop registering as separate devices on macOS, but it
left two gaps:

1. Existing rows in `agent_runtime` for every macOS user still carry
   the `.local` suffix. After upgrade the daemon registers under the
   new canonical `daemon_id`, finds no match on the `(workspace_id,
   daemon_id, provider)` unique key, and INSERTs a fresh row. The
   `agent.runtime_id` FK keeps pointing at the old, now-orphaned
   `.local` row, which never receives a heartbeat again — the user's
   agent appears offline until they manually rebind it.
2. Older or non-CLI clients (anyone calling /api/daemon/register
   directly) can still send the suffixed form and create the same
   orphan condition going forward.

Fixes:

- Migration 048 walks `agent_runtime` and, for every (workspace_id,
  provider) where both `X` and `X.local` rows exist, redirects the
  `agent` and `agent_task_queue` FK references from the `.local` row
  to the canonical row inside a single statement (so the RESTRICT
  constraint passes), then deletes the duplicate. Orphaned `.local`
  rows with no canonical counterpart are renamed in place.
- Handler-side `normalizeDaemonID` strips the suffix on every
  /api/daemon/register call before the upsert, so stale clients can't
  re-create the orphan.

Tests cover the normalization helper directly and exercise the
register endpoint twice — once with `.local` and once canonical — to
prove both forms upsert into the same row.

Refs: MUL-971
2026-04-17 00:32:24 +08:00
4 changed files with 150 additions and 1 deletion

View File

@@ -135,6 +135,14 @@ type daemonWorkspaceReposResponse struct {
ReposVersion string `json:"repos_version"`
}
// normalizeDaemonID returns id without a trailing `.local`, the mDNS suffix
// that macOS hostnames sometimes carry. At most one trailing occurrence is
// removed; an id that does not end in the suffix is returned unchanged.
// The daemon applies the same normalization to its own hostname (see
// server/internal/daemon/config.go); repeating it on the server covers
// pre-fix CLI versions and any non-CLI caller.
func normalizeDaemonID(id string) string {
	const mdnsSuffix = ".local"
	if !strings.HasSuffix(id, mdnsSuffix) {
		return id
	}
	return id[:len(id)-len(mdnsSuffix)]
}
func normalizeWorkspaceRepos(repos []RepoData) []RepoData {
if len(repos) == 0 {
return []RepoData{}
@@ -201,7 +209,7 @@ func (h *Handler) DaemonRegister(w http.ResponseWriter, r *http.Request) {
}
req.WorkspaceID = strings.TrimSpace(req.WorkspaceID)
req.DaemonID = strings.TrimSpace(req.DaemonID)
req.DaemonID = normalizeDaemonID(strings.TrimSpace(req.DaemonID))
req.DeviceName = strings.TrimSpace(req.DeviceName)
if req.DaemonID == "" {

View File

@@ -644,3 +644,81 @@ func TestGetDaemonWorkspaceRepos_VersionIgnoresOrderAndDescription(t *testing.T)
t.Fatalf("expected repos_version to change when URL set changes, got %s", version3)
}
}
// TestNormalizeDaemonID checks normalizeDaemonID against a fixed table:
// ids ending in `.local` lose exactly that suffix, every other id
// (including the empty string) is returned unchanged.
func TestNormalizeDaemonID(t *testing.T) {
	type example struct {
		input    string
		expected string
	}
	table := []example{
		{input: "MacBook-Air.local", expected: "MacBook-Air"},
		{input: "MacBook-Air", expected: "MacBook-Air"},
		{input: "some.hostname.local", expected: "some.hostname"},
		{input: "local-machine", expected: "local-machine"},
		{input: ".local", expected: ""},
		{input: "", expected: ""},
	}
	for i := range table {
		ex := table[i]
		actual := normalizeDaemonID(ex.input)
		if actual == ex.expected {
			continue
		}
		t.Errorf("normalizeDaemonID(%q) = %q, want %q", ex.input, actual, ex.expected)
	}
}
// TestDaemonRegister_NormalizesLocalSuffix registers the same daemon twice
// through /api/daemon/register — first as "test-mac.local", then as
// "test-mac" — and verifies that both calls resolve to a single
// agent_runtime row stored under the suffix-free daemon_id.
func TestDaemonRegister_NormalizesLocalSuffix(t *testing.T) {
	if testHandler == nil {
		t.Skip("database not available")
	}

	const daemonRaw = "test-mac.local"
	const daemonNorm = "test-mac"

	ctx := context.Background()

	// Schedule cleanup before the first register call so the rows are removed
	// even when an assertion below aborts the test early.
	t.Cleanup(func() {
		if _, err := testPool.Exec(ctx,
			`DELETE FROM agent_runtime WHERE daemon_id IN ($1, $2)`, daemonRaw, daemonNorm); err != nil {
			t.Logf("cleanup agent_runtime rows: %v", err)
		}
	})

	// register posts a single runtime for daemonID and returns the id of the
	// first runtime in the response. Each call decodes into a fresh value so
	// a stale entry from a previous response can never satisfy the id
	// comparison, and a malformed response fails the test instead of
	// panicking on a type assertion.
	register := func(label, daemonID string) string {
		t.Helper()

		w := httptest.NewRecorder()
		req := newDaemonTokenRequest("POST", "/api/daemon/register", map[string]any{
			"workspace_id": testWorkspaceID,
			"daemon_id":    daemonID,
			"device_name":  "test-mac",
			"runtimes": []map[string]any{
				{"name": "claude-norm", "type": "claude", "version": "1.0.0", "status": "online"},
			},
		}, testWorkspaceID, daemonID)
		testHandler.DaemonRegister(w, req)
		if w.Code != http.StatusOK {
			t.Fatalf("%s register: expected 200, got %d: %s", label, w.Code, w.Body.String())
		}

		body := w.Body.String()
		var resp struct {
			Runtimes []struct {
				ID string `json:"id"`
			} `json:"runtimes"`
		}
		if err := json.Unmarshal([]byte(body), &resp); err != nil {
			t.Fatalf("%s register: decode response: %v (body: %s)", label, err, body)
		}
		if len(resp.Runtimes) == 0 || resp.Runtimes[0].ID == "" {
			t.Fatalf("%s register: response carries no runtime id: %s", label, body)
		}
		return resp.Runtimes[0].ID
	}

	// First register with the .local suffix — the server should strip it
	// before upserting, so the row lands under daemonNorm.
	firstID := register("first", daemonRaw)

	var stored string
	if err := testPool.QueryRow(ctx,
		`SELECT daemon_id FROM agent_runtime WHERE id = $1`, firstID).Scan(&stored); err != nil {
		t.Fatalf("read first runtime row: %v", err)
	}
	if stored != daemonNorm {
		t.Fatalf("first register: expected stored daemon_id %q, got %q", daemonNorm, stored)
	}

	// Second register with the canonical (no-suffix) form must hit the same
	// row via the (workspace_id, daemon_id, provider) upsert key — proving
	// the .local-form caller and the canonical-form caller share an identity.
	secondID := register("second", daemonNorm)
	if secondID != firstID {
		t.Fatalf("second register: expected same runtime id %q (upsert), got %q (insert) — .local suffix not normalized", firstID, secondID)
	}
}

View File

@@ -0,0 +1,5 @@
-- Cannot reliably restore the `.local` suffix: the migration discarded
-- which rows originally had it, and post-merge data may legitimately
-- belong to a daemon that always reported `X` (not `X.local`). The only
-- correct rollback path is to also revert PR #1070, after which existing
-- daemons will start re-registering under their original suffixed names.

View File

@@ -0,0 +1,58 @@
-- Normalize daemon_id by stripping the trailing `.local` mDNS suffix.
--
-- Daemons started via different methods on macOS used to register with
-- inconsistent hostnames: standalone CLI got `MacBook-Air` while the
-- desktop-bundled binary got `MacBook-Air.local` (or vice versa). PR #1070
-- (commit 6428a100) fixed the daemon side by stripping `.local` at hostname
-- resolution time, but did not address existing rows.
--
-- Without this migration, every macOS user upgrading past 6428a100 will
-- have all of their `agent_runtime` rows inserted again under the new
-- canonical `daemon_id`, leaving the old rows orphaned and the agents
-- (which reference `agent_runtime.id` via FK) pointing at runtimes that
-- no longer receive heartbeats.
--
-- Strategy:
-- 1. For every (workspace_id, provider) where both `X` and `X.local`
-- exist, keep `X` as the canonical row and redirect both
-- `agent.runtime_id` and `agent_task_queue.runtime_id` from the
-- `.local` row to the canonical row, then delete the duplicate.
-- 2. For any remaining rows that still end in `.local` (no canonical
-- counterpart), strip the suffix in place.
--
-- Note: `TRIM(TRAILING '.local' FROM ...)` is unsafe because TRIM treats
-- its argument as a character set, not a substring; we use a substring
-- expression on the LIKE-matched rows instead.
-- Step 1: merge every `.local` runtime row into its canonical twin. The FK
-- redirects and the delete are deliberately kept in ONE statement.
-- NOTE(review): presumably this is so the foreign keys on runtime_id no
-- longer see references to the deleted rows when they are checked — verify
-- against the FK definitions on agent and agent_task_queue.
-- NOTE(review): the final DELETE reads only `pairs`; this relies on
-- PostgreSQL running data-modifying CTEs to completion even when their
-- output is never read — confirm the target database is PostgreSQL.
WITH pairs AS (
-- Self-join of agent_runtime: a row whose daemon_id equals another row's
-- daemon_id plus '.local', within the same workspace_id and provider, is
-- the duplicate (dup_id); the suffix-free row is the one kept (keep_id).
SELECT
canonical.id AS keep_id,
dot_local.id AS dup_id
FROM agent_runtime canonical
INNER JOIN agent_runtime dot_local
ON canonical.workspace_id = dot_local.workspace_id
AND canonical.provider = dot_local.provider
AND dot_local.daemon_id = canonical.daemon_id || '.local'
),
agent_redirect AS (
-- Repoint agents from the duplicate runtime to the kept runtime.
UPDATE agent
SET runtime_id = pairs.keep_id
FROM pairs
WHERE agent.runtime_id = pairs.dup_id
RETURNING agent.id
),
queue_redirect AS (
-- Repoint queued tasks from the duplicate runtime to the kept runtime.
UPDATE agent_task_queue
SET runtime_id = pairs.keep_id
FROM pairs
WHERE agent_task_queue.runtime_id = pairs.dup_id
RETURNING agent_task_queue.id
)
-- Remove the duplicate `.local` rows identified in `pairs`.
DELETE FROM agent_runtime
WHERE id IN (SELECT dup_id FROM pairs);
-- Rename in place: every agent_runtime row whose daemon_id still ends in
-- `.local` has exactly that trailing suffix cut off, and its updated_at is
-- refreshed. The suffix length is computed rather than hard-coded, and a
-- substring cut is used instead of TRIM so only the literal suffix goes.
UPDATE agent_runtime
SET
    updated_at = now(),
    daemon_id = substr(daemon_id, 1, char_length(daemon_id) - char_length('.local'))
WHERE daemon_id LIKE '%.local';