Compare commits

...

1 Commits

Author SHA1 Message Date
J
2688f18aa4 fix(daemon): bound runtime --version probe so one wedged CLI can't stall startup
A Homebrew-installed claude whose bun shim never returns makes
`claude --version` hang forever. Runtime registration probes every
configured agent sequentially on the daemon startup critical path with no
timeout, so that one wedged probe blocks the loop, no runtime ever
registers, the daemon never flips /health from "starting" to "running",
and the desktop is stuck on "starting" with every runtime offline.

Bound each probe with a 10s timeout plus a WaitDelay that force-closes the
stdout pipe (a wedged claude can fork a bun child that inherits and holds
the pipe open, so killing claude alone would not unblock cmd.Output()). A
timed-out probe now returns an error, the loop skips just the broken
runtime, and every healthy runtime still comes online.

MUL-3812

Co-authored-by: multica-agent <github@multica.ai>
2026-06-29 01:12:29 +08:00
2 changed files with 87 additions and 0 deletions

View File

@@ -804,8 +804,31 @@ func writeMcpConfigToTemp(raw json.RawMessage) (string, error) {
return f.Name(), nil
}
// versionDetectTimeout is the upper limit on how long one `<cli> --version`
// probe is allowed to take. Healthy CLIs reply in far less than a second, so a
// probe still running after this long is treated as wedged — for example a
// Homebrew-installed `claude` whose bun shim never returns (MUL-3812).
//
// The limit exists because registerRuntimesForWorkspace probes the configured
// agents one after another, on the daemon's startup critical path, before
// /health changes from "starting" to "running". An unbounded probe against a
// wedged CLI would stall that loop indefinitely: the remaining runtimes would
// never register, the daemon would never report ready, and the desktop would
// stay on "starting". Bounded, the wedged probe fails with a deadline error,
// the loop skips only that runtime, and the healthy ones still come online.
//
// Declared as a var rather than a const only so tests can shorten it; nothing
// else mutates it.
var versionDetectTimeout = 10 * time.Second
func detectCLIVersion(ctx context.Context, execPath string) (string, error) {
ctx, cancel := context.WithTimeout(ctx, versionDetectTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, execPath, "--version")
// A wedged CLI may fork a child (e.g. the bun runtime behind `claude`) that
// inherits and keeps the stdout pipe open. Killing the CLI on context
// cancellation would not unblock cmd.Output(), which waits for that pipe to
// close. WaitDelay force-closes the pipes shortly after the context fires so
// the probe always returns instead of hanging on the surviving grandchild.
cmd.WaitDelay = 2 * time.Second
hideAgentWindow(cmd)
data, err := cmd.Output()
if err != nil {

View File

@@ -1,8 +1,13 @@
package agent
import (
"context"
"errors"
"os"
"path/filepath"
"runtime"
"testing"
"time"
)
func TestParseSemver(t *testing.T) {
@@ -138,6 +143,65 @@ func TestExtractVersionLine(t *testing.T) {
}
}
// TestDetectCLIVersionHealthy checks the happy path: probing a CLI that
// promptly prints a version line yields that line and no error.
func TestDetectCLIVersionHealthy(t *testing.T) {
	if runtime.GOOS == "windows" {
		t.Skip("shell-script fixture is POSIX-only")
	}

	const want = "2.1.5 (Claude Code)"

	// Stand-in CLI: a shell script that echoes a version line and exits.
	fixture := filepath.Join(t.TempDir(), "fakecli")
	body := []byte("#!/bin/sh\necho '2.1.5 (Claude Code)'\n")
	if writeErr := os.WriteFile(fixture, body, 0o755); writeErr != nil {
		t.Fatal(writeErr)
	}

	got, err := detectCLIVersion(context.Background(), fixture)
	switch {
	case err != nil:
		t.Fatalf("detectCLIVersion() error = %v", err)
	case got != want:
		t.Errorf("detectCLIVersion() = %q, want %q", got, want)
	}
}
// TestDetectCLIVersionTimesOutOnWedgedCLI is the regression test for MUL-3812:
// a `--version` probe that never finishes must not block its caller. The probe
// is handed an unbounded parent context (which is what
// registerRuntimesForWorkspace passes) and is still expected to bound itself
// and return an error, so the sequential runtime-registration loop can skip
// the wedged CLI and bring the healthy runtimes online instead of leaving the
// desktop stuck on "starting".
func TestDetectCLIVersionTimesOutOnWedgedCLI(t *testing.T) {
	if runtime.GOOS == "windows" {
		t.Skip("shell-script fixture is POSIX-only")
	}

	// Fixture: a script that sleeps far longer than the probe timeout and never
	// prints a version, modelling a Homebrew/bun `claude` whose `--version`
	// hangs. When the shell runs `sleep` as a child process, that child
	// inherits the stdout pipe and can outlive a kill of the shell, which
	// exercises the WaitDelay path that force-closes the pipe.
	fixture := filepath.Join(t.TempDir(), "wedgedcli")
	body := []byte("#!/bin/sh\nsleep 60\n")
	if writeErr := os.WriteFile(fixture, body, 0o755); writeErr != nil {
		t.Fatal(writeErr)
	}

	// Shrink the probe timeout for the duration of this test only.
	saved := versionDetectTimeout
	versionDetectTimeout = 200 * time.Millisecond
	t.Cleanup(func() { versionDetectTimeout = saved })

	// Run the probe off the test goroutine so a regression (a probe that never
	// returns) surfaces as a test failure rather than a hung test binary. The
	// channel is buffered so the goroutine can always deliver and exit.
	result := make(chan error, 1)
	began := time.Now()
	go func() {
		_, probeErr := detectCLIVersion(context.Background(), fixture)
		result <- probeErr
	}()

	watchdog := time.NewTimer(30 * time.Second)
	defer watchdog.Stop()

	select {
	case <-watchdog.C:
		t.Fatal("detectCLIVersion() did not return; a wedged CLI blocked the probe (regression of MUL-3812)")
	case probeErr := <-result:
		if probeErr == nil {
			t.Fatal("detectCLIVersion() returned nil for a wedged CLI; want a timeout error")
		}
		// Generous ceiling: the probe should come back shortly after the
		// shortened timeout, not anywhere near the fixture's 60s sleep.
		if took := time.Since(began); took > 10*time.Second {
			t.Errorf("detectCLIVersion() took %v; expected to bound near versionDetectTimeout", took)
		}
	}
}
func TestCheckMinVersion(t *testing.T) {
tests := []struct {
agentType string