diff --git a/CLI_AND_DAEMON.md b/CLI_AND_DAEMON.md index 8ed61a4b0..f5f96f801 100644 --- a/CLI_AND_DAEMON.md +++ b/CLI_AND_DAEMON.md @@ -168,7 +168,7 @@ Daemon behavior is configured via flags or environment variables: |---------|------|--------------|---------| | Poll interval | `--poll-interval` | `MULTICA_DAEMON_POLL_INTERVAL` | `3s` | | Heartbeat interval | `--heartbeat-interval` | `MULTICA_DAEMON_HEARTBEAT_INTERVAL` | `15s` | -| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `2h` | +| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `0` (no cap; bounded by the watchdogs) | | Codex semantic inactivity timeout | `--codex-semantic-inactivity-timeout` | `MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT` | `10m` | | Max concurrent tasks | `--max-concurrent-tasks` | `MULTICA_DAEMON_MAX_CONCURRENT_TASKS` | `20` | | Daemon ID | `--daemon-id` | `MULTICA_DAEMON_ID` | hostname | diff --git a/apps/docs/content/docs/cli/reference.zh.mdx b/apps/docs/content/docs/cli/reference.zh.mdx index 28bf22dfa..7bc069ba6 100644 --- a/apps/docs/content/docs/cli/reference.zh.mdx +++ b/apps/docs/content/docs/cli/reference.zh.mdx @@ -115,7 +115,7 @@ Daemon behavior is configured via flags or environment variables: |---------|------|--------------|---------| | Poll interval | `--poll-interval` | `MULTICA_DAEMON_POLL_INTERVAL` | `3s` | | Heartbeat interval | `--heartbeat-interval` | `MULTICA_DAEMON_HEARTBEAT_INTERVAL` | `15s` | -| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `2h` | +| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `0`(不限制,由看门狗兜底)| | Max concurrent tasks | `--max-concurrent-tasks` | `MULTICA_DAEMON_MAX_CONCURRENT_TASKS` | `20` | | Daemon ID | `--daemon-id` | `MULTICA_DAEMON_ID` | hostname | | Device name | `--device-name` | `MULTICA_DAEMON_DEVICE_NAME` | hostname | diff --git a/apps/docs/content/docs/environment-variables.zh.mdx b/apps/docs/content/docs/environment-variables.zh.mdx index ca4bc989f..d3f8c2c37 100644 --- a/apps/docs/content/docs/environment-variables.zh.mdx +++ b/apps/docs/content/docs/environment-variables.zh.mdx @@ -179,6 +179,9 @@ API 返回的 `download_url` 在未配置 CloudFront 签名时会指向 `GET /ap | `MULTICA_DAEMON_HEARTBEAT_INTERVAL` | `15s` | 心跳频率 | | `MULTICA_DAEMON_POLL_INTERVAL` | `3s` | 任务轮询频率 | | `MULTICA_DAEMON_MAX_CONCURRENT_TASKS` | `20` | 并发任务上限 | +| `MULTICA_AGENT_TIMEOUT` | `0` | 单次任务的绝对墙钟上限;`0` = 不设上限,任务只受看门狗约束(活跃任务不会因为跑得久被杀)。想要硬性成本/资源天花板时再设一个正值 | +| `MULTICA_AGENT_IDLE_WATCHDOG` | `30m` | 空闲看门狗:backend 持续静默(无消息、消息队列为空、且没有工具在途)这么久就 force-stop。`0` = 关闭整套看门狗 | +| `MULTICA_AGENT_TOOL_WATCHDOG` | `2h` | 工具在途时的静默上限:某个工具调用发出后长时间无任何输出(疑似卡死的子进程)这么久就 force-stop。`0` = 关闭该兜底(在途工具永不被停)| | `MULTICA__PATH` | 对应 CLI 名 | 各 AI 编程工具的可执行文件路径(如 `MULTICA_CLAUDE_PATH`)| | `MULTICA__MODEL` | 空 | 各 AI 编程工具的默认模型 | diff --git a/apps/docs/content/docs/tasks.zh.mdx b/apps/docs/content/docs/tasks.zh.mdx index 4e30e554f..c0663e0f1 100644 --- a/apps/docs/content/docs/tasks.zh.mdx +++ b/apps/docs/content/docs/tasks.zh.mdx @@ -42,6 +42,8 @@ Multica 服务器每 30 秒扫描一次,有两种超时会触发失败: 两种超时的失败原因都是 `timeout`,**会自动重试**(下一节)。关联的运行时失联判定见 [守护进程与运行时 → 运行时什么时候被判定为离线](/daemon-runtimes#运行时什么时候被判定为离线)。 +上面这层是**服务端的粗粒度兜底**——按任务启动时间算,不看任务是否还在活动。真正区分「卡死」和「正常的长任务」的是**本地守护进程**:它不再用固定墙钟时长砍任务(`MULTICA_AGENT_TIMEOUT` 默认 `0` = 不设上限),而是看活动——只要 agent 还在持续产出事件(消息、工具调用),守护进程就不会因为跑得久判它超时(服务端那条 2.5h 仍是外层上限)。只有真正静默卡死时才会被**空闲看门狗**(`MULTICA_AGENT_IDLE_WATCHDOG`,默认 30 分钟)终止;如果是某个工具调用发出后长时间无任何输出(疑似卡死的子进程),则由更大的**工具看门狗**预算(`MULTICA_AGENT_TOOL_WATCHDOG`,默认 2 小时)兜底。这类被看门狗终止的任务失败原因是 `idle_watchdog`,和墙钟 `timeout` 区分开。各参数见 [环境变量 → 守护进程的调节参数](/environment-variables#守护进程的调节参数)。 + ## 哪些失败会自动重试,哪些不会 失败分两类:**可重试**和**不可重试**。 diff --git a/server/cmd/multica/cmd_daemon.go b/server/cmd/multica/cmd_daemon.go index 984cb3193..96123b2b3 100644 --- a/server/cmd/multica/cmd_daemon.go +++ b/server/cmd/multica/cmd_daemon.go @@ -78,7 +78,7 @@ func init() { f.String("runtime-name", "", "Runtime display name (env: MULTICA_AGENT_RUNTIME_NAME)") f.Duration("poll-interval", 0, "Task poll interval (env: MULTICA_DAEMON_POLL_INTERVAL)") f.Duration("heartbeat-interval", 0, "Heartbeat interval (env: MULTICA_DAEMON_HEARTBEAT_INTERVAL)") - f.Duration("agent-timeout", 0, "Per-task timeout (env: MULTICA_AGENT_TIMEOUT)") + f.Duration("agent-timeout", 0, "Absolute per-task wall-clock cap; 0 = no cap, rely on the watchdogs (env: MULTICA_AGENT_TIMEOUT)") f.Duration("codex-semantic-inactivity-timeout", 0, "Codex semantic inactivity timeout (env: MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT)") f.Int("max-concurrent-tasks", 0, "Max tasks running in parallel (env: MULTICA_DAEMON_MAX_CONCURRENT_TASKS)") f.Bool("no-auto-update", false, "Disable periodic CLI self-update (env: MULTICA_DAEMON_AUTO_UPDATE=false)") @@ -97,7 +97,7 @@ func init() { rf.String("runtime-name", "", "Runtime display name (env: MULTICA_AGENT_RUNTIME_NAME)") rf.Duration("poll-interval", 0, "Task poll interval (env: MULTICA_DAEMON_POLL_INTERVAL)") rf.Duration("heartbeat-interval", 0, "Heartbeat interval (env: MULTICA_DAEMON_HEARTBEAT_INTERVAL)") - rf.Duration("agent-timeout", 0, "Per-task timeout (env: MULTICA_AGENT_TIMEOUT)") + rf.Duration("agent-timeout", 0, "Absolute per-task wall-clock cap; 0 = no cap, rely on the watchdogs (env: MULTICA_AGENT_TIMEOUT)") rf.Duration("codex-semantic-inactivity-timeout", 0, "Codex semantic inactivity timeout (env: MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT)") rf.Int("max-concurrent-tasks", 0, "Max tasks running in parallel (env: MULTICA_DAEMON_MAX_CONCURRENT_TASKS)") rf.Bool("no-auto-update", false, "Disable periodic CLI self-update (env: MULTICA_DAEMON_AUTO_UPDATE=false)") @@ -284,7 +284,10 @@ func buildDaemonStartArgs(cmd *cobra.Command) []string { if d, _ := cmd.Flags().GetDuration("heartbeat-interval"); d > 0 { args = append(args, "--heartbeat-interval", d.String()) } - if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 { + // Forward agent-timeout when explicitly set, including an explicit 0 + // (= no cap), so it can override an environment MULTICA_AGENT_TIMEOUT. + if cmd.Flags().Changed("agent-timeout") { + d, _ := cmd.Flags().GetDuration("agent-timeout") args = append(args, "--agent-timeout", d.String()) } if d, _ := cmd.Flags().GetDuration("codex-semantic-inactivity-timeout"); d > 0 { @@ -336,8 +339,11 @@ func runDaemonForeground(cmd *cobra.Command) error { if d, _ := cmd.Flags().GetDuration("heartbeat-interval"); d > 0 { overrides.HeartbeatInterval = d } - if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 { - overrides.AgentTimeout = d + // Distinguish "flag not passed" from an explicit `--agent-timeout 0` so a + // user can turn off an env-configured cap from the CLI. + if cmd.Flags().Changed("agent-timeout") { + d, _ := cmd.Flags().GetDuration("agent-timeout") + overrides.AgentTimeout = &d } if d, _ := cmd.Flags().GetDuration("codex-semantic-inactivity-timeout"); d > 0 { overrides.CodexSemanticInactivityTimeout = d diff --git a/server/cmd/server/runtime_sweeper.go b/server/cmd/server/runtime_sweeper.go index 282b16977..87b4b17c4 100644 --- a/server/cmd/server/runtime_sweeper.go +++ b/server/cmd/server/runtime_sweeper.go @@ -36,8 +36,12 @@ const ( // The dispatched→running transition should be near-instant, so 5 minutes // means something went wrong (e.g. StartTask API call failed silently). dispatchTimeoutSeconds = 300.0 - // runningTimeoutSeconds fails tasks stuck in 'running' beyond this. - // The default agent timeout is 2h, so 2.5h gives a generous buffer. + // runningTimeoutSeconds fails tasks stuck in 'running' beyond this. It is a + // coarse server-side backstop keyed on started_at (it does NOT look at task + // activity) — mainly for runs whose daemon died without reporting. The + // daemon itself decides stuck-vs-long-running by activity (idle/tool + // watchdog), so this only needs to sit generously above any realistic single + // run rather than track a per-run wall-clock cap (MUL-3064). runningTimeoutSeconds = 9000.0 // queuedTTLSeconds expires tasks that have been sitting in 'queued' // for longer than this without ever being claimed. This is the cleanup @@ -46,9 +50,8 @@ const ( // tasks already on the queue when a runtime drops off (or that lost // the race against a runtime that went offline mid-tick) need a // time-bounded exit. 2 hours is conservatively above any reasonable - // "queued behind a long-running task" window for an online runtime - // (default agent timeout is 2h, sweeper interval is 30s) so we don't - // expire legitimately-pending work, while still draining the historical + // "queued behind a long-running task" window for an online runtime, so we + // don't expire legitimately-pending work, while still draining the historical // 87k autopilot backlog within ~24h once enabled. queuedTTLSeconds = 2 * 3600.0 // queuedExpireBatchSize caps how many queued rows a single sweeper tick diff --git a/server/internal/daemon/config.go b/server/internal/daemon/config.go index 495a49fd4..084d94292 100644 --- a/server/internal/daemon/config.go +++ b/server/internal/daemon/config.go @@ -15,23 +15,39 @@ import ( ) const ( - DefaultServerURL = "ws://localhost:8080/ws" - DefaultPollInterval = 30 * time.Second - DefaultHeartbeatInterval = 15 * time.Second - DefaultAgentTimeout = 2 * time.Hour + DefaultServerURL = "ws://localhost:8080/ws" + DefaultPollInterval = 30 * time.Second + DefaultHeartbeatInterval = 15 * time.Second + // DefaultAgentTimeout is the optional absolute wall-clock cap on a single + // agent run. 0 = no cap: a run is bounded only by the inactivity watchdogs + // (DefaultAgentIdleWatchdog / DefaultAgentToolWatchdog), so a session that keeps emitting events is + // never killed merely for running long (MUL-3064). Operators who want a + // hard ceiling for cost/resource control can set MULTICA_AGENT_TIMEOUT. + DefaultAgentTimeout = 0 DefaultCodexSemanticInactivityTimeout = 10 * time.Minute // DefaultAgentIdleWatchdog is the per-task safety net that force-stops a // run when the backend has emitted no message for this long AND its // message queue is empty. Backends like Claude Code can hang indefinitely // on a stuck child process (e.g. `docker ps` against a frozen dockerd), - // in which case `cmd.Wait()` never returns and the task sits at "running" - // for its full DefaultAgentTimeout (2 h). The previous 5 min default + // in which case `cmd.Wait()` never returns. With no wall-clock cap + // (DefaultAgentTimeout = 0) such a run would otherwise sit at "running" + // forever, so this watchdog is its sole liveness net. The previous 5 min default // killed legitimate long assistant outputs (e.g. RFC-length writeups) // where the model streams a single message for many minutes without any // daemon-visible activity — see MUL-2300. 30 min keeps the safety net for // truly stuck runs (dockerd hang) while leaving headroom for long writes. // Set MULTICA_AGENT_IDLE_WATCHDOG=0 to disable. - DefaultAgentIdleWatchdog = 30 * time.Minute + DefaultAgentIdleWatchdog = 30 * time.Minute + // DefaultAgentToolWatchdog bounds how long a single tool call may stay in + // flight (tool_use emitted, no tool_result and no other message) before the + // idle watchdog force-stops the run. The idle watchdog ignores its normal + // window while a tool is in flight, because a real build/install/test + // legitimately runs silently for many minutes — but with no wall-clock cap + // (DefaultAgentTimeout = 0) a backend that emits tool_use and never the + // matching tool_result would otherwise run forever. This is the backstop for + // that stuck-tool case (MUL-3064). Set MULTICA_AGENT_TOOL_WATCHDOG=0 to + // disable, in which case an in-flight tool never force-stops the run. + DefaultAgentToolWatchdog = 2 * time.Hour DefaultRuntimeName = "Local Agent" DefaultWorkspaceSyncInterval = 30 * time.Second DefaultHealthPort = 19514 @@ -79,6 +95,7 @@ type Config struct { AgentTimeout time.Duration CodexSemanticInactivityTimeout time.Duration AgentIdleWatchdog time.Duration // force-stop a run when the backend goes silent this long with an empty queue (0 = disabled) + AgentToolWatchdog time.Duration // force-stop a run when a single tool call stays in flight (silent) this long (0 = disabled); backstop for hung tools now that there is no wall-clock cap ClaudeArgs []string CodexArgs []string } @@ -86,11 +103,13 @@ type Config struct { // Overrides allows CLI flags to override environment variables and defaults. // Zero values are ignored and the env/default value is used instead. type Overrides struct { - ServerURL string - WorkspacesRoot string - PollInterval time.Duration - HeartbeatInterval time.Duration - AgentTimeout time.Duration + ServerURL string + WorkspacesRoot string + PollInterval time.Duration + HeartbeatInterval time.Duration + // AgentTimeout is a pointer so an explicit `--agent-timeout 0` (no cap) is + // distinguishable from "flag not passed". nil = use env/default. + AgentTimeout *time.Duration CodexSemanticInactivityTimeout time.Duration MaxConcurrentTasks int DaemonID string @@ -260,8 +279,8 @@ func LoadConfig(overrides Overrides) (Config, error) { if err != nil { return Config{}, err } - if overrides.AgentTimeout > 0 { - agentTimeout = overrides.AgentTimeout + if overrides.AgentTimeout != nil { + agentTimeout = *overrides.AgentTimeout } codexSemanticInactivityTimeout, err := durationFromEnv("MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT", DefaultCodexSemanticInactivityTimeout) @@ -280,6 +299,13 @@ func LoadConfig(overrides Overrides) (Config, error) { return Config{}, err } + // MULTICA_AGENT_TOOL_WATCHDOG=0 disables the in-flight-tool backstop; any + // positive duration overrides DefaultAgentToolWatchdog. + agentToolWatchdog, err := durationFromEnv("MULTICA_AGENT_TOOL_WATCHDOG", DefaultAgentToolWatchdog) + if err != nil { + return Config{}, err + } + maxConcurrentTasks, err := intFromEnv("MULTICA_DAEMON_MAX_CONCURRENT_TASKS", DefaultMaxConcurrentTasks) if err != nil { return Config{}, err @@ -428,6 +454,7 @@ func LoadConfig(overrides Overrides) (Config, error) { AgentTimeout: agentTimeout, CodexSemanticInactivityTimeout: codexSemanticInactivityTimeout, AgentIdleWatchdog: agentIdleWatchdog, + AgentToolWatchdog: agentToolWatchdog, ClaudeArgs: claudeArgs, CodexArgs: codexArgs, }, nil diff --git a/server/internal/daemon/daemon.go b/server/internal/daemon/daemon.go index b9f54ec6d..64b90f152 100644 --- a/server/internal/daemon/daemon.go +++ b/server/internal/daemon/daemon.go @@ -3087,15 +3087,21 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro } taskLog.Debug("backend started, draining messages") - // Create an independent drain deadline so we don't block forever if the - // backend's internal timeout fails to produce a Result (e.g. scanner - // stuck on a hung stdout pipe). The extra 30 s gives the backend time - // to clean up after its own timeout fires. - drainTimeout := opts.Timeout + 30*time.Second - if opts.Timeout == 0 { - drainTimeout = 21 * time.Minute + // Bound the drain loop only when there is a wall-clock cap. With a positive + // opts.Timeout, give the drain a slightly longer deadline than the backend + // so it can still collect the backend's own timeout Result if the scanner + // is stuck on a hung stdout pipe (the extra 30 s covers cleanup after the + // backend's own deadline fires). With no cap (opts.Timeout <= 0) the + // inactivity watchdog is the only liveness net, so the drain must NOT + // impose its own deadline either — otherwise an actively streaming long run + // would be cut off here regardless of progress (MUL-3064). + var drainCtx context.Context + var drainCancel context.CancelFunc + if opts.Timeout > 0 { + drainCtx, drainCancel = context.WithTimeout(agentCtx, opts.Timeout+30*time.Second) + } else { + drainCtx, drainCancel = context.WithCancel(agentCtx) } - drainCtx, drainCancel := context.WithTimeout(agentCtx, drainTimeout) defer drainCancel() var toolCount atomic.Int32 @@ -3110,12 +3116,18 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro // with a matching tool_result. A non-zero count means the agent is // legitimately waiting on a tool (e.g. `npm install`, `docker build`) // that may run far longer than the idle window without emitting any - // message — so the watchdog must not interpret that silence as a hang. + // message — so while a tool is in flight the watchdog applies the larger + // AgentToolWatchdog budget instead of treating that silence as a hang. var inFlightTools atomic.Int32 var idleWatchdogFired atomic.Bool + // idleWatchdogThreshold records (as nanos) which silence budget actually + // tripped the watchdog — the idle window or the larger in-flight-tool + // window — so the failure message reports the real duration. + var idleWatchdogThreshold atomic.Int64 + idleWatchdogThreshold.Store(int64(d.cfg.AgentIdleWatchdog)) idleWindow := d.cfg.AgentIdleWatchdog if idleWindow > 0 { - go d.runIdleWatchdog(agentCtx, idleWindow, &lastActivityAt, &inFlightTools, &idleWatchdogFired, agentCancel, session.Messages, taskLog, taskID) + go d.runIdleWatchdog(agentCtx, idleWindow, d.cfg.AgentToolWatchdog, &lastActivityAt, &inFlightTools, &idleWatchdogFired, &idleWatchdogThreshold, agentCancel, session.Messages, taskLog, taskID) } go func() { @@ -3302,7 +3314,7 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro // generic "agent_error" bucket the aborted path falls into. result.Status = "idle_watchdog" if result.Error == "" { - result.Error = idleWatchdogReason(idleWindow) + result.Error = idleWatchdogReason(time.Duration(idleWatchdogThreshold.Load())) } } return result, toolCount.Load(), nil @@ -3314,7 +3326,7 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro if idleWatchdogFired.Load() { return agent.Result{ Status: "idle_watchdog", - Error: idleWatchdogReason(idleWindow), + Error: idleWatchdogReason(time.Duration(idleWatchdogThreshold.Load())), }, toolCount.Load(), nil } // Distinguish external cancellation (e.g. server-initiated cancel @@ -3343,24 +3355,28 @@ func idleWatchdogReason(window time.Duration) string { } // runIdleWatchdog ticks until either agentCtx is cancelled or the backend has -// been silent for at least window with no in-flight tool call. On firing, it -// sets fired and calls cancel, which propagates to the agent subprocess (via -// the ctx passed to backend.Execute) and to drainCtx. The check requires: +// been silent past the applicable budget. On firing, it records the tripped +// threshold, sets fired, and calls cancel, which propagates to the agent +// subprocess (via the ctx passed to backend.Execute) and to drainCtx. The +// silence budget depends on whether a tool call is in flight: // -// 1. inFlightTools == 0 — the backend has emitted a tool_use whose -// matching tool_result hasn't arrived yet, meaning a real tool (e.g. -// `npm install`, `docker build`) is legitimately running. Long tool -// calls produce no messages between use and result; killing here would -// yank the agent mid-build. AND -// 2. time since lastActivityAt exceeds window — the drain loop is single -// reader, so a stale stamp means no message has actually arrived; AND -// 3. session.Messages buffer is empty — defensive against a hypothetical -// drain stall where unprocessed messages would still imply progress. +// 1. No tool in flight — a silent backend is a hang after `window`. +// 2. A tool in flight (tool_use with no matching tool_result yet) — a real +// tool (e.g. `npm install`, `docker build`) legitimately runs silently for +// many minutes, so the larger `toolWindow` applies instead. toolWindow <= 0 +// keeps the historical behavior of never force-stopping while a tool is in +// flight. Without this in-flight budget a backend that emits tool_use and +// never the matching tool_result would run forever now that there is no +// wall-clock cap (MUL-3064). +// +// In both cases the watchdog also requires the session.Messages buffer to be +// empty — a buffered-but-undrained message means the drain loop is behind, not +// the backend. // // Tick interval is window/2 (floored at 30 s in production, but the floor only // kicks in for windows >= 1 min so tests can pass tiny windows like 50 ms and // see the watchdog fire within a few ticks). -func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window time.Duration, lastActivityAt *atomic.Int64, inFlightTools *atomic.Int32, fired *atomic.Bool, cancel context.CancelFunc, messages <-chan agent.Message, taskLog *slog.Logger, taskID string) { +func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window, toolWindow time.Duration, lastActivityAt *atomic.Int64, inFlightTools *atomic.Int32, fired *atomic.Bool, firedThreshold *atomic.Int64, cancel context.CancelFunc, messages <-chan agent.Message, taskLog *slog.Logger, taskID string) { interval := window / 2 if window >= time.Minute && interval < 30*time.Second { interval = 30 * time.Second @@ -3375,16 +3391,21 @@ func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window time.Duration, case <-agentCtx.Done(): return case <-ticker.C: - // In-flight tool call: the agent has emitted tool_use and - // the corresponding tool_result hasn't landed yet. A long - // build/install/test can sit here silently for many minutes - // — that is forward progress, not a hang. - if inFlightTools.Load() > 0 { - continue + // Pick the silence budget. A tool in flight is expected to be + // silent (a long build/install/test emits nothing between + // tool_use and tool_result), so it gets the larger toolWindow; + // toolWindow <= 0 disables the in-flight bound entirely. + threshold := window + toolInFlight := inFlightTools.Load() > 0 + if toolInFlight { + if toolWindow <= 0 { + continue + } + threshold = toolWindow } last := time.Unix(0, lastActivityAt.Load()) idleFor := time.Since(last) - if idleFor < window { + if idleFor < threshold { continue } // A buffered-but-undrained message means the drain loop is @@ -3396,8 +3417,10 @@ func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window time.Duration, taskLog.Warn("idle watchdog firing: no agent activity, force-stopping run", "task", shortID(taskID), "idle_for", idleFor.Round(time.Second).String(), - "threshold", window.String(), + "threshold", threshold.String(), + "tool_in_flight", toolInFlight, ) + firedThreshold.Store(int64(threshold)) fired.Store(true) cancel() return diff --git a/server/internal/daemon/daemon_test.go b/server/internal/daemon/daemon_test.go index 7f7044787..e3dcb7da9 100644 --- a/server/internal/daemon/daemon_test.go +++ b/server/internal/daemon/daemon_test.go @@ -1096,8 +1096,9 @@ func TestExecuteAndDrain_ContextCancelled_ReportsCancelled(t *testing.T) { // idleWatchdogBackend simulates the MUL-2225 hang: emit one message to mark // activity, then go silent forever. With a short AgentIdleWatchdog, the -// watchdog should fire and short-circuit executeAndDrain instead of waiting -// for the full drainTimeout (which is ~21 minutes by default). +// watchdog should fire and short-circuit executeAndDrain. With no wall-clock +// cap (opts.Timeout = 0) the drain loop imposes no deadline of its own, so the +// idle watchdog is the only thing that ends this otherwise-forever-silent run. type idleWatchdogBackend struct { emitOne bool // when true, emit one message before going silent; when false, never emit anything } @@ -1285,6 +1286,45 @@ func TestExecuteAndDrain_IdleWatchdog_DoesNotFireDuringInFlightToolCall(t *testi } } +// stuckInFlightToolBackend models a hung tool: it emits a tool_use and then +// goes silent forever — the matching tool_result never arrives, so inFlightTools +// stays at 1 (e.g. a child process that never returns). With no wall-clock cap +// (the MUL-3064 default), AgentToolWatchdog is the only thing that ends it. +type stuckInFlightToolBackend struct{} + +func (stuckInFlightToolBackend) Execute(_ context.Context, _ string, _ agent.ExecOptions) (*agent.Session, error) { + msgCh := make(chan agent.Message, 2) + resCh := make(chan agent.Result) + msgCh <- agent.Message{Type: agent.MessageToolUse, Tool: "Bash", CallID: "c1"} + // Deliberately leave msgCh open, never emit tool_result, never write resCh. + return &agent.Session{Messages: msgCh, Result: resCh}, nil +} + +func TestExecuteAndDrain_IdleWatchdog_FiresOnStuckInFlightTool(t *testing.T) { + t.Parallel() + + d := newTestDaemon(t) + // The normal idle window would be skipped while a tool is in flight; the + // AgentToolWatchdog budget is what must fire here. + d.cfg.AgentIdleWatchdog = 50 * time.Millisecond + d.cfg.AgentToolWatchdog = 50 * time.Millisecond + + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + + start := time.Now() + result, _, err := d.executeAndDrain(ctx, stuckInFlightToolBackend{}, "p", agent.ExecOptions{}, slog.Default(), "t-stuck-tool") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.Status != "idle_watchdog" { + t.Fatalf("expected status=idle_watchdog for a hung in-flight tool, got %q (err=%q)", result.Status, result.Error) + } + if elapsed := time.Since(start); elapsed > 2*time.Second { + t.Fatalf("tool watchdog took too long to fire: %s (window=%s)", elapsed, d.cfg.AgentToolWatchdog) + } +} + // tailIdleAfterToolBackend exercises the boundary case: a tool call completes, // and THEN the backend goes silent without ever finishing. After the // tool_result lands, in-flight count returns to zero and lastActivityAt is diff --git a/server/pkg/agent/agent.go b/server/pkg/agent/agent.go index 0f0daa8b7..1f34c5812 100644 --- a/server/pkg/agent/agent.go +++ b/server/pkg/agent/agent.go @@ -47,6 +47,19 @@ type ExecOptions struct { ThinkingLevel string } +// runContext derives the execution context for an agent subprocess from the +// configured per-run timeout. A positive timeout imposes a hard wall-clock +// deadline; a zero (or negative) timeout imposes NO deadline, leaving liveness +// entirely to the daemon's inactivity watchdog so a session that keeps emitting +// events is never killed merely for running long (MUL-3064). The caller owns +// the returned CancelFunc and must call it to release resources. +func runContext(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc) { + if timeout > 0 { + return context.WithTimeout(ctx, timeout) + } + return context.WithCancel(ctx) +} + // Session represents a running agent execution. type Session struct { // Messages streams events as the agent works. The channel is closed diff --git a/server/pkg/agent/agent_test.go b/server/pkg/agent/agent_test.go index 32895ed76..a237d6174 100644 --- a/server/pkg/agent/agent_test.go +++ b/server/pkg/agent/agent_test.go @@ -3,6 +3,7 @@ package agent import ( "context" "testing" + "time" ) func TestNewReturnsClaudeBackend(t *testing.T) { @@ -98,3 +99,37 @@ func TestLaunchHeaderReturnsEmptyForUnknownType(t *testing.T) { t.Errorf("expected empty header for unknown type, got %q", header) } } + +func TestRunContextZeroTimeoutHasNoDeadline(t *testing.T) { + t.Parallel() + // A zero (or negative) timeout must NOT impose a wall-clock deadline: + // liveness is delegated to the daemon's inactivity watchdog so an actively + // streaming long-running session is never killed merely for running long + // (MUL-3064). + for _, d := range []time.Duration{0, -time.Second} { + ctx, cancel := runContext(context.Background(), d) + if _, ok := ctx.Deadline(); ok { + cancel() + t.Fatalf("runContext(%s) imposed a deadline; want none", d) + } + cancel() + if ctx.Err() == nil { + t.Fatalf("runContext(%s): context should be cancelled after cancel()", d) + } + } +} + +func TestRunContextPositiveTimeoutHasDeadline(t *testing.T) { + t.Parallel() + // A positive timeout keeps the hard wall-clock deadline (the opt-in + // absolute cap operators can still set via MULTICA_AGENT_TIMEOUT). + ctx, cancel := runContext(context.Background(), time.Hour) + defer cancel() + deadline, ok := ctx.Deadline() + if !ok { + t.Fatal("runContext(1h) should impose a deadline") + } + if remaining := time.Until(deadline); remaining <= 0 || remaining > time.Hour+time.Minute { + t.Fatalf("unexpected deadline remaining: %s", remaining) + } +} diff --git a/server/pkg/agent/antigravity.go b/server/pkg/agent/antigravity.go index 9f1d7457b..3197bda97 100644 --- a/server/pkg/agent/antigravity.go +++ b/server/pkg/agent/antigravity.go @@ -39,10 +39,7 @@ func (b *antigravityBackend) Execute(ctx context.Context, prompt string, opts Ex } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) logFile, err := os.CreateTemp("", "multica-agy-log-*.log") if err != nil { @@ -216,9 +213,16 @@ func buildAntigravityArgs(prompt, logPath string, timeout time.Duration, opts Ex args := []string{ "-p", prompt, "--dangerously-skip-permissions", - "--print-timeout", antigravityFormatTimeout(timeout), - "--log-file", logPath, } + // Only pass --print-timeout when a positive wall-clock cap is configured. + // timeout <= 0 means "no cap" (MUL-3064): agy then runs without its own + // print-timeout guillotine, matching every other backend's runContext + // semantics. Passing antigravityFormatTimeout(0) would clamp to 1s and kill + // the run almost immediately — the opposite of "no cap". + if timeout > 0 { + args = append(args, "--print-timeout", antigravityFormatTimeout(timeout)) + } + args = append(args, "--log-file", logPath) if opts.ResumeSessionID != "" { args = append(args, "--conversation", opts.ResumeSessionID) } diff --git a/server/pkg/agent/antigravity_test.go b/server/pkg/agent/antigravity_test.go index 5144d9813..8877d063b 100644 --- a/server/pkg/agent/antigravity_test.go +++ b/server/pkg/agent/antigravity_test.go @@ -38,6 +38,34 @@ func TestBuildAntigravityArgsBasic(t *testing.T) { } } +func TestBuildAntigravityArgsNoTimeoutOmitsPrintTimeout(t *testing.T) { + t.Parallel() + + // timeout <= 0 means "no wall-clock cap" (MUL-3064): agy must be launched + // WITHOUT --print-timeout, otherwise antigravityFormatTimeout(0) clamps to + // 1s and the run is killed almost immediately — the opposite of "no cap". + args := buildAntigravityArgs( + "hello", + "/tmp/agy.log", + 0, + ExecOptions{Cwd: "/work"}, + quietAntigravityLogger(), + ) + + want := []string{ + "-p", "hello", + "--dangerously-skip-permissions", + "--log-file", "/tmp/agy.log", + "--add-dir", "/work", + } + if !slices.Equal(args, want) { + t.Fatalf("buildAntigravityArgs(timeout=0) mismatch\n got: %v\nwant: %v", args, want) + } + if slices.Contains(args, "--print-timeout") { + t.Fatalf("--print-timeout must be omitted when timeout <= 0; got %v", args) + } +} + func TestBuildAntigravityArgsResume(t *testing.T) { t.Parallel() diff --git a/server/pkg/agent/claude.go b/server/pkg/agent/claude.go index b1dea4be0..033696a2b 100644 --- a/server/pkg/agent/claude.go +++ b/server/pkg/agent/claude.go @@ -30,10 +30,7 @@ func (b *claudeBackend) Execute(ctx context.Context, prompt string, opts ExecOpt } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) args := buildClaudeArgs(opts, b.cfg.Logger) diff --git a/server/pkg/agent/codex.go b/server/pkg/agent/codex.go index 0e2ed833f..4443ea732 100644 --- a/server/pkg/agent/codex.go +++ b/server/pkg/agent/codex.go @@ -499,14 +499,11 @@ func (b *codexBackend) Execute(ctx context.Context, prompt string, opts ExecOpti } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } semanticInactivityTimeout := opts.SemanticInactivityTimeout if semanticInactivityTimeout == 0 { semanticInactivityTimeout = defaultCodexSemanticInactivityTimeout } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) // Materialise the agent's MCP config into the per-task // `$CODEX_HOME/config.toml`. Argv would be the simpler path, but diff --git a/server/pkg/agent/copilot.go b/server/pkg/agent/copilot.go index edf266fd6..23715fa09 100644 --- a/server/pkg/agent/copilot.go +++ b/server/pkg/agent/copilot.go @@ -203,10 +203,7 @@ func (b *copilotBackend) Execute(ctx context.Context, prompt string, opts ExecOp } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) args := buildCopilotArgs(prompt, opts, b.cfg.Logger) argv0, cmdArgs := chooseCopilotInvocation(execName, lookedUp, args, b.cfg.Logger) diff --git a/server/pkg/agent/cursor.go b/server/pkg/agent/cursor.go index f2bfa26e3..b6cebca69 100644 --- a/server/pkg/agent/cursor.go +++ b/server/pkg/agent/cursor.go @@ -31,10 +31,7 @@ func (b *cursorBackend) Execute(ctx context.Context, prompt string, opts ExecOpt } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) args := buildCursorArgs(prompt, opts, b.cfg.Logger) argv0, cmdArgs := chooseCursorInvocation(execName, lookedUp, args, b.cfg.Logger) diff --git a/server/pkg/agent/gemini.go b/server/pkg/agent/gemini.go index b0a6571fb..4e48de6af 100644 --- a/server/pkg/agent/gemini.go +++ b/server/pkg/agent/gemini.go @@ -27,10 +27,7 @@ func (b *geminiBackend) Execute(ctx context.Context, prompt string, opts ExecOpt } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) args := buildGeminiArgs(prompt, opts, b.cfg.Logger) diff --git a/server/pkg/agent/hermes.go b/server/pkg/agent/hermes.go index f5a4eb927..ad03246a4 100644 --- a/server/pkg/agent/hermes.go +++ b/server/pkg/agent/hermes.go @@ -55,10 +55,7 @@ func (b *hermesBackend) Execute(ctx context.Context, prompt string, opts ExecOpt } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) hermesArgs := append([]string{"acp"}, filterCustomArgs(opts.CustomArgs, hermesBlockedArgs, b.cfg.Logger)...) cmd := exec.CommandContext(runCtx, execPath, hermesArgs...) diff --git a/server/pkg/agent/kimi.go b/server/pkg/agent/kimi.go index 9f50e3a24..896b276e7 100644 --- a/server/pkg/agent/kimi.go +++ b/server/pkg/agent/kimi.go @@ -49,10 +49,7 @@ func (b *kimiBackend) Execute(ctx context.Context, prompt string, opts ExecOptio } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) // `kimi acp` ignores --yolo / --auto-approve (they're flags on the // root `kimi` command, not on the `acp` subcommand). Instead, the diff --git a/server/pkg/agent/kiro.go b/server/pkg/agent/kiro.go index 7af230c47..d85f2fea2 100644 --- a/server/pkg/agent/kiro.go +++ b/server/pkg/agent/kiro.go @@ -54,10 +54,7 @@ func (b *kiroBackend) Execute(ctx context.Context, prompt string, opts ExecOptio } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) kiroArgs := append([]string{"acp", "--trust-all-tools"}, filterCustomArgs(opts.CustomArgs, kiroBlockedArgs, b.cfg.Logger)...) cmd := exec.CommandContext(runCtx, execPath, kiroArgs...) diff --git a/server/pkg/agent/openclaw.go b/server/pkg/agent/openclaw.go index 92fe5ec84..1f5ce366a 100644 --- a/server/pkg/agent/openclaw.go +++ b/server/pkg/agent/openclaw.go @@ -66,10 +66,7 @@ func (b *openclawBackend) Execute(ctx context.Context, prompt string, opts ExecO } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) sessionID := opts.ResumeSessionID if sessionID == "" { diff --git a/server/pkg/agent/opencode.go b/server/pkg/agent/opencode.go index e9c758b82..1e6868964 100644 --- a/server/pkg/agent/opencode.go +++ b/server/pkg/agent/opencode.go @@ -47,10 +47,7 @@ func (b *opencodeBackend) Execute(ctx context.Context, prompt string, opts ExecO execPath = resolved timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) args := []string{"run", "--format", "json", "--dangerously-skip-permissions"} // Anchor OpenCode's project discovery (AGENTS.md walk-up + .opencode/skills/ diff --git a/server/pkg/agent/pi.go b/server/pkg/agent/pi.go index da8b02d00..93078585d 100644 --- a/server/pkg/agent/pi.go +++ b/server/pkg/agent/pi.go @@ -184,9 +184,6 @@ func (b *piBackend) Execute(ctx context.Context, prompt string, opts ExecOptions } timeout := opts.Timeout - if timeout == 0 { - timeout = 20 * time.Minute - } // Pi's --session flag expects a file path where events are appended. // The path doubles as our opaque session identifier: we return it as @@ -203,7 +200,7 @@ func (b *piBackend) Execute(ctx context.Context, prompt string, opts ExecOptions return nil, fmt.Errorf("pi session file: %w", err) } - runCtx, cancel := context.WithTimeout(ctx, timeout) + runCtx, cancel := runContext(ctx, timeout) args := buildPiArgs(prompt, sessionPath, opts, b.cfg.Logger) argv0, cmdArgs := choosePiInvocation(execName, lookedUp, args, b.cfg.Logger)