fix(daemon): inactivity-based agent run timeout, no wall-clock guillotine (MUL-3064)

Active long-running sessions are no longer killed by a fixed wall-clock deadline. Liveness is delegated to the idle watchdog (MULTICA_AGENT_IDLE_WATCHDOG, default 30m) with a larger in-flight-tool budget (MULTICA_AGENT_TOOL_WATCHDOG, default 2h). MULTICA_AGENT_TIMEOUT is an opt-in absolute cap (default 0 = no cap). The server-side 2.5h sweeper is unchanged as a coarse backstop.

Fixes #3745.
This commit is contained in:
Bohan Jiang
2026-06-05 15:06:07 +08:00
committed by GitHub
parent d6540a1869
commit 3708fb0f07
24 changed files with 262 additions and 111 deletions

View File

@@ -168,7 +168,7 @@ Daemon behavior is configured via flags or environment variables:
|---------|------|--------------|---------|
| Poll interval | `--poll-interval` | `MULTICA_DAEMON_POLL_INTERVAL` | `3s` |
| Heartbeat interval | `--heartbeat-interval` | `MULTICA_DAEMON_HEARTBEAT_INTERVAL` | `15s` |
| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `2h` |
| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `0` (no cap; bounded by the watchdogs) |
| Codex semantic inactivity timeout | `--codex-semantic-inactivity-timeout` | `MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT` | `10m` |
| Max concurrent tasks | `--max-concurrent-tasks` | `MULTICA_DAEMON_MAX_CONCURRENT_TASKS` | `20` |
| Daemon ID | `--daemon-id` | `MULTICA_DAEMON_ID` | hostname |

View File

@@ -115,7 +115,7 @@ Daemon behavior is configured via flags or environment variables:
|---------|------|--------------|---------|
| Poll interval | `--poll-interval` | `MULTICA_DAEMON_POLL_INTERVAL` | `3s` |
| Heartbeat interval | `--heartbeat-interval` | `MULTICA_DAEMON_HEARTBEAT_INTERVAL` | `15s` |
| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `2h` |
| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `0`(不限制,由看门狗兜底)|
| Max concurrent tasks | `--max-concurrent-tasks` | `MULTICA_DAEMON_MAX_CONCURRENT_TASKS` | `20` |
| Daemon ID | `--daemon-id` | `MULTICA_DAEMON_ID` | hostname |
| Device name | `--device-name` | `MULTICA_DAEMON_DEVICE_NAME` | hostname |

View File

@@ -179,6 +179,9 @@ API 返回的 `download_url` 在未配置 CloudFront 签名时会指向 `GET /ap
| `MULTICA_DAEMON_HEARTBEAT_INTERVAL` | `15s` | 心跳频率 |
| `MULTICA_DAEMON_POLL_INTERVAL` | `3s` | 任务轮询频率 |
| `MULTICA_DAEMON_MAX_CONCURRENT_TASKS` | `20` | 并发任务上限 |
| `MULTICA_AGENT_TIMEOUT` | `0` | 单次任务的绝对墙钟上限;`0` = 不设上限,任务只受看门狗约束(活跃任务不会因为跑得久被杀)。想要硬性成本/资源天花板时再设一个正值 |
| `MULTICA_AGENT_IDLE_WATCHDOG` | `30m` | 空闲看门狗:backend 持续静默(无消息、消息队列为空、且没有工具在途)这么久就 force-stop。`0` = 关闭整套看门狗 |
| `MULTICA_AGENT_TOOL_WATCHDOG` | `2h` | 工具在途时的静默上限:某个工具调用发出后长时间无任何输出(疑似卡死的子进程)这么久就 force-stop。`0` = 关闭该兜底(在途工具永不被停)|
| `MULTICA_<PROVIDER>_PATH` | 对应 CLI 名 | 各 AI 编程工具的可执行文件路径(如 `MULTICA_CLAUDE_PATH`)|
| `MULTICA_<PROVIDER>_MODEL` | 空 | 各 AI 编程工具的默认模型 |

View File

@@ -42,6 +42,8 @@ Multica 服务器每 30 秒扫描一次,有两种超时会触发失败:
两种超时的失败原因都是 `timeout`,**会自动重试**(下一节)。关联的运行时失联判定见 [守护进程与运行时 → 运行时什么时候被判定为离线](/daemon-runtimes#运行时什么时候被判定为离线)。
上面这层是**服务端的粗粒度兜底**——按任务启动时间算,不看任务是否还在活动。真正区分「卡死」和「正常的长任务」的是**本地守护进程**:它不再用固定墙钟时长砍任务(`MULTICA_AGENT_TIMEOUT` 默认 `0` = 不设上限),而是看活动——只要 agent 还在持续产出事件(消息、工具调用),守护进程就不会因为跑得久判它超时(服务端那条 2.5h 仍是外层上限)。只有真正静默卡死时才会被**空闲看门狗**(`MULTICA_AGENT_IDLE_WATCHDOG`,默认 30 分钟)终止;如果是某个工具调用发出后长时间无任何输出(疑似卡死的子进程),则由更大的**工具看门狗**预算(`MULTICA_AGENT_TOOL_WATCHDOG`,默认 2 小时)兜底。这类被看门狗终止的任务失败原因是 `idle_watchdog`,和墙钟 `timeout` 区分开。各参数见 [环境变量 → 守护进程的调节参数](/environment-variables#守护进程的调节参数)。
## 哪些失败会自动重试,哪些不会
失败分两类:**可重试**和**不可重试**。

View File

@@ -78,7 +78,7 @@ func init() {
f.String("runtime-name", "", "Runtime display name (env: MULTICA_AGENT_RUNTIME_NAME)")
f.Duration("poll-interval", 0, "Task poll interval (env: MULTICA_DAEMON_POLL_INTERVAL)")
f.Duration("heartbeat-interval", 0, "Heartbeat interval (env: MULTICA_DAEMON_HEARTBEAT_INTERVAL)")
f.Duration("agent-timeout", 0, "Per-task timeout (env: MULTICA_AGENT_TIMEOUT)")
f.Duration("agent-timeout", 0, "Absolute per-task wall-clock cap; 0 = no cap, rely on the watchdogs (env: MULTICA_AGENT_TIMEOUT)")
f.Duration("codex-semantic-inactivity-timeout", 0, "Codex semantic inactivity timeout (env: MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT)")
f.Int("max-concurrent-tasks", 0, "Max tasks running in parallel (env: MULTICA_DAEMON_MAX_CONCURRENT_TASKS)")
f.Bool("no-auto-update", false, "Disable periodic CLI self-update (env: MULTICA_DAEMON_AUTO_UPDATE=false)")
@@ -97,7 +97,7 @@ func init() {
rf.String("runtime-name", "", "Runtime display name (env: MULTICA_AGENT_RUNTIME_NAME)")
rf.Duration("poll-interval", 0, "Task poll interval (env: MULTICA_DAEMON_POLL_INTERVAL)")
rf.Duration("heartbeat-interval", 0, "Heartbeat interval (env: MULTICA_DAEMON_HEARTBEAT_INTERVAL)")
rf.Duration("agent-timeout", 0, "Per-task timeout (env: MULTICA_AGENT_TIMEOUT)")
rf.Duration("agent-timeout", 0, "Absolute per-task wall-clock cap; 0 = no cap, rely on the watchdogs (env: MULTICA_AGENT_TIMEOUT)")
rf.Duration("codex-semantic-inactivity-timeout", 0, "Codex semantic inactivity timeout (env: MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT)")
rf.Int("max-concurrent-tasks", 0, "Max tasks running in parallel (env: MULTICA_DAEMON_MAX_CONCURRENT_TASKS)")
rf.Bool("no-auto-update", false, "Disable periodic CLI self-update (env: MULTICA_DAEMON_AUTO_UPDATE=false)")
@@ -284,7 +284,10 @@ func buildDaemonStartArgs(cmd *cobra.Command) []string {
if d, _ := cmd.Flags().GetDuration("heartbeat-interval"); d > 0 {
args = append(args, "--heartbeat-interval", d.String())
}
if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 {
// Forward agent-timeout when explicitly set, including an explicit 0
// (= no cap), so it can override an environment MULTICA_AGENT_TIMEOUT.
if cmd.Flags().Changed("agent-timeout") {
d, _ := cmd.Flags().GetDuration("agent-timeout")
args = append(args, "--agent-timeout", d.String())
}
if d, _ := cmd.Flags().GetDuration("codex-semantic-inactivity-timeout"); d > 0 {
@@ -336,8 +339,11 @@ func runDaemonForeground(cmd *cobra.Command) error {
if d, _ := cmd.Flags().GetDuration("heartbeat-interval"); d > 0 {
overrides.HeartbeatInterval = d
}
if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 {
overrides.AgentTimeout = d
// Distinguish "flag not passed" from an explicit `--agent-timeout 0` so a
// user can turn off an env-configured cap from the CLI.
if cmd.Flags().Changed("agent-timeout") {
d, _ := cmd.Flags().GetDuration("agent-timeout")
overrides.AgentTimeout = &d
}
if d, _ := cmd.Flags().GetDuration("codex-semantic-inactivity-timeout"); d > 0 {
overrides.CodexSemanticInactivityTimeout = d

View File

@@ -36,8 +36,12 @@ const (
// The dispatched→running transition should be near-instant, so 5 minutes
// means something went wrong (e.g. StartTask API call failed silently).
dispatchTimeoutSeconds = 300.0
// runningTimeoutSeconds fails tasks stuck in 'running' beyond this.
// The default agent timeout is 2h, so 2.5h gives a generous buffer.
// runningTimeoutSeconds fails tasks stuck in 'running' beyond this. It is a
// coarse server-side backstop keyed on started_at (it does NOT look at task
// activity) — mainly for runs whose daemon died without reporting. The
// daemon itself decides stuck-vs-long-running by activity (idle/tool
// watchdog), so this only needs to sit generously above any realistic single
// run rather than track a per-run wall-clock cap (MUL-3064).
runningTimeoutSeconds = 9000.0
// queuedTTLSeconds expires tasks that have been sitting in 'queued'
// for longer than this without ever being claimed. This is the cleanup
@@ -46,9 +50,8 @@ const (
// tasks already on the queue when a runtime drops off (or that lost
// the race against a runtime that went offline mid-tick) need a
// time-bounded exit. 2 hours is conservatively above any reasonable
// "queued behind a long-running task" window for an online runtime
// (default agent timeout is 2h, sweeper interval is 30s) so we don't
// expire legitimately-pending work, while still draining the historical
// "queued behind a long-running task" window for an online runtime, so we
// don't expire legitimately-pending work, while still draining the historical
// 87k autopilot backlog within ~24h once enabled.
queuedTTLSeconds = 2 * 3600.0
// queuedExpireBatchSize caps how many queued rows a single sweeper tick

View File

@@ -15,23 +15,39 @@ import (
)
const (
DefaultServerURL = "ws://localhost:8080/ws"
DefaultPollInterval = 30 * time.Second
DefaultHeartbeatInterval = 15 * time.Second
DefaultAgentTimeout = 2 * time.Hour
DefaultServerURL = "ws://localhost:8080/ws"
DefaultPollInterval = 30 * time.Second
DefaultHeartbeatInterval = 15 * time.Second
// DefaultAgentTimeout is the optional absolute wall-clock cap on a single
// agent run. 0 = no cap: a run is bounded only by the inactivity watchdogs
// (DefaultAgentIdleWatchdog / DefaultAgentToolWatchdog), so a session that keeps emitting events is
// never killed merely for running long (MUL-3064). Operators who want a
// hard ceiling for cost/resource control can set MULTICA_AGENT_TIMEOUT.
DefaultAgentTimeout = 0
DefaultCodexSemanticInactivityTimeout = 10 * time.Minute
// DefaultAgentIdleWatchdog is the per-task safety net that force-stops a
// run when the backend has emitted no message for this long AND its
// message queue is empty. Backends like Claude Code can hang indefinitely
// on a stuck child process (e.g. `docker ps` against a frozen dockerd),
// in which case `cmd.Wait()` never returns and the task sits at "running"
// for its full DefaultAgentTimeout (2 h). The previous 5 min default
// in which case `cmd.Wait()` never returns. With no wall-clock cap
// (DefaultAgentTimeout = 0) such a run would otherwise sit at "running"
// forever, so this watchdog is its sole liveness net. The previous 5 min default
// killed legitimate long assistant outputs (e.g. RFC-length writeups)
// where the model streams a single message for many minutes without any
// daemon-visible activity — see MUL-2300. 30 min keeps the safety net for
// truly stuck runs (dockerd hang) while leaving headroom for long writes.
// Set MULTICA_AGENT_IDLE_WATCHDOG=0 to disable.
DefaultAgentIdleWatchdog = 30 * time.Minute
DefaultAgentIdleWatchdog = 30 * time.Minute
// DefaultAgentToolWatchdog bounds how long a single tool call may stay in
// flight (tool_use emitted, no tool_result and no other message) before the
// idle watchdog force-stops the run. The idle watchdog ignores its normal
// window while a tool is in flight, because a real build/install/test
// legitimately runs silently for many minutes — but with no wall-clock cap
// (DefaultAgentTimeout = 0) a backend that emits tool_use and never the
// matching tool_result would otherwise run forever. This is the backstop for
// that stuck-tool case (MUL-3064). Set MULTICA_AGENT_TOOL_WATCHDOG=0 to
// disable, in which case an in-flight tool never force-stops the run.
DefaultAgentToolWatchdog = 2 * time.Hour
DefaultRuntimeName = "Local Agent"
DefaultWorkspaceSyncInterval = 30 * time.Second
DefaultHealthPort = 19514
@@ -79,6 +95,7 @@ type Config struct {
AgentTimeout time.Duration
CodexSemanticInactivityTimeout time.Duration
AgentIdleWatchdog time.Duration // force-stop a run when the backend goes silent this long with an empty queue (0 = disabled)
AgentToolWatchdog time.Duration // force-stop a run when a single tool call stays in flight (silent) this long (0 = disabled); backstop for hung tools now that there is no wall-clock cap
ClaudeArgs []string
CodexArgs []string
}
@@ -86,11 +103,13 @@ type Config struct {
// Overrides allows CLI flags to override environment variables and defaults.
// Zero values are ignored and the env/default value is used instead.
type Overrides struct {
ServerURL string
WorkspacesRoot string
PollInterval time.Duration
HeartbeatInterval time.Duration
AgentTimeout time.Duration
ServerURL string
WorkspacesRoot string
PollInterval time.Duration
HeartbeatInterval time.Duration
// AgentTimeout is a pointer so an explicit `--agent-timeout 0` (no cap) is
// distinguishable from "flag not passed". nil = use env/default.
AgentTimeout *time.Duration
CodexSemanticInactivityTimeout time.Duration
MaxConcurrentTasks int
DaemonID string
@@ -260,8 +279,8 @@ func LoadConfig(overrides Overrides) (Config, error) {
if err != nil {
return Config{}, err
}
if overrides.AgentTimeout > 0 {
agentTimeout = overrides.AgentTimeout
if overrides.AgentTimeout != nil {
agentTimeout = *overrides.AgentTimeout
}
codexSemanticInactivityTimeout, err := durationFromEnv("MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT", DefaultCodexSemanticInactivityTimeout)
@@ -280,6 +299,13 @@ func LoadConfig(overrides Overrides) (Config, error) {
return Config{}, err
}
// MULTICA_AGENT_TOOL_WATCHDOG=0 disables the in-flight-tool backstop; any
// positive duration overrides DefaultAgentToolWatchdog.
agentToolWatchdog, err := durationFromEnv("MULTICA_AGENT_TOOL_WATCHDOG", DefaultAgentToolWatchdog)
if err != nil {
return Config{}, err
}
maxConcurrentTasks, err := intFromEnv("MULTICA_DAEMON_MAX_CONCURRENT_TASKS", DefaultMaxConcurrentTasks)
if err != nil {
return Config{}, err
@@ -428,6 +454,7 @@ func LoadConfig(overrides Overrides) (Config, error) {
AgentTimeout: agentTimeout,
CodexSemanticInactivityTimeout: codexSemanticInactivityTimeout,
AgentIdleWatchdog: agentIdleWatchdog,
AgentToolWatchdog: agentToolWatchdog,
ClaudeArgs: claudeArgs,
CodexArgs: codexArgs,
}, nil

View File

@@ -3087,15 +3087,21 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro
}
taskLog.Debug("backend started, draining messages")
// Create an independent drain deadline so we don't block forever if the
// backend's internal timeout fails to produce a Result (e.g. scanner
// stuck on a hung stdout pipe). The extra 30 s gives the backend time
// to clean up after its own timeout fires.
drainTimeout := opts.Timeout + 30*time.Second
if opts.Timeout == 0 {
drainTimeout = 21 * time.Minute
// Bound the drain loop only when there is a wall-clock cap. With a positive
// opts.Timeout, give the drain a slightly longer deadline than the backend
// so it can still collect the backend's own timeout Result if the scanner
// is stuck on a hung stdout pipe (the extra 30 s covers cleanup after the
// backend's own deadline fires). With no cap (opts.Timeout <= 0) the
// inactivity watchdog is the only liveness net, so the drain must NOT
// impose its own deadline either — otherwise an actively streaming long run
// would be cut off here regardless of progress (MUL-3064).
var drainCtx context.Context
var drainCancel context.CancelFunc
if opts.Timeout > 0 {
drainCtx, drainCancel = context.WithTimeout(agentCtx, opts.Timeout+30*time.Second)
} else {
drainCtx, drainCancel = context.WithCancel(agentCtx)
}
drainCtx, drainCancel := context.WithTimeout(agentCtx, drainTimeout)
defer drainCancel()
var toolCount atomic.Int32
@@ -3110,12 +3116,18 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro
// with a matching tool_result. A non-zero count means the agent is
// legitimately waiting on a tool (e.g. `npm install`, `docker build`)
// that may run far longer than the idle window without emitting any
// message — so the watchdog must not interpret that silence as a hang.
// message — so while a tool is in flight the watchdog applies the larger
// AgentToolWatchdog budget instead of treating that silence as a hang.
var inFlightTools atomic.Int32
var idleWatchdogFired atomic.Bool
// idleWatchdogThreshold records (as nanos) which silence budget actually
// tripped the watchdog — the idle window or the larger in-flight-tool
// window — so the failure message reports the real duration.
var idleWatchdogThreshold atomic.Int64
idleWatchdogThreshold.Store(int64(d.cfg.AgentIdleWatchdog))
idleWindow := d.cfg.AgentIdleWatchdog
if idleWindow > 0 {
go d.runIdleWatchdog(agentCtx, idleWindow, &lastActivityAt, &inFlightTools, &idleWatchdogFired, agentCancel, session.Messages, taskLog, taskID)
go d.runIdleWatchdog(agentCtx, idleWindow, d.cfg.AgentToolWatchdog, &lastActivityAt, &inFlightTools, &idleWatchdogFired, &idleWatchdogThreshold, agentCancel, session.Messages, taskLog, taskID)
}
go func() {
@@ -3302,7 +3314,7 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro
// generic "agent_error" bucket the aborted path falls into.
result.Status = "idle_watchdog"
if result.Error == "" {
result.Error = idleWatchdogReason(idleWindow)
result.Error = idleWatchdogReason(time.Duration(idleWatchdogThreshold.Load()))
}
}
return result, toolCount.Load(), nil
@@ -3314,7 +3326,7 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro
if idleWatchdogFired.Load() {
return agent.Result{
Status: "idle_watchdog",
Error: idleWatchdogReason(idleWindow),
Error: idleWatchdogReason(time.Duration(idleWatchdogThreshold.Load())),
}, toolCount.Load(), nil
}
// Distinguish external cancellation (e.g. server-initiated cancel
@@ -3343,24 +3355,28 @@ func idleWatchdogReason(window time.Duration) string {
}
// runIdleWatchdog ticks until either agentCtx is cancelled or the backend has
// been silent for at least window with no in-flight tool call. On firing, it
// sets fired and calls cancel, which propagates to the agent subprocess (via
// the ctx passed to backend.Execute) and to drainCtx. The check requires:
// been silent past the applicable budget. On firing, it records the tripped
// threshold, sets fired, and calls cancel, which propagates to the agent
// subprocess (via the ctx passed to backend.Execute) and to drainCtx. The
// silence budget depends on whether a tool call is in flight:
//
// 1. inFlightTools == 0 — the backend has emitted a tool_use whose
// matching tool_result hasn't arrived yet, meaning a real tool (e.g.
// `npm install`, `docker build`) is legitimately running. Long tool
// calls produce no messages between use and result; killing here would
// yank the agent mid-build. AND
// 2. time since lastActivityAt exceeds window — the drain loop is single
// reader, so a stale stamp means no message has actually arrived; AND
// 3. session.Messages buffer is empty — defensive against a hypothetical
// drain stall where unprocessed messages would still imply progress.
// 1. No tool in flight — a silent backend is a hang after `window`.
// 2. A tool in flight (tool_use with no matching tool_result yet) — a real
// tool (e.g. `npm install`, `docker build`) legitimately runs silently for
// many minutes, so the larger `toolWindow` applies instead. toolWindow <= 0
// keeps the historical behavior of never force-stopping while a tool is in
// flight. Without this in-flight budget a backend that emits tool_use and
// never the matching tool_result would run forever now that there is no
// wall-clock cap (MUL-3064).
//
// In both cases the watchdog also requires the session.Messages buffer to be
// empty — a buffered-but-undrained message means the drain loop is behind, not
// the backend.
//
// Tick interval is window/2 (floored at 30 s in production, but the floor only
// kicks in for windows >= 1 min so tests can pass tiny windows like 50 ms and
// see the watchdog fire within a few ticks).
func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window time.Duration, lastActivityAt *atomic.Int64, inFlightTools *atomic.Int32, fired *atomic.Bool, cancel context.CancelFunc, messages <-chan agent.Message, taskLog *slog.Logger, taskID string) {
func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window, toolWindow time.Duration, lastActivityAt *atomic.Int64, inFlightTools *atomic.Int32, fired *atomic.Bool, firedThreshold *atomic.Int64, cancel context.CancelFunc, messages <-chan agent.Message, taskLog *slog.Logger, taskID string) {
interval := window / 2
if window >= time.Minute && interval < 30*time.Second {
interval = 30 * time.Second
@@ -3375,16 +3391,21 @@ func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window time.Duration,
case <-agentCtx.Done():
return
case <-ticker.C:
// In-flight tool call: the agent has emitted tool_use and
// the corresponding tool_result hasn't landed yet. A long
// build/install/test can sit here silently for many minutes
// — that is forward progress, not a hang.
if inFlightTools.Load() > 0 {
continue
// Pick the silence budget. A tool in flight is expected to be
// silent (a long build/install/test emits nothing between
// tool_use and tool_result), so it gets the larger toolWindow;
// toolWindow <= 0 disables the in-flight bound entirely.
threshold := window
toolInFlight := inFlightTools.Load() > 0
if toolInFlight {
if toolWindow <= 0 {
continue
}
threshold = toolWindow
}
last := time.Unix(0, lastActivityAt.Load())
idleFor := time.Since(last)
if idleFor < window {
if idleFor < threshold {
continue
}
// A buffered-but-undrained message means the drain loop is
@@ -3396,8 +3417,10 @@ func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window time.Duration,
taskLog.Warn("idle watchdog firing: no agent activity, force-stopping run",
"task", shortID(taskID),
"idle_for", idleFor.Round(time.Second).String(),
"threshold", window.String(),
"threshold", threshold.String(),
"tool_in_flight", toolInFlight,
)
firedThreshold.Store(int64(threshold))
fired.Store(true)
cancel()
return

View File

@@ -1096,8 +1096,9 @@ func TestExecuteAndDrain_ContextCancelled_ReportsCancelled(t *testing.T) {
// idleWatchdogBackend simulates the MUL-2225 hang: emit one message to mark
// activity, then go silent forever. With a short AgentIdleWatchdog, the
// watchdog should fire and short-circuit executeAndDrain instead of waiting
// for the full drainTimeout (which is ~21 minutes by default).
// watchdog should fire and short-circuit executeAndDrain. With no wall-clock
// cap (opts.Timeout = 0) the drain loop imposes no deadline of its own, so the
// idle watchdog is the only thing that ends this otherwise-forever-silent run.
type idleWatchdogBackend struct {
emitOne bool // when true, emit one message before going silent; when false, never emit anything
}
@@ -1285,6 +1286,45 @@ func TestExecuteAndDrain_IdleWatchdog_DoesNotFireDuringInFlightToolCall(t *testi
}
}
// stuckInFlightToolBackend is a test double for a tool call that never
// completes. Its session carries exactly one tool_use message and nothing
// after it: no tool_result, no Result value, and neither channel is closed.
// With no wall-clock cap configured (the MUL-3064 default), the in-flight-tool
// watchdog budget (AgentToolWatchdog) is what is expected to end such a run.
type stuckInFlightToolBackend struct{}

// Execute ignores all of its arguments and returns a session whose Messages
// channel already holds a single queued tool_use (Tool "Bash", CallID "c1").
// It never returns an error.
func (stuckInFlightToolBackend) Execute(_ context.Context, _ string, _ agent.ExecOptions) (*agent.Session, error) {
	messages := make(chan agent.Message, 2)
	messages <- agent.Message{Type: agent.MessageToolUse, Tool: "Bash", CallID: "c1"}

	// On purpose: messages stays open with no tool_result ever queued, and the
	// unbuffered result channel is never written to.
	session := &agent.Session{
		Messages: messages,
		Result:   make(chan agent.Result),
	}
	return session, nil
}
// TestExecuteAndDrain_IdleWatchdog_FiresOnStuckInFlightTool drives
// executeAndDrain against a backend whose only message is a tool_use that is
// never answered. It expects the run to come back with status "idle_watchdog"
// and without an error, within two seconds.
func TestExecuteAndDrain_IdleWatchdog_FiresOnStuckInFlightTool(t *testing.T) {
	t.Parallel()

	// Both silence budgets are tiny so the test finishes quickly. While a tool
	// is in flight the plain idle window does not apply, so it is the
	// AgentToolWatchdog budget that has to trip here.
	const window = 50 * time.Millisecond
	daemon := newTestDaemon(t)
	daemon.cfg.AgentIdleWatchdog = window
	daemon.cfg.AgentToolWatchdog = window

	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)

	began := time.Now()
	res, _, runErr := daemon.executeAndDrain(ctx, stuckInFlightToolBackend{}, "p", agent.ExecOptions{}, slog.Default(), "t-stuck-tool")
	if runErr != nil {
		t.Fatalf("unexpected error: %v", runErr)
	}
	if got := res.Status; got != "idle_watchdog" {
		t.Fatalf("expected status=idle_watchdog for a hung in-flight tool, got %q (err=%q)", got, res.Error)
	}
	if took := time.Since(began); took > 2*time.Second {
		t.Fatalf("tool watchdog took too long to fire: %s (window=%s)", took, daemon.cfg.AgentToolWatchdog)
	}
}
// tailIdleAfterToolBackend exercises the boundary case: a tool call completes,
// and THEN the backend goes silent without ever finishing. After the
// tool_result lands, in-flight count returns to zero and lastActivityAt is

View File

@@ -47,6 +47,19 @@ type ExecOptions struct {
ThinkingLevel string
}
// runContext builds the context an agent subprocess runs under, based on the
// configured per-run timeout.
//
//   - timeout > 0: the returned context carries a hard wall-clock deadline of
//     timeout from now.
//   - timeout <= 0: the returned context has NO deadline and is only
//     cancellable. Liveness is then left to the daemon's inactivity watchdog,
//     so a session that keeps emitting events is never killed merely for
//     running long (MUL-3064).
//
// In both cases the result is derived from ctx, and the caller owns the
// returned CancelFunc and must call it to release resources.
func runContext(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc) {
	if timeout <= 0 {
		return context.WithCancel(ctx)
	}
	return context.WithTimeout(ctx, timeout)
}
// Session represents a running agent execution.
type Session struct {
// Messages streams events as the agent works. The channel is closed

View File

@@ -3,6 +3,7 @@ package agent
import (
"context"
"testing"
"time"
)
func TestNewReturnsClaudeBackend(t *testing.T) {
@@ -98,3 +99,37 @@ func TestLaunchHeaderReturnsEmptyForUnknownType(t *testing.T) {
t.Errorf("expected empty header for unknown type, got %q", header)
}
}
// TestRunContextZeroTimeoutHasNoDeadline verifies that a zero or negative
// timeout yields a context with no wall-clock deadline that is nevertheless
// cancelled once its CancelFunc is called. With no deadline, liveness is left
// to the daemon's inactivity watchdog, so an actively streaming long-running
// session is never killed merely for running long (MUL-3064).
func TestRunContextZeroTimeoutHasNoDeadline(t *testing.T) {
	t.Parallel()

	for _, timeout := range []time.Duration{0, -time.Second} {
		ctx, cancel := runContext(context.Background(), timeout)
		_, hasDeadline := ctx.Deadline()
		// Release the context before any assertion can abort the test.
		cancel()

		if hasDeadline {
			t.Fatalf("runContext(%s) imposed a deadline; want none", timeout)
		}
		if ctx.Err() == nil {
			t.Fatalf("runContext(%s): context should be cancelled after cancel()", timeout)
		}
	}
}
// TestRunContextPositiveTimeoutHasDeadline verifies that a positive timeout
// still produces a context with a hard wall-clock deadline. This is the opt-in
// absolute cap operators can set via MULTICA_AGENT_TIMEOUT.
func TestRunContextPositiveTimeoutHasDeadline(t *testing.T) {
	t.Parallel()

	const limit = time.Hour
	ctx, cancel := runContext(context.Background(), limit)
	defer cancel()

	deadline, hasDeadline := ctx.Deadline()
	if !hasDeadline {
		t.Fatal("runContext(1h) should impose a deadline")
	}

	// The deadline must lie in the future and no further out than the
	// requested cap plus a minute of slack.
	left := time.Until(deadline)
	if left <= 0 || left > limit+time.Minute {
		t.Fatalf("unexpected deadline remaining: %s", left)
	}
}

View File

@@ -39,10 +39,7 @@ func (b *antigravityBackend) Execute(ctx context.Context, prompt string, opts Ex
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
logFile, err := os.CreateTemp("", "multica-agy-log-*.log")
if err != nil {
@@ -216,9 +213,16 @@ func buildAntigravityArgs(prompt, logPath string, timeout time.Duration, opts Ex
args := []string{
"-p", prompt,
"--dangerously-skip-permissions",
"--print-timeout", antigravityFormatTimeout(timeout),
"--log-file", logPath,
}
// Only pass --print-timeout when a positive wall-clock cap is configured.
// timeout <= 0 means "no cap" (MUL-3064): agy then runs without its own
// print-timeout guillotine, matching every other backend's runContext
// semantics. Passing antigravityFormatTimeout(0) would clamp to 1s and kill
// the run almost immediately — the opposite of "no cap".
if timeout > 0 {
args = append(args, "--print-timeout", antigravityFormatTimeout(timeout))
}
args = append(args, "--log-file", logPath)
if opts.ResumeSessionID != "" {
args = append(args, "--conversation", opts.ResumeSessionID)
}

View File

@@ -38,6 +38,34 @@ func TestBuildAntigravityArgsBasic(t *testing.T) {
}
}
func TestBuildAntigravityArgsNoTimeoutOmitsPrintTimeout(t *testing.T) {
t.Parallel()
// timeout <= 0 means "no wall-clock cap" (MUL-3064): agy must be launched
// WITHOUT --print-timeout, otherwise antigravityFormatTimeout(0) clamps to
// 1s and the run is killed almost immediately — the opposite of "no cap".
args := buildAntigravityArgs(
"hello",
"/tmp/agy.log",
0,
ExecOptions{Cwd: "/work"},
quietAntigravityLogger(),
)
want := []string{
"-p", "hello",
"--dangerously-skip-permissions",
"--log-file", "/tmp/agy.log",
"--add-dir", "/work",
}
if !slices.Equal(args, want) {
t.Fatalf("buildAntigravityArgs(timeout=0) mismatch\n got: %v\nwant: %v", args, want)
}
if slices.Contains(args, "--print-timeout") {
t.Fatalf("--print-timeout must be omitted when timeout <= 0; got %v", args)
}
}
func TestBuildAntigravityArgsResume(t *testing.T) {
t.Parallel()

View File

@@ -30,10 +30,7 @@ func (b *claudeBackend) Execute(ctx context.Context, prompt string, opts ExecOpt
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
args := buildClaudeArgs(opts, b.cfg.Logger)

View File

@@ -499,14 +499,11 @@ func (b *codexBackend) Execute(ctx context.Context, prompt string, opts ExecOpti
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
semanticInactivityTimeout := opts.SemanticInactivityTimeout
if semanticInactivityTimeout == 0 {
semanticInactivityTimeout = defaultCodexSemanticInactivityTimeout
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
// Materialise the agent's MCP config into the per-task
// `$CODEX_HOME/config.toml`. Argv would be the simpler path, but

View File

@@ -203,10 +203,7 @@ func (b *copilotBackend) Execute(ctx context.Context, prompt string, opts ExecOp
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
args := buildCopilotArgs(prompt, opts, b.cfg.Logger)
argv0, cmdArgs := chooseCopilotInvocation(execName, lookedUp, args, b.cfg.Logger)

View File

@@ -31,10 +31,7 @@ func (b *cursorBackend) Execute(ctx context.Context, prompt string, opts ExecOpt
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
args := buildCursorArgs(prompt, opts, b.cfg.Logger)
argv0, cmdArgs := chooseCursorInvocation(execName, lookedUp, args, b.cfg.Logger)

View File

@@ -27,10 +27,7 @@ func (b *geminiBackend) Execute(ctx context.Context, prompt string, opts ExecOpt
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
args := buildGeminiArgs(prompt, opts, b.cfg.Logger)

View File

@@ -55,10 +55,7 @@ func (b *hermesBackend) Execute(ctx context.Context, prompt string, opts ExecOpt
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
hermesArgs := append([]string{"acp"}, filterCustomArgs(opts.CustomArgs, hermesBlockedArgs, b.cfg.Logger)...)
cmd := exec.CommandContext(runCtx, execPath, hermesArgs...)

View File

@@ -49,10 +49,7 @@ func (b *kimiBackend) Execute(ctx context.Context, prompt string, opts ExecOptio
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
// `kimi acp` ignores --yolo / --auto-approve (they're flags on the
// root `kimi` command, not on the `acp` subcommand). Instead, the

View File

@@ -54,10 +54,7 @@ func (b *kiroBackend) Execute(ctx context.Context, prompt string, opts ExecOptio
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
kiroArgs := append([]string{"acp", "--trust-all-tools"}, filterCustomArgs(opts.CustomArgs, kiroBlockedArgs, b.cfg.Logger)...)
cmd := exec.CommandContext(runCtx, execPath, kiroArgs...)

View File

@@ -66,10 +66,7 @@ func (b *openclawBackend) Execute(ctx context.Context, prompt string, opts ExecO
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
sessionID := opts.ResumeSessionID
if sessionID == "" {

View File

@@ -47,10 +47,7 @@ func (b *opencodeBackend) Execute(ctx context.Context, prompt string, opts ExecO
execPath = resolved
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
args := []string{"run", "--format", "json", "--dangerously-skip-permissions"}
// Anchor OpenCode's project discovery (AGENTS.md walk-up + .opencode/skills/

View File

@@ -184,9 +184,6 @@ func (b *piBackend) Execute(ctx context.Context, prompt string, opts ExecOptions
}
timeout := opts.Timeout
if timeout == 0 {
timeout = 20 * time.Minute
}
// Pi's --session flag expects a file path where events are appended.
// The path doubles as our opaque session identifier: we return it as
@@ -203,7 +200,7 @@ func (b *piBackend) Execute(ctx context.Context, prompt string, opts ExecOptions
return nil, fmt.Errorf("pi session file: %w", err)
}
runCtx, cancel := context.WithTimeout(ctx, timeout)
runCtx, cancel := runContext(ctx, timeout)
args := buildPiArgs(prompt, sessionPath, opts, b.cfg.Logger)
argv0, cmdArgs := choosePiInvocation(execName, lookedUp, args, b.cfg.Logger)