mirror of
https://github.com/multica-ai/multica.git
synced 2026-06-16 19:29:26 +02:00
fix(daemon): inactivity-based agent run timeout, no wall-clock guillotine (MUL-3064)
Active long-running sessions are no longer killed by a fixed wall-clock deadline. Liveness is delegated to the idle watchdog (MULTICA_AGENT_IDLE_WATCHDOG, default 30m) with a larger in-flight-tool budget (MULTICA_AGENT_TOOL_WATCHDOG, default 2h). MULTICA_AGENT_TIMEOUT is an opt-in absolute cap (default 0 = no cap). The server-side 2.5h sweeper is unchanged as a coarse backstop. Fixes #3745.
This commit is contained in:
@@ -168,7 +168,7 @@ Daemon behavior is configured via flags or environment variables:
|
||||
|---------|------|--------------|---------|
|
||||
| Poll interval | `--poll-interval` | `MULTICA_DAEMON_POLL_INTERVAL` | `3s` |
|
||||
| Heartbeat interval | `--heartbeat-interval` | `MULTICA_DAEMON_HEARTBEAT_INTERVAL` | `15s` |
|
||||
| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `2h` |
|
||||
| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `0` (no cap; bounded by the watchdogs) |
|
||||
| Codex semantic inactivity timeout | `--codex-semantic-inactivity-timeout` | `MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT` | `10m` |
|
||||
| Max concurrent tasks | `--max-concurrent-tasks` | `MULTICA_DAEMON_MAX_CONCURRENT_TASKS` | `20` |
|
||||
| Daemon ID | `--daemon-id` | `MULTICA_DAEMON_ID` | hostname |
|
||||
|
||||
@@ -115,7 +115,7 @@ Daemon behavior is configured via flags or environment variables:
|
||||
|---------|------|--------------|---------|
|
||||
| Poll interval | `--poll-interval` | `MULTICA_DAEMON_POLL_INTERVAL` | `3s` |
|
||||
| Heartbeat interval | `--heartbeat-interval` | `MULTICA_DAEMON_HEARTBEAT_INTERVAL` | `15s` |
|
||||
| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `2h` |
|
||||
| Agent timeout | `--agent-timeout` | `MULTICA_AGENT_TIMEOUT` | `0`(不限制,由看门狗兜底)|
|
||||
| Max concurrent tasks | `--max-concurrent-tasks` | `MULTICA_DAEMON_MAX_CONCURRENT_TASKS` | `20` |
|
||||
| Daemon ID | `--daemon-id` | `MULTICA_DAEMON_ID` | hostname |
|
||||
| Device name | `--device-name` | `MULTICA_DAEMON_DEVICE_NAME` | hostname |
|
||||
|
||||
@@ -179,6 +179,9 @@ API 返回的 `download_url` 在未配置 CloudFront 签名时会指向 `GET /ap
|
||||
| `MULTICA_DAEMON_HEARTBEAT_INTERVAL` | `15s` | 心跳频率 |
|
||||
| `MULTICA_DAEMON_POLL_INTERVAL` | `3s` | 任务轮询频率 |
|
||||
| `MULTICA_DAEMON_MAX_CONCURRENT_TASKS` | `20` | 并发任务上限 |
|
||||
| `MULTICA_AGENT_TIMEOUT` | `0` | 单次任务的绝对墙钟上限;`0` = 不设上限,任务只受看门狗约束(活跃任务不会因为跑得久被杀)。想要硬性成本/资源天花板时再设一个正值 |
|
||||
| `MULTICA_AGENT_IDLE_WATCHDOG` | `30m` | 空闲看门狗:backend 持续静默(无消息、消息队列为空、且没有工具在途)这么久就 force-stop。`0` = 关闭整套看门狗 |
|
||||
| `MULTICA_AGENT_TOOL_WATCHDOG` | `2h` | 工具在途时的静默上限:某个工具调用发出后长时间无任何输出(疑似卡死的子进程)这么久就 force-stop。`0` = 关闭该兜底(在途工具永不被停)|
|
||||
| `MULTICA_<PROVIDER>_PATH` | 对应 CLI 名 | 各 AI 编程工具的可执行文件路径(如 `MULTICA_CLAUDE_PATH`)|
|
||||
| `MULTICA_<PROVIDER>_MODEL` | 空 | 各 AI 编程工具的默认模型 |
|
||||
|
||||
|
||||
@@ -42,6 +42,8 @@ Multica 服务器每 30 秒扫描一次,有两种超时会触发失败:
|
||||
|
||||
两种超时的失败原因都是 `timeout`,**会自动重试**(下一节)。关联的运行时失联判定见 [守护进程与运行时 → 运行时什么时候被判定为离线](/daemon-runtimes#运行时什么时候被判定为离线)。
|
||||
|
||||
上面这层是**服务端的粗粒度兜底**——按任务启动时间算,不看任务是否还在活动。真正区分「卡死」和「正常的长任务」的是**本地守护进程**:它不再用固定墙钟时长砍任务(`MULTICA_AGENT_TIMEOUT` 默认 `0` = 不设上限),而是看活动——只要 agent 还在持续产出事件(消息、工具调用),守护进程就不会因为跑得久判它超时(服务端那条 2.5h 仍是外层上限)。只有真正静默卡死时才会被**空闲看门狗**(`MULTICA_AGENT_IDLE_WATCHDOG`,默认 30 分钟)终止;如果是某个工具调用发出后长时间无任何输出(疑似卡死的子进程),则由更大的**工具看门狗**预算(`MULTICA_AGENT_TOOL_WATCHDOG`,默认 2 小时)兜底。这类被看门狗终止的任务失败原因是 `idle_watchdog`,和墙钟 `timeout` 区分开。各参数见 [环境变量 → 守护进程的调节参数](/environment-variables#守护进程的调节参数)。
|
||||
|
||||
## 哪些失败会自动重试,哪些不会
|
||||
|
||||
失败分两类:**可重试**和**不可重试**。
|
||||
|
||||
@@ -78,7 +78,7 @@ func init() {
|
||||
f.String("runtime-name", "", "Runtime display name (env: MULTICA_AGENT_RUNTIME_NAME)")
|
||||
f.Duration("poll-interval", 0, "Task poll interval (env: MULTICA_DAEMON_POLL_INTERVAL)")
|
||||
f.Duration("heartbeat-interval", 0, "Heartbeat interval (env: MULTICA_DAEMON_HEARTBEAT_INTERVAL)")
|
||||
f.Duration("agent-timeout", 0, "Per-task timeout (env: MULTICA_AGENT_TIMEOUT)")
|
||||
f.Duration("agent-timeout", 0, "Absolute per-task wall-clock cap; 0 = no cap, rely on the watchdogs (env: MULTICA_AGENT_TIMEOUT)")
|
||||
f.Duration("codex-semantic-inactivity-timeout", 0, "Codex semantic inactivity timeout (env: MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT)")
|
||||
f.Int("max-concurrent-tasks", 0, "Max tasks running in parallel (env: MULTICA_DAEMON_MAX_CONCURRENT_TASKS)")
|
||||
f.Bool("no-auto-update", false, "Disable periodic CLI self-update (env: MULTICA_DAEMON_AUTO_UPDATE=false)")
|
||||
@@ -97,7 +97,7 @@ func init() {
|
||||
rf.String("runtime-name", "", "Runtime display name (env: MULTICA_AGENT_RUNTIME_NAME)")
|
||||
rf.Duration("poll-interval", 0, "Task poll interval (env: MULTICA_DAEMON_POLL_INTERVAL)")
|
||||
rf.Duration("heartbeat-interval", 0, "Heartbeat interval (env: MULTICA_DAEMON_HEARTBEAT_INTERVAL)")
|
||||
rf.Duration("agent-timeout", 0, "Per-task timeout (env: MULTICA_AGENT_TIMEOUT)")
|
||||
rf.Duration("agent-timeout", 0, "Absolute per-task wall-clock cap; 0 = no cap, rely on the watchdogs (env: MULTICA_AGENT_TIMEOUT)")
|
||||
rf.Duration("codex-semantic-inactivity-timeout", 0, "Codex semantic inactivity timeout (env: MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT)")
|
||||
rf.Int("max-concurrent-tasks", 0, "Max tasks running in parallel (env: MULTICA_DAEMON_MAX_CONCURRENT_TASKS)")
|
||||
rf.Bool("no-auto-update", false, "Disable periodic CLI self-update (env: MULTICA_DAEMON_AUTO_UPDATE=false)")
|
||||
@@ -284,7 +284,10 @@ func buildDaemonStartArgs(cmd *cobra.Command) []string {
|
||||
if d, _ := cmd.Flags().GetDuration("heartbeat-interval"); d > 0 {
|
||||
args = append(args, "--heartbeat-interval", d.String())
|
||||
}
|
||||
if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 {
|
||||
// Forward agent-timeout when explicitly set, including an explicit 0
|
||||
// (= no cap), so it can override an environment MULTICA_AGENT_TIMEOUT.
|
||||
if cmd.Flags().Changed("agent-timeout") {
|
||||
d, _ := cmd.Flags().GetDuration("agent-timeout")
|
||||
args = append(args, "--agent-timeout", d.String())
|
||||
}
|
||||
if d, _ := cmd.Flags().GetDuration("codex-semantic-inactivity-timeout"); d > 0 {
|
||||
@@ -336,8 +339,11 @@ func runDaemonForeground(cmd *cobra.Command) error {
|
||||
if d, _ := cmd.Flags().GetDuration("heartbeat-interval"); d > 0 {
|
||||
overrides.HeartbeatInterval = d
|
||||
}
|
||||
if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 {
|
||||
overrides.AgentTimeout = d
|
||||
// Distinguish "flag not passed" from an explicit `--agent-timeout 0` so a
|
||||
// user can turn off an env-configured cap from the CLI.
|
||||
if cmd.Flags().Changed("agent-timeout") {
|
||||
d, _ := cmd.Flags().GetDuration("agent-timeout")
|
||||
overrides.AgentTimeout = &d
|
||||
}
|
||||
if d, _ := cmd.Flags().GetDuration("codex-semantic-inactivity-timeout"); d > 0 {
|
||||
overrides.CodexSemanticInactivityTimeout = d
|
||||
|
||||
@@ -36,8 +36,12 @@ const (
|
||||
// The dispatched→running transition should be near-instant, so 5 minutes
|
||||
// means something went wrong (e.g. StartTask API call failed silently).
|
||||
dispatchTimeoutSeconds = 300.0
|
||||
// runningTimeoutSeconds fails tasks stuck in 'running' beyond this.
|
||||
// The default agent timeout is 2h, so 2.5h gives a generous buffer.
|
||||
// runningTimeoutSeconds fails tasks stuck in 'running' beyond this. It is a
|
||||
// coarse server-side backstop keyed on started_at (it does NOT look at task
|
||||
// activity) — mainly for runs whose daemon died without reporting. The
|
||||
// daemon itself decides stuck-vs-long-running by activity (idle/tool
|
||||
// watchdog), so this only needs to sit generously above any realistic single
|
||||
// run rather than track a per-run wall-clock cap (MUL-3064).
|
||||
runningTimeoutSeconds = 9000.0
|
||||
// queuedTTLSeconds expires tasks that have been sitting in 'queued'
|
||||
// for longer than this without ever being claimed. This is the cleanup
|
||||
@@ -46,9 +50,8 @@ const (
|
||||
// tasks already on the queue when a runtime drops off (or that lost
|
||||
// the race against a runtime that went offline mid-tick) need a
|
||||
// time-bounded exit. 2 hours is conservatively above any reasonable
|
||||
// "queued behind a long-running task" window for an online runtime
|
||||
// (default agent timeout is 2h, sweeper interval is 30s) so we don't
|
||||
// expire legitimately-pending work, while still draining the historical
|
||||
// "queued behind a long-running task" window for an online runtime, so we
|
||||
// don't expire legitimately-pending work, while still draining the historical
|
||||
// 87k autopilot backlog within ~24h once enabled.
|
||||
queuedTTLSeconds = 2 * 3600.0
|
||||
// queuedExpireBatchSize caps how many queued rows a single sweeper tick
|
||||
|
||||
@@ -15,23 +15,39 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
DefaultServerURL = "ws://localhost:8080/ws"
|
||||
DefaultPollInterval = 30 * time.Second
|
||||
DefaultHeartbeatInterval = 15 * time.Second
|
||||
DefaultAgentTimeout = 2 * time.Hour
|
||||
DefaultServerURL = "ws://localhost:8080/ws"
|
||||
DefaultPollInterval = 30 * time.Second
|
||||
DefaultHeartbeatInterval = 15 * time.Second
|
||||
// DefaultAgentTimeout is the optional absolute wall-clock cap on a single
|
||||
// agent run. 0 = no cap: a run is bounded only by the inactivity watchdogs
|
||||
// (DefaultAgentIdleWatchdog / DefaultAgentToolWatchdog), so a session that keeps emitting events is
|
||||
// never killed merely for running long (MUL-3064). Operators who want a
|
||||
// hard ceiling for cost/resource control can set MULTICA_AGENT_TIMEOUT.
|
||||
DefaultAgentTimeout = 0
|
||||
DefaultCodexSemanticInactivityTimeout = 10 * time.Minute
|
||||
// DefaultAgentIdleWatchdog is the per-task safety net that force-stops a
|
||||
// run when the backend has emitted no message for this long AND its
|
||||
// message queue is empty. Backends like Claude Code can hang indefinitely
|
||||
// on a stuck child process (e.g. `docker ps` against a frozen dockerd),
|
||||
// in which case `cmd.Wait()` never returns and the task sits at "running"
|
||||
// for its full DefaultAgentTimeout (2 h). The previous 5 min default
|
||||
// in which case `cmd.Wait()` never returns. With no wall-clock cap
|
||||
// (DefaultAgentTimeout = 0) such a run would otherwise sit at "running"
|
||||
// forever, so this watchdog is its sole liveness net. The previous 5 min default
|
||||
// killed legitimate long assistant outputs (e.g. RFC-length writeups)
|
||||
// where the model streams a single message for many minutes without any
|
||||
// daemon-visible activity — see MUL-2300. 30 min keeps the safety net for
|
||||
// truly stuck runs (dockerd hang) while leaving headroom for long writes.
|
||||
// Set MULTICA_AGENT_IDLE_WATCHDOG=0 to disable.
|
||||
DefaultAgentIdleWatchdog = 30 * time.Minute
|
||||
DefaultAgentIdleWatchdog = 30 * time.Minute
|
||||
// DefaultAgentToolWatchdog bounds how long a single tool call may stay in
|
||||
// flight (tool_use emitted, no tool_result and no other message) before the
|
||||
// idle watchdog force-stops the run. The idle watchdog ignores its normal
|
||||
// window while a tool is in flight, because a real build/install/test
|
||||
// legitimately runs silently for many minutes — but with no wall-clock cap
|
||||
// (DefaultAgentTimeout = 0) a backend that emits tool_use and never the
|
||||
// matching tool_result would otherwise run forever. This is the backstop for
|
||||
// that stuck-tool case (MUL-3064). Set MULTICA_AGENT_TOOL_WATCHDOG=0 to
|
||||
// disable, in which case an in-flight tool never force-stops the run.
|
||||
DefaultAgentToolWatchdog = 2 * time.Hour
|
||||
DefaultRuntimeName = "Local Agent"
|
||||
DefaultWorkspaceSyncInterval = 30 * time.Second
|
||||
DefaultHealthPort = 19514
|
||||
@@ -79,6 +95,7 @@ type Config struct {
|
||||
AgentTimeout time.Duration
|
||||
CodexSemanticInactivityTimeout time.Duration
|
||||
AgentIdleWatchdog time.Duration // force-stop a run when the backend goes silent this long with an empty queue (0 = disabled)
|
||||
AgentToolWatchdog time.Duration // force-stop a run when a single tool call stays in flight (silent) this long (0 = disabled); backstop for hung tools now that there is no wall-clock cap
|
||||
ClaudeArgs []string
|
||||
CodexArgs []string
|
||||
}
|
||||
@@ -86,11 +103,13 @@ type Config struct {
|
||||
// Overrides allows CLI flags to override environment variables and defaults.
|
||||
// Zero values (nil for pointer fields such as AgentTimeout, where an explicit
// 0 is meaningful) are ignored and the env/default value is used instead.
|
||||
type Overrides struct {
|
||||
ServerURL string
|
||||
WorkspacesRoot string
|
||||
PollInterval time.Duration
|
||||
HeartbeatInterval time.Duration
|
||||
AgentTimeout time.Duration
|
||||
ServerURL string
|
||||
WorkspacesRoot string
|
||||
PollInterval time.Duration
|
||||
HeartbeatInterval time.Duration
|
||||
// AgentTimeout is a pointer so an explicit `--agent-timeout 0` (no cap) is
|
||||
// distinguishable from "flag not passed". nil = use env/default.
|
||||
AgentTimeout *time.Duration
|
||||
CodexSemanticInactivityTimeout time.Duration
|
||||
MaxConcurrentTasks int
|
||||
DaemonID string
|
||||
@@ -260,8 +279,8 @@ func LoadConfig(overrides Overrides) (Config, error) {
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
if overrides.AgentTimeout > 0 {
|
||||
agentTimeout = overrides.AgentTimeout
|
||||
if overrides.AgentTimeout != nil {
|
||||
agentTimeout = *overrides.AgentTimeout
|
||||
}
|
||||
|
||||
codexSemanticInactivityTimeout, err := durationFromEnv("MULTICA_CODEX_SEMANTIC_INACTIVITY_TIMEOUT", DefaultCodexSemanticInactivityTimeout)
|
||||
@@ -280,6 +299,13 @@ func LoadConfig(overrides Overrides) (Config, error) {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
// MULTICA_AGENT_TOOL_WATCHDOG=0 disables the in-flight-tool backstop; any
|
||||
// positive duration overrides DefaultAgentToolWatchdog.
|
||||
agentToolWatchdog, err := durationFromEnv("MULTICA_AGENT_TOOL_WATCHDOG", DefaultAgentToolWatchdog)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
|
||||
maxConcurrentTasks, err := intFromEnv("MULTICA_DAEMON_MAX_CONCURRENT_TASKS", DefaultMaxConcurrentTasks)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
@@ -428,6 +454,7 @@ func LoadConfig(overrides Overrides) (Config, error) {
|
||||
AgentTimeout: agentTimeout,
|
||||
CodexSemanticInactivityTimeout: codexSemanticInactivityTimeout,
|
||||
AgentIdleWatchdog: agentIdleWatchdog,
|
||||
AgentToolWatchdog: agentToolWatchdog,
|
||||
ClaudeArgs: claudeArgs,
|
||||
CodexArgs: codexArgs,
|
||||
}, nil
|
||||
|
||||
@@ -3087,15 +3087,21 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro
|
||||
}
|
||||
taskLog.Debug("backend started, draining messages")
|
||||
|
||||
// Create an independent drain deadline so we don't block forever if the
|
||||
// backend's internal timeout fails to produce a Result (e.g. scanner
|
||||
// stuck on a hung stdout pipe). The extra 30 s gives the backend time
|
||||
// to clean up after its own timeout fires.
|
||||
drainTimeout := opts.Timeout + 30*time.Second
|
||||
if opts.Timeout == 0 {
|
||||
drainTimeout = 21 * time.Minute
|
||||
// Bound the drain loop only when there is a wall-clock cap. With a positive
|
||||
// opts.Timeout, give the drain a slightly longer deadline than the backend
|
||||
// so it can still collect the backend's own timeout Result if the scanner
|
||||
// is stuck on a hung stdout pipe (the extra 30 s covers cleanup after the
|
||||
// backend's own deadline fires). With no cap (opts.Timeout <= 0) the
|
||||
// inactivity watchdog is the only liveness net, so the drain must NOT
|
||||
// impose its own deadline either — otherwise an actively streaming long run
|
||||
// would be cut off here regardless of progress (MUL-3064).
|
||||
var drainCtx context.Context
|
||||
var drainCancel context.CancelFunc
|
||||
if opts.Timeout > 0 {
|
||||
drainCtx, drainCancel = context.WithTimeout(agentCtx, opts.Timeout+30*time.Second)
|
||||
} else {
|
||||
drainCtx, drainCancel = context.WithCancel(agentCtx)
|
||||
}
|
||||
drainCtx, drainCancel := context.WithTimeout(agentCtx, drainTimeout)
|
||||
defer drainCancel()
|
||||
|
||||
var toolCount atomic.Int32
|
||||
@@ -3110,12 +3116,18 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro
|
||||
// with a matching tool_result. A non-zero count means the agent is
|
||||
// legitimately waiting on a tool (e.g. `npm install`, `docker build`)
|
||||
// that may run far longer than the idle window without emitting any
|
||||
// message — so the watchdog must not interpret that silence as a hang.
|
||||
// message — so while a tool is in flight the watchdog applies the larger
|
||||
// AgentToolWatchdog budget instead of treating that silence as a hang.
|
||||
var inFlightTools atomic.Int32
|
||||
var idleWatchdogFired atomic.Bool
|
||||
// idleWatchdogThreshold records (as nanos) which silence budget actually
|
||||
// tripped the watchdog — the idle window or the larger in-flight-tool
|
||||
// window — so the failure message reports the real duration.
|
||||
var idleWatchdogThreshold atomic.Int64
|
||||
idleWatchdogThreshold.Store(int64(d.cfg.AgentIdleWatchdog))
|
||||
idleWindow := d.cfg.AgentIdleWatchdog
|
||||
if idleWindow > 0 {
|
||||
go d.runIdleWatchdog(agentCtx, idleWindow, &lastActivityAt, &inFlightTools, &idleWatchdogFired, agentCancel, session.Messages, taskLog, taskID)
|
||||
go d.runIdleWatchdog(agentCtx, idleWindow, d.cfg.AgentToolWatchdog, &lastActivityAt, &inFlightTools, &idleWatchdogFired, &idleWatchdogThreshold, agentCancel, session.Messages, taskLog, taskID)
|
||||
}
|
||||
|
||||
go func() {
|
||||
@@ -3302,7 +3314,7 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro
|
||||
// generic "agent_error" bucket the aborted path falls into.
|
||||
result.Status = "idle_watchdog"
|
||||
if result.Error == "" {
|
||||
result.Error = idleWatchdogReason(idleWindow)
|
||||
result.Error = idleWatchdogReason(time.Duration(idleWatchdogThreshold.Load()))
|
||||
}
|
||||
}
|
||||
return result, toolCount.Load(), nil
|
||||
@@ -3314,7 +3326,7 @@ func (d *Daemon) executeAndDrain(ctx context.Context, backend agent.Backend, pro
|
||||
if idleWatchdogFired.Load() {
|
||||
return agent.Result{
|
||||
Status: "idle_watchdog",
|
||||
Error: idleWatchdogReason(idleWindow),
|
||||
Error: idleWatchdogReason(time.Duration(idleWatchdogThreshold.Load())),
|
||||
}, toolCount.Load(), nil
|
||||
}
|
||||
// Distinguish external cancellation (e.g. server-initiated cancel
|
||||
@@ -3343,24 +3355,28 @@ func idleWatchdogReason(window time.Duration) string {
|
||||
}
|
||||
|
||||
// runIdleWatchdog ticks until either agentCtx is cancelled or the backend has
|
||||
// been silent for at least window with no in-flight tool call. On firing, it
|
||||
// sets fired and calls cancel, which propagates to the agent subprocess (via
|
||||
// the ctx passed to backend.Execute) and to drainCtx. The check requires:
|
||||
// been silent past the applicable budget. On firing, it records the tripped
|
||||
// threshold, sets fired, and calls cancel, which propagates to the agent
|
||||
// subprocess (via the ctx passed to backend.Execute) and to drainCtx. The
|
||||
// silence budget depends on whether a tool call is in flight:
|
||||
//
|
||||
// 1. inFlightTools == 0 — the backend has emitted a tool_use whose
|
||||
// matching tool_result hasn't arrived yet, meaning a real tool (e.g.
|
||||
// `npm install`, `docker build`) is legitimately running. Long tool
|
||||
// calls produce no messages between use and result; killing here would
|
||||
// yank the agent mid-build. AND
|
||||
// 2. time since lastActivityAt exceeds window — the drain loop is single
|
||||
// reader, so a stale stamp means no message has actually arrived; AND
|
||||
// 3. session.Messages buffer is empty — defensive against a hypothetical
|
||||
// drain stall where unprocessed messages would still imply progress.
|
||||
// 1. No tool in flight — a silent backend is a hang after `window`.
|
||||
// 2. A tool in flight (tool_use with no matching tool_result yet) — a real
|
||||
// tool (e.g. `npm install`, `docker build`) legitimately runs silently for
|
||||
// many minutes, so the larger `toolWindow` applies instead. toolWindow <= 0
|
||||
// keeps the historical behavior of never force-stopping while a tool is in
|
||||
// flight. Without this in-flight budget a backend that emits tool_use and
|
||||
// never the matching tool_result would run forever now that there is no
|
||||
// wall-clock cap (MUL-3064).
|
||||
//
|
||||
// In both cases the watchdog also requires the session.Messages buffer to be
|
||||
// empty — a buffered-but-undrained message means the drain loop is behind, not
|
||||
// the backend.
|
||||
//
|
||||
// Tick interval is window/2 (floored at 30 s in production, but the floor only
|
||||
// kicks in for windows >= 1 min so tests can pass tiny windows like 50 ms and
|
||||
// see the watchdog fire within a few ticks).
|
||||
func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window time.Duration, lastActivityAt *atomic.Int64, inFlightTools *atomic.Int32, fired *atomic.Bool, cancel context.CancelFunc, messages <-chan agent.Message, taskLog *slog.Logger, taskID string) {
|
||||
func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window, toolWindow time.Duration, lastActivityAt *atomic.Int64, inFlightTools *atomic.Int32, fired *atomic.Bool, firedThreshold *atomic.Int64, cancel context.CancelFunc, messages <-chan agent.Message, taskLog *slog.Logger, taskID string) {
|
||||
interval := window / 2
|
||||
if window >= time.Minute && interval < 30*time.Second {
|
||||
interval = 30 * time.Second
|
||||
@@ -3375,16 +3391,21 @@ func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window time.Duration,
|
||||
case <-agentCtx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
// In-flight tool call: the agent has emitted tool_use and
|
||||
// the corresponding tool_result hasn't landed yet. A long
|
||||
// build/install/test can sit here silently for many minutes
|
||||
// — that is forward progress, not a hang.
|
||||
if inFlightTools.Load() > 0 {
|
||||
continue
|
||||
// Pick the silence budget. A tool in flight is expected to be
|
||||
// silent (a long build/install/test emits nothing between
|
||||
// tool_use and tool_result), so it gets the larger toolWindow;
|
||||
// toolWindow <= 0 disables the in-flight bound entirely.
|
||||
threshold := window
|
||||
toolInFlight := inFlightTools.Load() > 0
|
||||
if toolInFlight {
|
||||
if toolWindow <= 0 {
|
||||
continue
|
||||
}
|
||||
threshold = toolWindow
|
||||
}
|
||||
last := time.Unix(0, lastActivityAt.Load())
|
||||
idleFor := time.Since(last)
|
||||
if idleFor < window {
|
||||
if idleFor < threshold {
|
||||
continue
|
||||
}
|
||||
// A buffered-but-undrained message means the drain loop is
|
||||
@@ -3396,8 +3417,10 @@ func (d *Daemon) runIdleWatchdog(agentCtx context.Context, window time.Duration,
|
||||
taskLog.Warn("idle watchdog firing: no agent activity, force-stopping run",
|
||||
"task", shortID(taskID),
|
||||
"idle_for", idleFor.Round(time.Second).String(),
|
||||
"threshold", window.String(),
|
||||
"threshold", threshold.String(),
|
||||
"tool_in_flight", toolInFlight,
|
||||
)
|
||||
firedThreshold.Store(int64(threshold))
|
||||
fired.Store(true)
|
||||
cancel()
|
||||
return
|
||||
|
||||
@@ -1096,8 +1096,9 @@ func TestExecuteAndDrain_ContextCancelled_ReportsCancelled(t *testing.T) {
|
||||
|
||||
// idleWatchdogBackend simulates the MUL-2225 hang: emit one message to mark
|
||||
// activity, then go silent forever. With a short AgentIdleWatchdog, the
|
||||
// watchdog should fire and short-circuit executeAndDrain instead of waiting
|
||||
// for the full drainTimeout (which is ~21 minutes by default).
|
||||
// watchdog should fire and short-circuit executeAndDrain. With no wall-clock
|
||||
// cap (opts.Timeout = 0) the drain loop imposes no deadline of its own, so the
|
||||
// idle watchdog is the only thing that ends this otherwise-forever-silent run.
|
||||
type idleWatchdogBackend struct {
|
||||
emitOne bool // when true, emit one message before going silent; when false, never emit anything
|
||||
}
|
||||
@@ -1285,6 +1286,45 @@ func TestExecuteAndDrain_IdleWatchdog_DoesNotFireDuringInFlightToolCall(t *testi
|
||||
}
|
||||
}
|
||||
|
||||
// stuckInFlightToolBackend models a hung tool: it emits a tool_use and then
|
||||
// goes silent forever — the matching tool_result never arrives, so inFlightTools
|
||||
// stays at 1 (e.g. a child process that never returns). With no wall-clock cap
|
||||
// (the MUL-3064 default), AgentToolWatchdog is the only thing that ends it.
|
||||
type stuckInFlightToolBackend struct{}
|
||||
|
||||
func (stuckInFlightToolBackend) Execute(_ context.Context, _ string, _ agent.ExecOptions) (*agent.Session, error) {
|
||||
msgCh := make(chan agent.Message, 2)
|
||||
resCh := make(chan agent.Result)
|
||||
msgCh <- agent.Message{Type: agent.MessageToolUse, Tool: "Bash", CallID: "c1"}
|
||||
// Deliberately leave msgCh open, never emit tool_result, never write resCh.
|
||||
return &agent.Session{Messages: msgCh, Result: resCh}, nil
|
||||
}
|
||||
|
||||
func TestExecuteAndDrain_IdleWatchdog_FiresOnStuckInFlightTool(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
d := newTestDaemon(t)
|
||||
// The normal idle window would be skipped while a tool is in flight; the
|
||||
// AgentToolWatchdog budget is what must fire here.
|
||||
d.cfg.AgentIdleWatchdog = 50 * time.Millisecond
|
||||
d.cfg.AgentToolWatchdog = 50 * time.Millisecond
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
t.Cleanup(cancel)
|
||||
|
||||
start := time.Now()
|
||||
result, _, err := d.executeAndDrain(ctx, stuckInFlightToolBackend{}, "p", agent.ExecOptions{}, slog.Default(), "t-stuck-tool")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if result.Status != "idle_watchdog" {
|
||||
t.Fatalf("expected status=idle_watchdog for a hung in-flight tool, got %q (err=%q)", result.Status, result.Error)
|
||||
}
|
||||
if elapsed := time.Since(start); elapsed > 2*time.Second {
|
||||
t.Fatalf("tool watchdog took too long to fire: %s (window=%s)", elapsed, d.cfg.AgentToolWatchdog)
|
||||
}
|
||||
}
|
||||
|
||||
// tailIdleAfterToolBackend exercises the boundary case: a tool call completes,
|
||||
// and THEN the backend goes silent without ever finishing. After the
|
||||
// tool_result lands, in-flight count returns to zero and lastActivityAt is
|
||||
|
||||
@@ -47,6 +47,19 @@ type ExecOptions struct {
|
||||
ThinkingLevel string
|
||||
}
|
||||
|
||||
// runContext builds the context an agent subprocess runs under, based on the
// configured per-run timeout.
//
// A positive timeout yields a context with a hard wall-clock deadline of that
// length. A zero or negative timeout yields a context with NO deadline: it ends
// only when the parent is cancelled or the returned CancelFunc is called, which
// leaves liveness to the daemon's inactivity watchdog so that a session that
// keeps emitting events is never killed merely for running long (MUL-3064).
//
// The caller owns the returned CancelFunc and must call it to release
// resources.
func runContext(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc) {
	if timeout <= 0 {
		// No cap configured: cancellable, but never expires on its own.
		return context.WithCancel(ctx)
	}
	return context.WithTimeout(ctx, timeout)
}
|
||||
|
||||
// Session represents a running agent execution.
|
||||
type Session struct {
|
||||
// Messages streams events as the agent works. The channel is closed
|
||||
|
||||
@@ -3,6 +3,7 @@ package agent
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestNewReturnsClaudeBackend(t *testing.T) {
|
||||
@@ -98,3 +99,37 @@ func TestLaunchHeaderReturnsEmptyForUnknownType(t *testing.T) {
|
||||
t.Errorf("expected empty header for unknown type, got %q", header)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunContextZeroTimeoutHasNoDeadline(t *testing.T) {
|
||||
t.Parallel()
|
||||
// A zero (or negative) timeout must NOT impose a wall-clock deadline:
|
||||
// liveness is delegated to the daemon's inactivity watchdog so an actively
|
||||
// streaming long-running session is never killed merely for running long
|
||||
// (MUL-3064).
|
||||
for _, d := range []time.Duration{0, -time.Second} {
|
||||
ctx, cancel := runContext(context.Background(), d)
|
||||
if _, ok := ctx.Deadline(); ok {
|
||||
cancel()
|
||||
t.Fatalf("runContext(%s) imposed a deadline; want none", d)
|
||||
}
|
||||
cancel()
|
||||
if ctx.Err() == nil {
|
||||
t.Fatalf("runContext(%s): context should be cancelled after cancel()", d)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunContextPositiveTimeoutHasDeadline(t *testing.T) {
|
||||
t.Parallel()
|
||||
// A positive timeout keeps the hard wall-clock deadline (the opt-in
|
||||
// absolute cap operators can still set via MULTICA_AGENT_TIMEOUT).
|
||||
ctx, cancel := runContext(context.Background(), time.Hour)
|
||||
defer cancel()
|
||||
deadline, ok := ctx.Deadline()
|
||||
if !ok {
|
||||
t.Fatal("runContext(1h) should impose a deadline")
|
||||
}
|
||||
if remaining := time.Until(deadline); remaining <= 0 || remaining > time.Hour+time.Minute {
|
||||
t.Fatalf("unexpected deadline remaining: %s", remaining)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,10 +39,7 @@ func (b *antigravityBackend) Execute(ctx context.Context, prompt string, opts Ex
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
logFile, err := os.CreateTemp("", "multica-agy-log-*.log")
|
||||
if err != nil {
|
||||
@@ -216,9 +213,16 @@ func buildAntigravityArgs(prompt, logPath string, timeout time.Duration, opts Ex
|
||||
args := []string{
|
||||
"-p", prompt,
|
||||
"--dangerously-skip-permissions",
|
||||
"--print-timeout", antigravityFormatTimeout(timeout),
|
||||
"--log-file", logPath,
|
||||
}
|
||||
// Only pass --print-timeout when a positive wall-clock cap is configured.
|
||||
// timeout <= 0 means "no cap" (MUL-3064): agy then runs without its own
|
||||
// print-timeout guillotine, matching every other backend's runContext
|
||||
// semantics. Passing antigravityFormatTimeout(0) would clamp to 1s and kill
|
||||
// the run almost immediately — the opposite of "no cap".
|
||||
if timeout > 0 {
|
||||
args = append(args, "--print-timeout", antigravityFormatTimeout(timeout))
|
||||
}
|
||||
args = append(args, "--log-file", logPath)
|
||||
if opts.ResumeSessionID != "" {
|
||||
args = append(args, "--conversation", opts.ResumeSessionID)
|
||||
}
|
||||
|
||||
@@ -38,6 +38,34 @@ func TestBuildAntigravityArgsBasic(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildAntigravityArgsNoTimeoutOmitsPrintTimeout(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// timeout <= 0 means "no wall-clock cap" (MUL-3064): agy must be launched
|
||||
// WITHOUT --print-timeout, otherwise antigravityFormatTimeout(0) clamps to
|
||||
// 1s and the run is killed almost immediately — the opposite of "no cap".
|
||||
args := buildAntigravityArgs(
|
||||
"hello",
|
||||
"/tmp/agy.log",
|
||||
0,
|
||||
ExecOptions{Cwd: "/work"},
|
||||
quietAntigravityLogger(),
|
||||
)
|
||||
|
||||
want := []string{
|
||||
"-p", "hello",
|
||||
"--dangerously-skip-permissions",
|
||||
"--log-file", "/tmp/agy.log",
|
||||
"--add-dir", "/work",
|
||||
}
|
||||
if !slices.Equal(args, want) {
|
||||
t.Fatalf("buildAntigravityArgs(timeout=0) mismatch\n got: %v\nwant: %v", args, want)
|
||||
}
|
||||
if slices.Contains(args, "--print-timeout") {
|
||||
t.Fatalf("--print-timeout must be omitted when timeout <= 0; got %v", args)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildAntigravityArgsResume(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -30,10 +30,7 @@ func (b *claudeBackend) Execute(ctx context.Context, prompt string, opts ExecOpt
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
args := buildClaudeArgs(opts, b.cfg.Logger)
|
||||
|
||||
|
||||
@@ -499,14 +499,11 @@ func (b *codexBackend) Execute(ctx context.Context, prompt string, opts ExecOpti
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
semanticInactivityTimeout := opts.SemanticInactivityTimeout
|
||||
if semanticInactivityTimeout == 0 {
|
||||
semanticInactivityTimeout = defaultCodexSemanticInactivityTimeout
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
// Materialise the agent's MCP config into the per-task
|
||||
// `$CODEX_HOME/config.toml`. Argv would be the simpler path, but
|
||||
|
||||
@@ -203,10 +203,7 @@ func (b *copilotBackend) Execute(ctx context.Context, prompt string, opts ExecOp
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
args := buildCopilotArgs(prompt, opts, b.cfg.Logger)
|
||||
argv0, cmdArgs := chooseCopilotInvocation(execName, lookedUp, args, b.cfg.Logger)
|
||||
|
||||
@@ -31,10 +31,7 @@ func (b *cursorBackend) Execute(ctx context.Context, prompt string, opts ExecOpt
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
args := buildCursorArgs(prompt, opts, b.cfg.Logger)
|
||||
argv0, cmdArgs := chooseCursorInvocation(execName, lookedUp, args, b.cfg.Logger)
|
||||
|
||||
@@ -27,10 +27,7 @@ func (b *geminiBackend) Execute(ctx context.Context, prompt string, opts ExecOpt
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
args := buildGeminiArgs(prompt, opts, b.cfg.Logger)
|
||||
|
||||
|
||||
@@ -55,10 +55,7 @@ func (b *hermesBackend) Execute(ctx context.Context, prompt string, opts ExecOpt
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
hermesArgs := append([]string{"acp"}, filterCustomArgs(opts.CustomArgs, hermesBlockedArgs, b.cfg.Logger)...)
|
||||
cmd := exec.CommandContext(runCtx, execPath, hermesArgs...)
|
||||
|
||||
@@ -49,10 +49,7 @@ func (b *kimiBackend) Execute(ctx context.Context, prompt string, opts ExecOptio
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
// `kimi acp` ignores --yolo / --auto-approve (they're flags on the
|
||||
// root `kimi` command, not on the `acp` subcommand). Instead, the
|
||||
|
||||
@@ -54,10 +54,7 @@ func (b *kiroBackend) Execute(ctx context.Context, prompt string, opts ExecOptio
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
kiroArgs := append([]string{"acp", "--trust-all-tools"}, filterCustomArgs(opts.CustomArgs, kiroBlockedArgs, b.cfg.Logger)...)
|
||||
cmd := exec.CommandContext(runCtx, execPath, kiroArgs...)
|
||||
|
||||
@@ -66,10 +66,7 @@ func (b *openclawBackend) Execute(ctx context.Context, prompt string, opts ExecO
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
sessionID := opts.ResumeSessionID
|
||||
if sessionID == "" {
|
||||
|
||||
@@ -47,10 +47,7 @@ func (b *opencodeBackend) Execute(ctx context.Context, prompt string, opts ExecO
|
||||
execPath = resolved
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
args := []string{"run", "--format", "json", "--dangerously-skip-permissions"}
|
||||
// Anchor OpenCode's project discovery (AGENTS.md walk-up + .opencode/skills/
|
||||
|
||||
@@ -184,9 +184,6 @@ func (b *piBackend) Execute(ctx context.Context, prompt string, opts ExecOptions
|
||||
}
|
||||
|
||||
timeout := opts.Timeout
|
||||
if timeout == 0 {
|
||||
timeout = 20 * time.Minute
|
||||
}
|
||||
|
||||
// Pi's --session flag expects a file path where events are appended.
|
||||
// The path doubles as our opaque session identifier: we return it as
|
||||
@@ -203,7 +200,7 @@ func (b *piBackend) Execute(ctx context.Context, prompt string, opts ExecOptions
|
||||
return nil, fmt.Errorf("pi session file: %w", err)
|
||||
}
|
||||
|
||||
runCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
runCtx, cancel := runContext(ctx, timeout)
|
||||
|
||||
args := buildPiArgs(prompt, sessionPath, opts, b.cfg.Logger)
|
||||
argv0, cmdArgs := choosePiInvocation(execName, lookedUp, args, b.cfg.Logger)
|
||||
|
||||
Reference in New Issue
Block a user