Compare commits

...

1 Commits

Author SHA1 Message Date
J
e9ccdf82a3 fix(agent): stop Antigravity turns dying at agy's hidden 5m print-timeout
agy's --print-timeout defaults to 5m when the flag is omitted, but the
daemon treated "omit the flag" as "no cap". In the default no-cap config
every Antigravity turn was therefore silently capped at 5 minutes: any
run whose build/tests outlived the budget had agy abort mid-turn, print
"Error: timed out waiting for response", and exit 0 — which the backend
recorded as a successful "completed" with truncated output (the reported
"Antigravity disconnects", MUL-3570 / #4453).

- Always pass --print-timeout: the configured cap when positive, else a
  large value (24h) that defers to the daemon's idle/tool watchdogs.
- Detect agy's print-mode timeout marker in the run log and surface the
  result as a timeout instead of a truncated success.

Verified by reproducing against agy 1.0.8 and with new unit + end-to-end
backend tests.

Co-authored-by: multica-agent <github@multica.ai>
2026-06-23 17:29:55 +08:00
2 changed files with 203 additions and 14 deletions

View File

@@ -146,6 +146,17 @@ func (b *antigravityBackend) Execute(ctx context.Context, prompt string, opts Ex
} else if waitErr != nil && finalStatus == "completed" {
finalStatus = "failed"
finalError = fmt.Sprintf("agy exited with error: %v", waitErr)
} else if finalStatus == "completed" && antigravityPrintTimedOut(logPath) {
// agy hit its own --print-timeout: it printed "Error: timed out
// waiting for response" to stdout and EXITED 0, so runCtx never
// tripped and waitErr is nil — the checks above leave the turn as
// "completed". Surface it as a real timeout instead of a truncated
// success the user can't distinguish from a finished task (MUL-3570).
finalStatus = "timeout"
finalError = fmt.Sprintf(
"agy print mode timed out after %s waiting for the agent response; a long-running command likely outlived --print-timeout",
antigravityPrintTimeout(timeout),
)
}
if finalError != "" {
finalError = withAgentStderr(finalError, "agy", stderrBuf.Tail())
@@ -179,6 +190,32 @@ var antigravityConversationIDRe = regexp.MustCompile(
`conversation=([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})`,
)
// antigravityPrintTimeoutRe matches the glog line agy's printmode.go writes when
// the print-mode wall-clock budget (--print-timeout) elapses before the agent
// produced a final response. agy then prints "Error: timed out waiting for
// response" to stdout and EXITS 0 — runCtx never trips and cmd.Wait returns nil
// — so without this signal the daemon would record the truncated turn as a
// successful "completed" (MUL-3570).
//
// Example: `E0623 17:17:59.017212 65926 printmode.go:289] Print mode: timed out
// after 100 polls (printed=3)`
var antigravityPrintTimeoutRe = regexp.MustCompile(`Print mode: timed out after \d+ polls`)
// antigravityPrintTimedOut reports whether the per-run log shows agy hit its own
// print-mode timeout. Best-effort: returns false if the log is missing or the
// marker format changes upstream, in which case the run is classified by its
// exit status as before.
func antigravityPrintTimedOut(logPath string) bool {
if logPath == "" {
return false
}
data, err := os.ReadFile(logPath)
if err != nil {
return false
}
return antigravityPrintTimeoutRe.Match(data)
}
// readAntigravityConversationID scans the per-run log file for the
// conversation UUID. Best-effort: returns "" if the log file is missing, the
// CLI exited before dispatching, or the format changes upstream.
@@ -245,14 +282,14 @@ func buildAntigravityArgs(prompt, logPath string, timeout time.Duration, opts Ex
if opts.Model != "" {
args = append(args, "--model", opts.Model)
}
// Only pass --print-timeout when a positive wall-clock cap is configured.
// timeout <= 0 means "no cap" (MUL-3064): agy then runs without its own
// print-timeout guillotine, matching every other backend's runContext
// semantics. Passing antigravityFormatTimeout(0) would clamp to 1s and kill
// the run almost immediately — the opposite of "no cap".
if timeout > 0 {
args = append(args, "--print-timeout", antigravityFormatTimeout(timeout))
}
// agy's --print-timeout has NO "disabled" value and DEFAULTS TO 5m when the
// flag is omitted, so "no cap" cannot be expressed by dropping it — that
// silently guillotines every turn at 5 minutes, killing any run whose build
// or tests outlive the budget (MUL-3570). Always pass the flag: the
// configured wall-clock cap when positive, else a value so large agy's own
// timeout never fires before the daemon's idle/tool watchdogs reclaim a
// genuinely stuck run (see antigravityPrintTimeout).
args = append(args, "--print-timeout", antigravityFormatTimeout(antigravityPrintTimeout(timeout)))
args = append(args, "--log-file", logPath)
if opts.ResumeSessionID != "" {
args = append(args, "--conversation", opts.ResumeSessionID)
@@ -290,6 +327,27 @@ func antigravityModelError(model string, available []Model) error {
)
}
// antigravityNoCapPrintTimeout is the --print-timeout value used when the daemon
// configures no wall-clock cap (opts.Timeout <= 0). agy's --print-timeout has no
// "disabled" sentinel and falls back to a 5-minute default when omitted, so "no
// cap" must instead be a value large enough that agy's own guillotine never
// fires before the daemon's idle (30m) / tool (2h) watchdogs reclaim a genuinely
// stuck run. 24h is effectively unbounded for any real turn while still being a
// finite duration agy can parse.
const antigravityNoCapPrintTimeout = 24 * time.Hour
// antigravityPrintTimeout resolves the wall-clock budget handed to agy's
// --print-timeout: the daemon's configured cap when positive, else the no-cap
// sentinel above. It is the single source of truth shared by
// buildAntigravityArgs (which sets the flag) and Execute (which labels a
// print-mode timeout).
func antigravityPrintTimeout(timeout time.Duration) time.Duration {
if timeout > 0 {
return timeout
}
return antigravityNoCapPrintTimeout
}
// antigravityFormatTimeout renders a Go duration in the `<n>m<n>s` shape the
// agy CLI accepts (e.g. 20m0s). Sub-second timeouts round up to 1s so the CLI
// doesn't reject the flag.

View File

@@ -1,6 +1,7 @@
package agent
import (
"context"
"io"
"log/slog"
"os"
@@ -71,12 +72,14 @@ func TestBuildAntigravityArgsModel(t *testing.T) {
}
}
func TestBuildAntigravityArgsNoTimeoutOmitsPrintTimeout(t *testing.T) {
func TestBuildAntigravityArgsNoCapUsesLargePrintTimeout(t *testing.T) {
t.Parallel()
// timeout <= 0 means "no wall-clock cap" (MUL-3064): agy must be launched
// WITHOUT --print-timeout, otherwise antigravityFormatTimeout(0) clamps to
// 1s and the run is killed almost immediately — the opposite of "no cap".
// timeout <= 0 means "no wall-clock cap", but agy's --print-timeout DEFAULTS
// to 5m when omitted, so dropping the flag silently caps every turn at 5
// minutes and kills any run whose build/tests outlive it (MUL-3570). "No cap"
// must therefore be expressed by passing a value large enough to defer to the
// daemon's idle/tool watchdogs — NOT by omitting the flag.
args := buildAntigravityArgs(
"hello",
"/tmp/agy.log",
@@ -88,14 +91,65 @@ func TestBuildAntigravityArgsNoTimeoutOmitsPrintTimeout(t *testing.T) {
want := []string{
"-p", "hello",
"--dangerously-skip-permissions",
"--print-timeout", antigravityFormatTimeout(antigravityNoCapPrintTimeout),
"--log-file", "/tmp/agy.log",
"--add-dir", "/work",
}
if !slices.Equal(args, want) {
t.Fatalf("buildAntigravityArgs(timeout=0) mismatch\n got: %v\nwant: %v", args, want)
}
if slices.Contains(args, "--print-timeout") {
t.Fatalf("--print-timeout must be omitted when timeout <= 0; got %v", args)
// The no-cap value must be well clear of agy's 5m default; otherwise the
// guillotine still fires on a routine build+test turn.
if antigravityNoCapPrintTimeout <= 5*time.Minute {
t.Fatalf("antigravityNoCapPrintTimeout %s must be far larger than agy's 5m default", antigravityNoCapPrintTimeout)
}
}
func TestAntigravityPrintTimeoutResolvesBudget(t *testing.T) {
t.Parallel()
if got := antigravityPrintTimeout(20 * time.Minute); got != 20*time.Minute {
t.Errorf("positive cap should pass through: got %s, want 20m", got)
}
if got := antigravityPrintTimeout(0); got != antigravityNoCapPrintTimeout {
t.Errorf("zero cap should resolve to no-cap sentinel: got %s, want %s", got, antigravityNoCapPrintTimeout)
}
if got := antigravityPrintTimeout(-1); got != antigravityNoCapPrintTimeout {
t.Errorf("negative cap should resolve to no-cap sentinel: got %s, want %s", got, antigravityNoCapPrintTimeout)
}
}
func TestAntigravityPrintTimedOut(t *testing.T) {
t.Parallel()
dir := t.TempDir()
timedOut := filepath.Join(dir, "timeout.log")
if err := os.WriteFile(timedOut, []byte(strings.Join([]string{
`I0623 17:17:38.930400 65926 printmode.go:156] Print mode: conversation=ea49cf41-4156-425a-a2f7-4238335d4c8b, sending message`,
`E0623 17:17:59.017212 65926 printmode.go:289] Print mode: timed out after 100 polls (printed=3)`,
}, "\n")), 0o644); err != nil {
t.Fatal(err)
}
if !antigravityPrintTimedOut(timedOut) {
t.Error("expected the print-mode timeout marker to be detected")
}
clean := filepath.Join(dir, "clean.log")
if err := os.WriteFile(clean, []byte(
`I0623 17:17:38.930400 65926 printmode.go:156] Print mode: conversation=abc, sending message`,
), 0o644); err != nil {
t.Fatal(err)
}
if antigravityPrintTimedOut(clean) {
t.Error("a clean log must not be flagged as timed out")
}
if antigravityPrintTimedOut("/nonexistent/path") {
t.Error("missing log file must be treated as not-timed-out")
}
if antigravityPrintTimedOut("") {
t.Error("empty log path must be treated as not-timed-out")
}
}
@@ -215,6 +269,83 @@ func TestReadAntigravityConversationIDMissingFile(t *testing.T) {
}
}
// fakeAgyPrintTimeoutScript returns a POSIX-sh script that impersonates `agy -p`
// hitting its own --print-timeout: it prints a couple of "I will ..." narration
// lines (as agy streams to stdout), writes the printmode.go "timed out after N
// polls" marker into the --log-file the daemon handed it, prints agy's
// user-facing "Error: timed out waiting for response" line, and EXITS 0 — exactly
// the sequence that made a stalled turn look "completed" (MUL-3570).
func fakeAgyPrintTimeoutScript() string {
return `#!/bin/sh
log=""
while [ $# -gt 0 ]; do
case "$1" in
--log-file) log="$2"; shift 2 ;;
*) shift ;;
esac
done
echo "I will run the Go unit tests to verify the build."
echo "I will wait for the Go unit tests to complete."
if [ -n "$log" ]; then
printf 'I0623 17:17:38.930400 1 printmode.go:156] Print mode: conversation=ea49cf41-4156-425a-a2f7-4238335d4c8b, sending message\n' >> "$log"
printf 'E0623 17:17:59.017212 1 printmode.go:289] Print mode: timed out after 100 polls (printed=2)\n' >> "$log"
fi
echo "Error: timed out waiting for response"
exit 0
`
}
// TestAntigravityBackendPrintTimeoutSurfacesAsTimeout is the end-to-end guard for
// MUL-3570: agy aborts a long turn by printing its timeout sentinel and exiting
// 0, so the backend must classify the result as a timeout (not a truncated
// "completed") while still preserving the narration printed before the cut-off.
func TestAntigravityBackendPrintTimeoutSurfacesAsTimeout(t *testing.T) {
t.Parallel()
fakePath := filepath.Join(t.TempDir(), "agy")
writeTestExecutable(t, fakePath, []byte(fakeAgyPrintTimeoutScript()))
backend, err := New("antigravity", Config{ExecutablePath: fakePath, Logger: quietAntigravityLogger()})
if err != nil {
t.Fatalf("new antigravity backend: %v", err)
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
// Timeout: 0 ("no cap") so runContext never trips — the only signal that the
// turn died is agy's own print-timeout marker in the log.
session, err := backend.Execute(ctx, "prompt-ignored", ExecOptions{})
if err != nil {
t.Fatalf("execute: %v", err)
}
// Drain the message stream so the lifecycle goroutine can finish.
go func() {
for range session.Messages {
}
}()
select {
case result, ok := <-session.Result:
if !ok {
t.Fatal("result channel closed without a value")
}
if result.Status != "timeout" {
t.Fatalf("expected status=timeout, got %q (error=%q)", result.Status, result.Error)
}
if !strings.Contains(result.Error, "print mode timed out") {
t.Errorf("expected error to explain the print-mode timeout, got %q", result.Error)
}
// Narration streamed before the cut-off must still reach the result so
// the user sees how far the turn got.
if !strings.Contains(result.Output, "I will wait for the Go unit tests to complete") {
t.Errorf("expected partial narration to be preserved in output, got %q", result.Output)
}
case <-time.After(10 * time.Second):
t.Fatal("timeout waiting for result")
}
}
// TestAntigravityModelError is the regression guard for the silent-no-op fix:
// agy exits 0 with empty output on an unrecognised --model, so Execute must
// reject a non-empty model that isn't in the `agy models` catalog instead of