From 254ec945f5792e9aecfeb1768038c930fbebe4db Mon Sep 17 00:00:00 2001 From: elrrrrrrr Date: Tue, 9 Jun 2026 14:46:07 +0800 Subject: [PATCH] fix(agent/codex): shut down gracefully so OTEL telemetry flushes (#3888) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex telemetry was never reaching the OTLP collector for tasks run by the daemon. The per-task config (including the [otel] block) is copied into CODEX_HOME correctly, but the lifecycle goroutine closed stdin and then immediately cancelled the run context, which SIGKILLs the app-server. Codex's OTEL batch exporters only force-flush on a graceful shutdown, so the buffered spans/metrics/logs were dropped before they could be exported — short tasks lost everything, long tasks lost the final batch. Let codex exit on its own after stdin EOF (running its shutdown + flush path) and only force-cancel after a bounded grace period if it doesn't, so the reader goroutine still can't block forever. Also set cmd.WaitDelay, matching the other long-lived backends (claude, copilot, cursor, …). Co-authored-by: Claude Opus 4.8 (1M context) --- server/pkg/agent/codex.go | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/server/pkg/agent/codex.go b/server/pkg/agent/codex.go index 893dc7b50..5ad965793 100644 --- a/server/pkg/agent/codex.go +++ b/server/pkg/agent/codex.go @@ -38,6 +38,13 @@ const ( defaultCodexSemanticInactivityTimeout = 10 * time.Minute defaultCodexFirstTurnNoProgressTimeout = 30 * time.Second codexVersionDiagnosticTimeout = 2 * time.Second + // codexGracefulShutdownTimeout bounds how long the lifecycle goroutine + // waits for codex to exit on its own after stdin is closed, before forcing + // a context-cancel kill. A clean exit lets codex run its shutdown path and + // flush buffered telemetry — OTEL batch exporters only force-flush on + // graceful shutdown, so killing it immediately (the prior behavior) drops + // the task's spans/metrics/logs. + codexGracefulShutdownTimeout = 10 * time.Second ) // CodexSemanticInactivityMarker prefixes timeout errors emitted when Codex @@ -537,6 +544,10 @@ func (b *codexBackend) Execute(ctx context.Context, prompt string, opts ExecOpti codexArgs := buildCodexArgs(opts, b.cfg.Logger) cmd := exec.CommandContext(runCtx, execPath, codexArgs...) hideAgentWindow(cmd) + // Bound the wait after the context is cancelled so a stuck child (or an + // open pipe held by a grandchild) can't hang cmd.Wait() forever. Matches + // the other long-lived backends (claude, copilot, cursor, …). + cmd.WaitDelay = 10 * time.Second b.cfg.Logger.Info("agent command", "exec", execPath, "args", codexArgs) if opts.Cwd != "" { cmd.Dir = opts.Cwd @@ -806,14 +817,24 @@ func (b *codexBackend) Execute(ctx context.Context, prompt string, opts ExecOpti duration := time.Since(startTime) b.cfg.Logger.Info("codex finished", "pid", cmd.Process.Pid, "status", finalStatus, "duration", duration.Round(time.Millisecond).String()) - // Close stdin and cancel context to signal the app-server to exit. - // Without this, the long-running codex process keeps stdout open and - // the reader goroutine blocks forever on scanner.Scan(). + // Close stdin to signal the app-server to exit. Prefer letting codex + // shut down on its own: a clean exit runs codex's shutdown path, which + // force-flushes its OTEL batch exporters — killing it immediately (via + // cancel → SIGKILL) drops the task's buffered telemetry. Give it a + // bounded grace period; only force-cancel if it doesn't exit, so the + // reader goroutine can never block forever on scanner.Scan(). stdin.Close() - cancel() - - // Wait for the reader goroutine to finish so all output is accumulated. - <-readerDone + select { + case <-readerDone: + // codex closed stdout on its own — clean shutdown, telemetry flushed. + case <-time.After(codexGracefulShutdownTimeout): + b.cfg.Logger.Warn("codex did not exit after stdin close; forcing shutdown", + "pid", cmd.Process.Pid, + "grace", codexGracefulShutdownTimeout.String(), + ) + cancel() + <-readerDone + } drainAndWait() if timeoutDiagnostic.Kind != codexTimeoutNone {