mirror of
https://github.com/multica-ai/multica.git
synced 2026-06-17 03:38:32 +02:00
fix(agent/codex): shut down gracefully so OTEL telemetry flushes (#3888)
Codex telemetry was never reaching the OTLP collector for tasks run by the daemon. The per-task config (including the [otel] block) is copied into CODEX_HOME correctly, but the lifecycle goroutine closed stdin and then immediately cancelled the run context, which SIGKILLs the app-server. Codex's OTEL batch exporters only force-flush on a graceful shutdown, so the buffered spans/metrics/logs were dropped before they could be exported — short tasks lost everything, long tasks lost the final batch. Let codex exit on its own after stdin EOF (running its shutdown + flush path) and only force-cancel after a bounded grace period if it doesn't, so the reader goroutine still can't block forever. Also set cmd.WaitDelay, matching the other long-lived backends (claude, copilot, cursor, …). Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -38,6 +38,13 @@ const (
|
||||
defaultCodexSemanticInactivityTimeout = 10 * time.Minute
|
||||
defaultCodexFirstTurnNoProgressTimeout = 30 * time.Second
|
||||
codexVersionDiagnosticTimeout = 2 * time.Second
|
||||
// codexGracefulShutdownTimeout bounds how long the lifecycle goroutine
|
||||
// waits for codex to exit on its own after stdin is closed, before forcing
|
||||
// a context-cancel kill. A clean exit lets codex run its shutdown path and
|
||||
// flush buffered telemetry — OTEL batch exporters only force-flush on
|
||||
// graceful shutdown, so killing it immediately (the prior behavior) drops
|
||||
// the task's spans/metrics/logs.
|
||||
codexGracefulShutdownTimeout = 10 * time.Second
|
||||
)
|
||||
|
||||
// CodexSemanticInactivityMarker prefixes timeout errors emitted when Codex
|
||||
@@ -537,6 +544,10 @@ func (b *codexBackend) Execute(ctx context.Context, prompt string, opts ExecOpti
|
||||
codexArgs := buildCodexArgs(opts, b.cfg.Logger)
|
||||
cmd := exec.CommandContext(runCtx, execPath, codexArgs...)
|
||||
hideAgentWindow(cmd)
|
||||
// Bound the wait after the context is cancelled so a stuck child (or an
|
||||
// open pipe held by a grandchild) can't hang cmd.Wait() forever. Matches
|
||||
// the other long-lived backends (claude, copilot, cursor, …).
|
||||
cmd.WaitDelay = 10 * time.Second
|
||||
b.cfg.Logger.Info("agent command", "exec", execPath, "args", codexArgs)
|
||||
if opts.Cwd != "" {
|
||||
cmd.Dir = opts.Cwd
|
||||
@@ -806,14 +817,24 @@ func (b *codexBackend) Execute(ctx context.Context, prompt string, opts ExecOpti
|
||||
duration := time.Since(startTime)
|
||||
b.cfg.Logger.Info("codex finished", "pid", cmd.Process.Pid, "status", finalStatus, "duration", duration.Round(time.Millisecond).String())
|
||||
|
||||
// Close stdin and cancel context to signal the app-server to exit.
|
||||
// Without this, the long-running codex process keeps stdout open and
|
||||
// the reader goroutine blocks forever on scanner.Scan().
|
||||
// Close stdin to signal the app-server to exit. Prefer letting codex
|
||||
// shut down on its own: a clean exit runs codex's shutdown path, which
|
||||
// force-flushes its OTEL batch exporters — killing it immediately (via
|
||||
// cancel → SIGKILL) drops the task's buffered telemetry. Give it a
|
||||
// bounded grace period; only force-cancel if it doesn't exit, so the
|
||||
// reader goroutine can never block forever on scanner.Scan().
|
||||
stdin.Close()
|
||||
cancel()
|
||||
|
||||
// Wait for the reader goroutine to finish so all output is accumulated.
|
||||
<-readerDone
|
||||
select {
|
||||
case <-readerDone:
|
||||
// codex closed stdout on its own — clean shutdown, telemetry flushed.
|
||||
case <-time.After(codexGracefulShutdownTimeout):
|
||||
b.cfg.Logger.Warn("codex did not exit after stdin close; forcing shutdown",
|
||||
"pid", cmd.Process.Pid,
|
||||
"grace", codexGracefulShutdownTimeout.String(),
|
||||
)
|
||||
cancel()
|
||||
<-readerDone
|
||||
}
|
||||
drainAndWait()
|
||||
|
||||
if timeoutDiagnostic.Kind != codexTimeoutNone {
|
||||
|
||||
Reference in New Issue
Block a user