Compare commits

...

2 Commits

Author SHA1 Message Date
Jiang Bohan
989c11ed7c fix(execenv): keep idempotent codex-home self-heal on reuse
Eve's review on #1147 correctly pointed out that prepareCodexHome is
already idempotent — skipping it on reuse removed the self-heal path
for broken auth.json / sessions symlinks and missing config.toml
without fixing any real bug (the perceived 're-init' was superficial;
state was never actually being churned).

Revert the skip-on-exists branch so reuse continues to repair a
partially corrupt codex-home. Document the actual semantics in code
and in CLI_AND_DAEMON.md: content is preserved (user edits, rollouts,
per-session cache), links and missing scaffolding are repaired.

Replace the misleading preservation test with two tighter ones:
- TestReuseIsIdempotentForCodexHome locks in that user edits and
  accumulated rollouts survive reuse.
- TestReuseHealsCorruptedCodexHome covers the regression Eve called
  out: reuse must still heal a codex-home that lost its sessions
  symlink, auth.json link, and config.toml.
2026-04-16 17:40:10 +08:00
Jiang Bohan
c994398181 fix(execenv): reuse codex CODEX_HOME across tasks on the same issue
Reuse() was re-running prepareCodexHome on every task, which churned
Codex's internal state (rollouts, per-session cache) and made users
feel the sandbox was re-initializing on each new comment. Skip the
reseed when the directory already exists; only seed from scratch when
it's missing (older envs or manual cleanup).

Also document the task isolation boundaries (workdir, session, CODEX_HOME)
in CLI_AND_DAEMON.md so the lifecycle is discoverable.

Closes #1136
2026-04-16 14:38:26 +08:00
3 changed files with 176 additions and 3 deletions

View File

@@ -154,6 +154,14 @@ You need at least one installed. The daemon registers each detected CLI as an av
4. Heartbeats are sent periodically (default: 15s) so the server knows the daemon is alive
5. On shutdown, all runtimes are deregistered
### Task Isolation
Each issue — not each task — owns a persistent workspace directory under `MULTICA_WORKSPACES_ROOT`.
- **workdir** (`{root}/{task-short-id}/workdir`) is created on the first task and reused by subsequent tasks on the same `(agent, issue)` pair via `PriorWorkDir`. Repo checkouts and local edits survive across comments.
- **Session** (Claude) is resumed via `PriorSessionID`, so the conversation context carries forward.
- **`CODEX_HOME`** (Codex) lives at `{root}/{task-short-id}/codex-home` alongside the workdir and is **reused across tasks on the same issue**. The daemon re-runs its seeding step on every reuse, but the step is idempotent: broken `auth.json` / `sessions` symlinks and a missing `config.toml` are repaired, while existing content — user edits to `config.toml`, Codex's rollouts, and the per-session cache — is left untouched. A fresh `CODEX_HOME` is created only when a brand-new issue starts.
### Configuration
Daemon behavior is configured via flags or environment variables:

View File

@@ -143,9 +143,16 @@ func Reuse(workDir, provider string, task TaskContextForEnv, logger *slog.Logger
logger.Warn("execenv: refresh context files failed", "error", err)
}
// Restore CodexHome for Codex provider — the per-task codex-home directory
// lives alongside the workdir. Re-run prepareCodexHome to ensure config
// (especially network access) is up to date.
// Restore CodexHome for Codex provider — the codex-home directory lives
// alongside the workdir and is reused across tasks on the same issue.
// prepareCodexHome is deliberately idempotent: it only (re)creates broken
// or missing symlinks (auth.json, sessions), copies config files when the
// destination is absent, and leaves existing content — including user
// edits to config.toml and Codex's accumulated rollouts / per-session
// cache — untouched. Running it on every reuse therefore gives us a cheap
// self-heal path when something in the env has been corrupted, without
// churning the state that makes the per-issue reuse useful in the first
// place.
if provider == "codex" {
codexHome := filepath.Join(env.RootDir, "codex-home")
if err := prepareCodexHome(codexHome, logger); err != nil {

View File

@@ -932,6 +932,164 @@ func TestReuseRestoresCodexHome(t *testing.T) {
}
}
// TestReuseIsIdempotentForCodexHome locks in the contract that Reuse() does
// not churn Codex state that accumulates across tasks on the same issue —
// user-level edits to config.toml, rollouts, per-session cache — even though
// prepareCodexHome runs on every reuse for self-healing.
//
// The test prepares a codex env, plants a sentinel rollout file and appends a
// custom line to config.toml, calls Reuse on the same workdir, and then
// asserts that both files are byte-for-byte unchanged.
func TestReuseIsIdempotentForCodexHome(t *testing.T) {
	// Cannot use t.Parallel() with t.Setenv.

	sharedHome := t.TempDir()
	// Fail fast if the fixture cannot be written; otherwise a later assertion
	// would fail (or pass) for the wrong reason.
	if err := os.WriteFile(filepath.Join(sharedHome, "auth.json"), []byte(`{"token":"v1"}`), 0o644); err != nil {
		t.Fatalf("write shared auth.json: %v", err)
	}
	t.Setenv("CODEX_HOME", sharedHome)

	workspacesRoot := t.TempDir()

	env, err := Prepare(PrepareParams{
		WorkspacesRoot: workspacesRoot,
		WorkspaceID:    "ws-codex-preserve",
		TaskID:         "f6a7b8c9-d0e1-2345-fabc-678901234567",
		AgentName:      "Codex Agent",
		Provider:       "codex",
		Task:           TaskContextForEnv{IssueID: "preserve-test"},
	}, testLogger())
	if err != nil {
		t.Fatalf("Prepare failed: %v", err)
	}
	defer env.Cleanup(true)

	// Simulate Codex-internal state accumulated during the first task
	// (rollouts / per-session cache) and a user edit to config.toml.
	sentinel := filepath.Join(env.CodexHome, "rollouts", "session.jsonl")
	if err := os.MkdirAll(filepath.Dir(sentinel), 0o755); err != nil {
		t.Fatalf("mkdir sentinel dir: %v", err)
	}
	if err := os.WriteFile(sentinel, []byte("session-1-data"), 0o644); err != nil {
		t.Fatalf("write sentinel: %v", err)
	}

	configPath := filepath.Join(env.CodexHome, "config.toml")
	configBefore, err := os.ReadFile(configPath)
	if err != nil {
		t.Fatalf("read config.toml: %v", err)
	}
	customConfig := string(configBefore) + "\nmodel_reasoning_effort = \"high\"\n"
	if err := os.WriteFile(configPath, []byte(customConfig), 0o644); err != nil {
		t.Fatalf("write config.toml: %v", err)
	}

	reused := Reuse(env.WorkDir, "codex", TaskContextForEnv{IssueID: "preserve-test"}, testLogger())
	if reused == nil {
		t.Fatal("Reuse returned nil")
	}
	if reused.CodexHome != env.CodexHome {
		t.Errorf("CodexHome = %q, want %q", reused.CodexHome, env.CodexHome)
	}

	// The sentinel rollout must still exist with its original content.
	data, err := os.ReadFile(sentinel)
	if err != nil {
		t.Fatalf("sentinel gone after Reuse: %v", err)
	}
	if string(data) != "session-1-data" {
		t.Errorf("sentinel content = %q, want %q", data, "session-1-data")
	}

	// The user-edited config.toml must not have been rewritten.
	configAfter, err := os.ReadFile(configPath)
	if err != nil {
		t.Fatalf("read config.toml after reuse: %v", err)
	}
	if string(configAfter) != customConfig {
		t.Errorf("config.toml was modified by Reuse; got:\n%s", configAfter)
	}
}
// TestReuseHealsCorruptedCodexHome guards the self-heal contract: if an
// existing codex-home is partially corrupt (missing sessions symlink, broken
// auth.json link, missing config.toml) the next task claim must repair it
// instead of silently failing.
//
// The test prepares a codex env, deliberately damages those three entries,
// calls Reuse on the same workdir, and then asserts that each one has been
// restored.
func TestReuseHealsCorruptedCodexHome(t *testing.T) {
	// Cannot use t.Parallel() with t.Setenv.

	sharedHome := t.TempDir()
	// Fail fast if the fixture cannot be written; the auth.json content
	// assertion below depends on it.
	if err := os.WriteFile(filepath.Join(sharedHome, "auth.json"), []byte(`{"token":"secret"}`), 0o644); err != nil {
		t.Fatalf("write shared auth.json: %v", err)
	}
	t.Setenv("CODEX_HOME", sharedHome)

	workspacesRoot := t.TempDir()

	env, err := Prepare(PrepareParams{
		WorkspacesRoot: workspacesRoot,
		WorkspaceID:    "ws-codex-heal",
		TaskID:         "a7b8c9d0-e1f2-3456-abcd-789012345678",
		AgentName:      "Codex Agent",
		Provider:       "codex",
		Task:           TaskContextForEnv{IssueID: "heal-test"},
	}, testLogger())
	if err != nil {
		t.Fatalf("Prepare failed: %v", err)
	}
	defer env.Cleanup(true)

	// Simulate partial corruption of the persisted codex-home:
	//   - sessions symlink removed
	//   - auth.json repointed at a non-existent path (broken link)
	//   - config.toml deleted
	sessionsPath := filepath.Join(env.CodexHome, "sessions")
	if err := os.Remove(sessionsPath); err != nil {
		t.Fatalf("remove sessions: %v", err)
	}

	authPath := filepath.Join(env.CodexHome, "auth.json")
	if err := os.Remove(authPath); err != nil {
		t.Fatalf("remove auth.json: %v", err)
	}
	if err := os.Symlink(filepath.Join(sharedHome, "missing.json"), authPath); err != nil {
		t.Fatalf("write broken auth.json symlink: %v", err)
	}

	configPath := filepath.Join(env.CodexHome, "config.toml")
	if err := os.Remove(configPath); err != nil {
		t.Fatalf("remove config.toml: %v", err)
	}

	reused := Reuse(env.WorkDir, "codex", TaskContextForEnv{IssueID: "heal-test"}, testLogger())
	if reused == nil {
		t.Fatal("Reuse returned nil")
	}
	if reused.CodexHome != env.CodexHome {
		t.Errorf("CodexHome = %q, want %q", reused.CodexHome, env.CodexHome)
	}

	// sessions should be re-linked to the shared dir.
	fi, err := os.Lstat(sessionsPath)
	if err != nil {
		t.Fatalf("sessions not restored: %v", err)
	}
	if fi.Mode()&os.ModeSymlink == 0 {
		t.Error("sessions should be a symlink after heal")
	}
	// Report a Readlink failure explicitly rather than as an empty-target
	// mismatch; keep it non-fatal so the remaining checks still run.
	wantSessions := filepath.Join(sharedHome, "sessions")
	if target, err := os.Readlink(sessionsPath); err != nil {
		t.Errorf("readlink sessions: %v", err)
	} else if target != wantSessions {
		t.Errorf("sessions target = %q, want %q", target, wantSessions)
	}

	// auth.json should point at the real shared auth.json again.
	wantAuth := filepath.Join(sharedHome, "auth.json")
	if target, err := os.Readlink(authPath); err != nil {
		t.Errorf("readlink auth.json: %v", err)
	} else if target != wantAuth {
		t.Errorf("auth.json target = %q, want %q", target, wantAuth)
	}
	data, err := os.ReadFile(authPath)
	if err != nil {
		t.Fatalf("auth.json unreadable after heal: %v", err)
	}
	if string(data) != `{"token":"secret"}` {
		t.Errorf("auth.json content = %q", data)
	}

	// config.toml should be recreated with network access enabled.
	data, err = os.ReadFile(configPath)
	if err != nil {
		t.Fatalf("config.toml not restored: %v", err)
	}
	if !strings.Contains(string(data), "network_access = true") {
		t.Errorf("restored config.toml missing network_access = true; got:\n%s", data)
	}
}
func TestEnsureSymlinkRepairsBrokenLink(t *testing.T) {
t.Parallel()
dir := t.TempDir()