Compare commits

...

2 Commits

Author SHA1 Message Date
Jiang Bohan
cb70fbe796 fix(daemon/gc): address review — 404 safety + decimal/overflow in duration parser
Two issues flagged in PR review:

1. 404-immediate-clean is unsafe. The /gc-check endpoint returns 404 for
   both "issue deleted" AND "daemon token has no access to the workspace"
   (anti-enumeration, see requireDaemonWorkspaceAccess). Clean-on-404
   would let a scoped-down daemon token wipe taskDirs whose issues are
   still live. Restore the mtime gate against GCOrphanTTL. With the new
   72h default we still shrink the original 30d window dramatically
   without the cross-workspace hazard. Lock the behavior in with a new
   test that asserts a recent 404 is skipped.

2. parseFlexDuration mishandled decimals and swallowed Atoi errors:
   "0.5d" → 7m12s (regex matched only the "5d"), "1.5d" → 1h7m12s,
   and 20+ digit day values Atoi-errored silently to 0. Match the full
   decimal number with `\d*\.\d+|\d+` and parse with ParseFloat so
   fractional days and oversized inputs both go through
   time.ParseDuration correctly — fractions as sub-hour durations,
   overflow as a returned error.
2026-04-23 17:25:33 +08:00
Jiang Bohan
74a43f5bec feat(daemon/gc): tighten GC defaults + flex duration suffix
Driven by user feedback in #1539 (40 GB VPS filling within 24h of heavy
AI-coding usage): the existing TTLs were sized for desktop/laptop
deployments and are too lenient for small-disk, long-running daemons.

- GCTTL: 5d → 24h. Done/canceled issues almost never need a multi-day
  grace period in AI-coding workflows.
- GCOrphanTTL: 30d → 72h. Covers crash-leftover and pre-GC directories
  without a month-long wait.
- Issue-deleted orphans (API returns 404) are now cleaned on the next GC
  cycle regardless of mtime. The issue row is gone; there is nothing
  left to protect.
- parseFlexDuration: accept a `d` (day) suffix in addition to the stdlib
  time.ParseDuration syntax. MULTICA_GC_TTL=5d now works; previously only
  120h was accepted.
2026-04-23 17:04:02 +08:00
5 changed files with 118 additions and 8 deletions

View File

@@ -20,8 +20,8 @@ const (
DefaultHealthPort = 19514
DefaultMaxConcurrentTasks = 20
DefaultGCInterval = 1 * time.Hour
DefaultGCTTL = 5 * 24 * time.Hour // 5 days
DefaultGCOrphanTTL = 30 * 24 * time.Hour // 30 days
DefaultGCTTL = 24 * time.Hour // 1 day — AI-coding issues rarely stay open long
DefaultGCOrphanTTL = 72 * time.Hour // 3 days — orphans with no meta (crashes, pre-GC leftovers)
)
// Config holds all daemon configuration.
@@ -41,8 +41,8 @@ type Config struct {
MaxConcurrentTasks int // max tasks running in parallel (default: 20)
GCEnabled bool // enable periodic workspace garbage collection (default: true)
GCInterval time.Duration // how often the GC loop runs (default: 1h)
GCTTL time.Duration // clean dirs whose issue is done/canceled and updated_at < now()-TTL (default: 5d)
GCOrphanTTL time.Duration // clean orphan dirs (no meta or unknown issue) older than this (default: 30d)
GCTTL time.Duration // clean dirs whose issue is done/canceled and updated_at < now()-TTL (default: 24h)
GCOrphanTTL time.Duration // clean orphan dirs with no meta older than this (default: 72h). Dirs whose issue returned 404 are cleaned immediately.
PollInterval time.Duration
HeartbeatInterval time.Duration
AgentTimeout time.Duration

View File

@@ -140,13 +140,17 @@ func (d *Daemon) shouldCleanTaskDir(ctx context.Context, taskDir string) gcActio
if err != nil {
var reqErr *requestError
if errors.As(err, &reqErr) && reqErr.StatusCode == http.StatusNotFound {
// Issue deleted — check mtime for orphan cleanup.
// 404 is ambiguous: the server returns it for both "issue deleted"
// and "daemon token has no access to the workspace" (anti-enumeration,
// see requireDaemonWorkspaceAccess). Fall back to the mtime-gated
// orphan cleanup so a scoped-down token can't instantly wipe dirs
// whose issues are still live.
info, statErr := os.Stat(taskDir)
if statErr != nil {
return gcActionSkip
}
if time.Since(info.ModTime()) > d.cfg.GCOrphanTTL {
d.logger.Info("gc: orphan directory (issue deleted)", "dir", taskDir, "issue", meta.IssueID)
d.logger.Info("gc: orphan directory (issue not accessible)", "dir", taskDir, "issue", meta.IssueID)
return gcActionOrphan
}
}

View File

@@ -223,7 +223,35 @@ func TestShouldCleanTaskDir_Issue404OldOrphan(t *testing.T) {
action := d.shouldCleanTaskDir(context.Background(), taskDir)
if action != gcActionOrphan {
t.Fatalf("expected gcActionOrphan for deleted issue, got %d", action)
t.Fatalf("expected gcActionOrphan for unreachable issue past TTL, got %d", action)
}
}
// TestShouldCleanTaskDir_Issue404RecentSkipped locks in the cross-workspace
// safety: the server returns 404 both for deleted issues and for workspaces
// the daemon token can't see, so a recent 404 must NOT trigger immediate
// cleanup — otherwise a token re-scope could wipe dirs whose issues are live.
func TestShouldCleanTaskDir_Issue404RecentSkipped(t *testing.T) {
t.Parallel()
issueID := "66666666-6666-6666-6666-666666666667"
mux := http.NewServeMux()
mux.HandleFunc(fmt.Sprintf("/api/daemon/issues/%s/gc-check", issueID), func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusNotFound)
w.Write([]byte(`{"error":"not found"}`))
})
d := newGCTestDaemon(t, mux)
// Default production OrphanTTL; taskDir mtime is now, so it's fresh.
taskDir := createTaskDir(t, d.cfg.WorkspacesRoot, "ws1", "fresh-404", &execenv.GCMeta{
IssueID: issueID,
WorkspaceID: "ws1",
CompletedAt: time.Now(),
})
action := d.shouldCleanTaskDir(context.Background(), taskDir)
if action != gcActionSkip {
t.Fatalf("expected gcActionSkip for recent 404 (cross-workspace safety), got %d", action)
}
}

View File

@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"os"
"regexp"
"strconv"
"strings"
"time"
@@ -22,13 +23,38 @@ func durationFromEnv(key string, fallback time.Duration) (time.Duration, error)
if value == "" {
return fallback, nil
}
d, err := time.ParseDuration(value)
d, err := parseFlexDuration(value)
if err != nil {
return 0, fmt.Errorf("%s: invalid duration %q: %w", key, value, err)
}
return d, nil
}
// dayUnit matches a decimal number (with optional leading digits) followed by
// `d` (days), so both "5d" and "1.5d" are captured whole and expanded to hours.
var dayUnit = regexp.MustCompile(`(\d*\.\d+|\d+)d`)
// parseFlexDuration accepts the standard Go time.ParseDuration syntax plus a
// `d` (day) suffix, which the stdlib rejects. "5d" → 120h, "1d12h" → 36h,
// "0.5d" → 12h. Overflow or malformed numbers propagate as errors.
func parseFlexDuration(value string) (time.Duration, error) {
var convErr error
expanded := dayUnit.ReplaceAllStringFunc(value, func(match string) string {
days, err := strconv.ParseFloat(match[:len(match)-1], 64)
if err != nil {
convErr = err
return match
}
// time.ParseDuration handles fractional hours natively, and rejects
// overflow on its own.
return strconv.FormatFloat(days*24, 'f', -1, 64) + "h"
})
if convErr != nil {
return 0, convErr
}
return time.ParseDuration(expanded)
}
func intFromEnv(key string, fallback int) (int, error) {
value := strings.TrimSpace(os.Getenv(key))
if value == "" {

View File

@@ -0,0 +1,52 @@
package daemon
import (
"testing"
"time"
)
func TestParseFlexDuration(t *testing.T) {
t.Parallel()
cases := []struct {
in string
want time.Duration
}{
{"5d", 5 * 24 * time.Hour},
{"1d", 24 * time.Hour},
{"1d12h", 36 * time.Hour},
{"2d30m", 2*24*time.Hour + 30*time.Minute},
{"0.5d", 12 * time.Hour},
{"1.5d", 36 * time.Hour},
{".5d", 12 * time.Hour},
{"120h", 120 * time.Hour},
{"24h", 24 * time.Hour},
{"30m", 30 * time.Minute},
}
for _, tc := range cases {
got, err := parseFlexDuration(tc.in)
if err != nil {
t.Errorf("parseFlexDuration(%q) unexpected error: %v", tc.in, err)
continue
}
if got != tc.want {
t.Errorf("parseFlexDuration(%q) = %v, want %v", tc.in, got, tc.want)
}
}
}
func TestParseFlexDuration_Invalid(t *testing.T) {
t.Parallel()
for _, in := range []string{
"",
"xyz",
"5days",
"abc5d",
// Overflow: 30 digits is well past int64/float64 safe range; must error
// rather than silently produce 0h.
"999999999999999999999999999999d",
} {
if _, err := parseFlexDuration(in); err == nil {
t.Errorf("parseFlexDuration(%q) expected error, got nil", in)
}
}
}