fix(chat): require online availability before flipping pill to stuck

Without this gate, a slow presence query (which chat-window surfaces as `availability === undefined` precisely so callers don't speculate) would flip the pill to "Daemon not responding" 30s in, accusing a possibly healthy daemon. Restrict the stuck stage to affirmative online evidence (`availability === "online"`); while presence is still loading, keep the normal queued / starting-up label.

Co-authored-by: multica-agent <github@multica.ai>
fix(chat): differentiate stuck-queue states from a healthy queue
2026-06-17 19:59:20 +02:00 · 2026-05-10 14:45:44 +08:00 · 2026-05-10 14:40:38 +08:00
5 changed files with 152 additions and 9 deletions
--- a/packages/views/chat/components/offline-banner.tsx
+++ b/packages/views/chat/components/offline-banner.tsx
@@ -38,11 +38,18 @@ export function OfflineBanner({ agentName, availability }: Props) {
  }
  return (
    <div className="px-5 mb-1.5">
-      <div className="mx-auto flex w-full max-w-4xl items-center gap-1.5 rounded-md px-2.5 py-1.5 text-xs bg-muted text-muted-foreground ring-1 ring-border">
+      <div className="mx-auto flex w-full max-w-4xl flex-wrap items-center gap-x-1.5 gap-y-0.5 rounded-md px-2.5 py-1.5 text-xs bg-muted text-muted-foreground ring-1 ring-border">
        <WifiOff className="size-3.5 shrink-0" />
        <span className="truncate">
          {t(($) => $.offline_banner.offline, { name })}
        </span>
+        <span className="ml-auto truncate">
+          {t(($) => $.offline_banner.diagnose_hint_prefix)}
+          <code className="rounded bg-background/60 px-1 py-0.5 font-mono text-[10px]">
+            multica daemon logs -f
+          </code>
+          {t(($) => $.offline_banner.diagnose_hint_suffix)}
+        </span>
      </div>
    </div>
  );
--- a/packages/views/chat/components/task-status-pill.test.ts
+++ b/packages/views/chat/components/task-status-pill.test.ts
@@ -0,0 +1,99 @@
+import { describe, expect, it } from "vitest";
+import type { TaskMessagePayload } from "@multica/core/types";
+import { pickStageKeys } from "./task-status-pill";
+
+const NO_MSGS: readonly TaskMessagePayload[] = [];
+
+describe("pickStageKeys", () => {
+  describe("queued / dispatched + presence", () => {
+    it("offline + queued → static offline label (unambiguous runtime-down state)", () => {
+      expect(pickStageKeys("queued", NO_MSGS, "offline", 5)).toEqual({
+        stageKey: "offline",
+        static: true,
+      });
+    });
+
+    it("offline + dispatched → static offline (same runtime-down treatment)", () => {
+      expect(pickStageKeys("dispatched", NO_MSGS, "offline", 5)).toEqual({
+        stageKey: "offline",
+        static: true,
+      });
+    });
+
+    it("unstable + queued → reconnecting (transient amber state, not stuck)", () => {
+      expect(pickStageKeys("queued", NO_MSGS, "unstable", 5)).toEqual({
+        stageKey: "reconnecting",
+      });
+    });
+  });
+
+  describe("stuck-detection while runtime appears online", () => {
+    // Reproduction of the GH #2341 footgun: backend has not yet swept the
+    // dead daemon, so availability is "online" while the task sits queued
+    // forever. The 30s threshold gives the user a diagnostic cue well
+    // before the backend's ~150s sweep window expires.
+
+    it("queued + online + elapsed < 30s → normal queued (brief queueing is healthy)", () => {
+      expect(pickStageKeys("queued", NO_MSGS, "online", 5)).toEqual({
+        stageKey: "queued",
+      });
+    });
+
+    it("queued + online + elapsed exactly 30s → flips to static stuck", () => {
+      expect(pickStageKeys("queued", NO_MSGS, "online", 30)).toEqual({
+        stageKey: "stuck",
+        static: true,
+      });
+    });
+
+    it("dispatched + online + elapsed > 30s → static stuck (daemon claimed but never started)", () => {
+      expect(pickStageKeys("dispatched", NO_MSGS, "online", 60)).toEqual({
+        stageKey: "stuck",
+        static: true,
+      });
+    });
+
+    it("queued + undefined availability + elapsed > 30s → stays queued (don't speculate while presence is loading)", () => {
+      // chat-window passes `undefined` precisely so we DON'T render
+      // speculative availability copy. "Stuck" is a diagnosis — it needs
+      // affirmative evidence the runtime is online, otherwise a slow
+      // presence query would falsely accuse a healthy daemon.
+      expect(pickStageKeys("queued", NO_MSGS, undefined, 45)).toEqual({
+        stageKey: "queued",
+      });
+    });
+
+    it("offline always wins over stuck (clearer copy + the stuck label would be redundant)", () => {
+      // Even when elapsed is well past the stuck threshold, an offline
+      // runtime gets the "Runtime offline" label — it's a more specific
+      // diagnosis than the generic stuck cue.
+      expect(pickStageKeys("queued", NO_MSGS, "offline", 120)).toEqual({
+        stageKey: "offline",
+        static: true,
+      });
+    });
+  });
+
+  describe("running stage decisions are unaffected by elapsed", () => {
+    it("running + no messages → thinking", () => {
+      expect(pickStageKeys("running", NO_MSGS, "online", 5)).toEqual({
+        stageKey: "thinking",
+      });
+    });
+
+    it("running + text message → typing (and the stuck threshold doesn't fire)", () => {
+      const msgs: TaskMessagePayload[] = [
+        {
+          task_id: "t1",
+          issue_id: "",
+          seq: 1,
+          type: "text",
+          content: "hi",
+        },
+      ];
+      expect(pickStageKeys("running", msgs, "online", 999)).toEqual({
+        stageKey: "typing",
+      });
+    });
+  });
+});
--- a/packages/views/chat/components/task-status-pill.tsx
+++ b/packages/views/chat/components/task-status-pill.tsx
@@ -26,10 +26,19 @@ type StageKey =
  | "offline"
  | "reconnecting"
  | "queued"
+  | "stuck"
  | "starting_up"
  | "thinking"
  | "typing";

+// Once a task has been queued/dispatched for at least this many seconds
+// (inclusive) while the runtime appears online, we treat the wait as stuck.
+// The backend's runtime-sweep gap (~150s after a daemon dies before
+// runtime.status flips to offline) means a task can spend its whole life
+// "queued · online" while the daemon is actually dead — the user should
+// see a diagnostic cue well before that 150s window expires.
+const STUCK_THRESHOLD_SECS = 30;
+
 type ToolKey =
  | "running_command"
  | "reading_files"
@@ -56,10 +65,11 @@ const TOOL_KEY_BY_SLUG: Record<string, Exclude<ToolKey, "fallback">> = {
 // Pure stage decision returning translation keys. The hook below maps these
 // keys into localized labels — keeping the decision pure makes it easy to
 // follow the priority rules without translation noise.
-function pickStageKeys(
+export function pickStageKeys(
  status: string | undefined,
  taskMessages: readonly TaskMessagePayload[],
  availability: AgentAvailability | undefined,
+  elapsedSecs: number,
 ): { stageKey: StageKey; toolKey?: ToolKey; static?: boolean } {
  if (
    (status === "queued" || status === "dispatched") &&
@@ -73,6 +83,26 @@ function pickStageKeys(
  ) {
    return { stageKey: "reconnecting" };
  }
+  // Queued / dispatched too long while the runtime still appears online.
+  // The backend-reported "online" status lags up to the runtime-sweeper's
+  // ~150s window, so this state legitimately means "daemon is heartbeating
+  // (or recently was) but isn't picking up the task". A static label flagged
+  // as stuck gives the user something to act on instead of an unbounded
+  // "queued · 90s · 120s · …" timer.
+  //
+  // Gated on `availability === "online"` (NOT `!== "offline"`): when
+  // presence is still loading or temporarily unavailable, chat-window
+  // surfaces it as `undefined` precisely so we don't speculate about
+  // reachability. Treating undefined as "stuck" would slap a "Daemon not
+  // responding" diagnosis onto users whose runtime might be perfectly
+  // healthy but whose presence query is slow.
+  if (
+    (status === "queued" || status === "dispatched") &&
+    availability === "online" &&
+    elapsedSecs >= STUCK_THRESHOLD_SECS
+  ) {
+    return { stageKey: "stuck", static: true };
+  }
  if (status === "queued") return { stageKey: "queued" };
  if (status === "dispatched") return { stageKey: "starting_up" };

@@ -103,10 +133,11 @@ function useResolveStage(): (
  status: string | undefined,
  taskMessages: readonly TaskMessagePayload[],
  availability: AgentAvailability | undefined,
+  elapsedSecs: number,
 ) => Stage {
  const { t } = useT("chat");
-  return (status, taskMessages, availability) => {
-    const decision = pickStageKeys(status, taskMessages, availability);
+  return (status, taskMessages, availability, elapsedSecs) => {
+    const decision = pickStageKeys(status, taskMessages, availability, elapsedSecs);
    if (decision.toolKey) {
      return {
        label: t(($) => $.status_pill.tools[decision.toolKey!]),
@@ -151,7 +182,7 @@ export function TaskStatusPill({
  // running; we trust that observation over a stale cache.
  const status = taskMessages.length > 0 ? "running" : pendingTask.status;
  const elapsedSecs = Math.max(0, Math.floor((now - anchor) / 1000));
-  const stage = resolveStage(status, taskMessages, availability);
+  const stage = resolveStage(status, taskMessages, availability, elapsedSecs);

  return (
    <div
--- a/packages/views/locales/en/chat.json
+++ b/packages/views/locales/en/chat.json
@@ -92,13 +92,16 @@
  "offline_banner": {
    "fallback_name": "the agent",
    "unstable": "{{name}}'s connection is unstable — replies may be delayed.",
-    "offline": "{{name}} is offline — your message will be delivered when they're back."
+    "offline": "{{name}} is offline — your message will be delivered when they're back.",
+    "diagnose_hint_prefix": "Local daemon down? Run ",
+    "diagnose_hint_suffix": " to check."
  },
  "status_pill": {
    "stages": {
-      "offline": "Offline",
+      "offline": "Runtime offline",
      "reconnecting": "Reconnecting",
      "queued": "Queued",
+      "stuck": "Daemon not responding",
      "starting_up": "Starting up",
      "thinking": "Thinking",
      "typing": "Typing"
--- a/packages/views/locales/zh-Hans/chat.json
+++ b/packages/views/locales/zh-Hans/chat.json
@@ -88,13 +88,16 @@
  "offline_banner": {
    "fallback_name": "智能体",
    "unstable": "{{name}} 的连接不稳定——回复可能延迟。",
-    "offline": "{{name}} 离线——你的消息将在它上线后发送。"
+    "offline": "{{name}} 离线——你的消息将在它上线后发送。",
+    "diagnose_hint_prefix": "本地 daemon 没起来？运行 ",
+    "diagnose_hint_suffix": " 排查。"
  },
  "status_pill": {
    "stages": {
-      "offline": "离线",
+      "offline": "Runtime 离线",
      "reconnecting": "重连中",
      "queued": "排队中",
+      "stuck": "Daemon 无响应",
      "starting_up": "启动中",
      "thinking": "思考中",
      "typing": "输入中"