Compare commits

...

2 Commits

Author SHA1 Message Date
Jiayuan Zhang
252b7a7f40 fix(chat): require online availability before flipping pill to stuck
Without this gate, a slow presence query (which chat-window surfaces as
`availability === undefined` precisely so callers don't speculate) would
flip the pill to "Daemon not responding" 30s in, accusing a possibly
healthy daemon. Restrict the stuck stage to affirmative online evidence;
when presence is loading, hold the queued label.

Co-authored-by: multica-agent <github@multica.ai>
2026-05-10 14:45:44 +08:00
Jiayuan Zhang
6fc9ce2724 fix(chat): differentiate stuck-queue states from a healthy queue
When a user's local daemon is offline or stops claiming tasks, the chat
StatusPill used to sit on "排队中" / "Queued" indefinitely with no clue
that anything was wrong (GH #2341). The runtime sweeper takes ~150s to
flip runtime.status, and even when it does the only diagnostic was a
single-word "Offline" label.

- Add a STUCK_THRESHOLD_SECS=30 escalation: queued/dispatched past 30s
  now renders a static "Daemon not responding" / "Daemon 无响应" stage
  even when availability still reports online — covers the sweeper-lag
  window and the heartbeating-but-not-claiming edge case.
- Rename the offline-stage label to "Runtime offline" / "Runtime 离线"
  so it points at the actual failure surface instead of the agent.
- Append a `multica daemon logs -f` diagnostic hint to the offline
  banner so users have a concrete next step.
- Extract pickStageKeys for testing and add unit coverage for the new
  stuck-detection branches and the "offline always wins" precedence.

Co-authored-by: multica-agent <github@multica.ai>
2026-05-10 14:40:38 +08:00
5 changed files with 152 additions and 9 deletions

View File

@@ -38,11 +38,18 @@ export function OfflineBanner({ agentName, availability }: Props) {
}
return (
<div className="px-5 mb-1.5">
<div className="mx-auto flex w-full max-w-4xl items-center gap-1.5 rounded-md px-2.5 py-1.5 text-xs bg-muted text-muted-foreground ring-1 ring-border">
<div className="mx-auto flex w-full max-w-4xl flex-wrap items-center gap-x-1.5 gap-y-0.5 rounded-md px-2.5 py-1.5 text-xs bg-muted text-muted-foreground ring-1 ring-border">
<WifiOff className="size-3.5 shrink-0" />
<span className="truncate">
{t(($) => $.offline_banner.offline, { name })}
</span>
<span className="ml-auto truncate">
{t(($) => $.offline_banner.diagnose_hint_prefix)}
<code className="rounded bg-background/60 px-1 py-0.5 font-mono text-[10px]">
multica daemon logs -f
</code>
{t(($) => $.offline_banner.diagnose_hint_suffix)}
</span>
</div>
</div>
);

View File

@@ -0,0 +1,99 @@
import { describe, expect, it } from "vitest";
import type { TaskMessagePayload } from "@multica/core/types";
import { pickStageKeys } from "./task-status-pill";
const NO_MSGS: readonly TaskMessagePayload[] = [];
// Unit coverage for `pickStageKeys`, the pure stage decision behind the chat
// status pill. Every case calls it as
// `pickStageKeys(status, taskMessages, availability, elapsedSecs)` and pins
// the returned translation keys. The suites are grouped by the precedence
// the function applies to queued/dispatched tasks: offline → reconnecting →
// stuck → plain queued / starting_up.
describe("pickStageKeys", () => {
  describe("queued / dispatched + presence", () => {
    it("offline + queued → static offline label (unambiguous runtime-down state)", () => {
      expect(pickStageKeys("queued", NO_MSGS, "offline", 5)).toEqual({
        stageKey: "offline",
        static: true,
      });
    });

    it("offline + dispatched → static offline (same runtime-down treatment)", () => {
      expect(pickStageKeys("dispatched", NO_MSGS, "offline", 5)).toEqual({
        stageKey: "offline",
        static: true,
      });
    });

    it("unstable + queued → reconnecting (transient amber state, not stuck)", () => {
      expect(pickStageKeys("queued", NO_MSGS, "unstable", 5)).toEqual({
        stageKey: "reconnecting",
      });
    });

    it("unstable + queued + elapsed > 30s → still reconnecting (stuck requires online availability)", () => {
      // The stuck stage is gated on `availability === "online"`, so an
      // unstable runtime keeps the reconnecting label no matter how long
      // the task has been waiting.
      expect(pickStageKeys("queued", NO_MSGS, "unstable", 120)).toEqual({
        stageKey: "reconnecting",
      });
    });
  });

  describe("stuck-detection while runtime appears online", () => {
    // Reproduction of the GH #2341 footgun: backend has not yet swept the
    // dead daemon, so availability is "online" while the task sits queued
    // forever. The 30s threshold gives the user a diagnostic cue well
    // before the backend's ~150s sweep window expires.
    it("queued + online + elapsed < 30s → normal queued (brief queueing is healthy)", () => {
      expect(pickStageKeys("queued", NO_MSGS, "online", 5)).toEqual({
        stageKey: "queued",
      });
    });

    it("queued + online + elapsed 29s → still queued (last second below the threshold)", () => {
      // Pins the lower side of the boundary: together with the "exactly
      // 30s" case below this catches an accidental `>` ↔ `>=` swap or a
      // changed threshold value.
      expect(pickStageKeys("queued", NO_MSGS, "online", 29)).toEqual({
        stageKey: "queued",
      });
    });

    it("queued + online + elapsed exactly 30s → flips to static stuck", () => {
      expect(pickStageKeys("queued", NO_MSGS, "online", 30)).toEqual({
        stageKey: "stuck",
        static: true,
      });
    });

    it("dispatched + online + elapsed < 30s → starting_up (healthy hand-off, not stuck)", () => {
      expect(pickStageKeys("dispatched", NO_MSGS, "online", 5)).toEqual({
        stageKey: "starting_up",
      });
    });

    it("dispatched + online + elapsed > 30s → static stuck (daemon claimed but never started)", () => {
      expect(pickStageKeys("dispatched", NO_MSGS, "online", 60)).toEqual({
        stageKey: "stuck",
        static: true,
      });
    });

    it("queued + undefined availability + elapsed > 30s → stays queued (don't speculate while presence is loading)", () => {
      // chat-window passes `undefined` precisely so we DON'T render
      // speculative availability copy. "Stuck" is a diagnosis — it needs
      // affirmative evidence the runtime is online, otherwise a slow
      // presence query would falsely accuse a healthy daemon.
      expect(pickStageKeys("queued", NO_MSGS, undefined, 45)).toEqual({
        stageKey: "queued",
      });
    });

    it("dispatched + undefined availability + elapsed > 30s → stays starting_up (same no-speculation rule)", () => {
      expect(pickStageKeys("dispatched", NO_MSGS, undefined, 45)).toEqual({
        stageKey: "starting_up",
      });
    });

    it("offline always wins over stuck (clearer copy + the stuck label would be redundant)", () => {
      // Even when elapsed is well past the stuck threshold, an offline
      // runtime gets the "Runtime offline" label — it's a more specific
      // diagnosis than the generic stuck cue.
      expect(pickStageKeys("queued", NO_MSGS, "offline", 120)).toEqual({
        stageKey: "offline",
        static: true,
      });
    });
  });

  describe("running stage decisions are unaffected by elapsed", () => {
    it("running + no messages → thinking", () => {
      expect(pickStageKeys("running", NO_MSGS, "online", 5)).toEqual({
        stageKey: "thinking",
      });
    });

    it("running + text message → typing (and the stuck threshold doesn't fire)", () => {
      const msgs: TaskMessagePayload[] = [
        {
          task_id: "t1",
          issue_id: "",
          seq: 1,
          type: "text",
          content: "hi",
        },
      ];
      expect(pickStageKeys("running", msgs, "online", 999)).toEqual({
        stageKey: "typing",
      });
    });
  });
});

View File

@@ -26,10 +26,19 @@ type StageKey =
| "offline"
| "reconnecting"
| "queued"
| "stuck"
| "starting_up"
| "thinking"
| "typing";
// Stuck-escalation threshold, in seconds. Once a task has been
// queued/dispatched for at least this long (pickStageKeys compares with
// `elapsedSecs >= STUCK_THRESHOLD_SECS`, so the threshold itself counts)
// while availability is affirmatively "online", we treat the wait as
// genuinely stuck and show the static "stuck" stage.
//
// Why so far below the sweep window: the backend's runtime-sweep gap
// (~150s after a daemon dies before runtime.status flips to offline) means
// a task can spend its whole life "queued · online" while the daemon is
// actually dead — the user should see a diagnostic cue well before that
// 150s window expires.
const STUCK_THRESHOLD_SECS = 30;
type ToolKey =
| "running_command"
| "reading_files"
@@ -56,10 +65,11 @@ const TOOL_KEY_BY_SLUG: Record<string, Exclude<ToolKey, "fallback">> = {
// Pure stage decision returning translation keys. The hook below maps these
// keys into localized labels — keeping the decision pure makes it easy to
// follow the priority rules without translation noise.
function pickStageKeys(
export function pickStageKeys(
status: string | undefined,
taskMessages: readonly TaskMessagePayload[],
availability: AgentAvailability | undefined,
elapsedSecs: number,
): { stageKey: StageKey; toolKey?: ToolKey; static?: boolean } {
if (
(status === "queued" || status === "dispatched") &&
@@ -73,6 +83,26 @@ function pickStageKeys(
) {
return { stageKey: "reconnecting" };
}
// Queued / dispatched too long while the runtime still appears online.
// The backend-reported "online" status lags up to the runtime-sweeper's
// ~150s window, so this state legitimately means "daemon is heartbeating
// (or recently was) but isn't picking up the task". A static label flagged
// as stuck gives the user something to act on instead of an unbounded
// "queued · 90s · 120s · …" timer.
//
// Gated on `availability === "online"` (NOT `!== "offline"`): when
// presence is still loading or temporarily unavailable, chat-window
// surfaces it as `undefined` precisely so we don't speculate about
// reachability. Treating undefined as "stuck" would slap a "Daemon not
// responding" diagnosis onto users whose runtime might be perfectly
// healthy but whose presence query is slow.
if (
(status === "queued" || status === "dispatched") &&
availability === "online" &&
elapsedSecs >= STUCK_THRESHOLD_SECS
) {
return { stageKey: "stuck", static: true };
}
if (status === "queued") return { stageKey: "queued" };
if (status === "dispatched") return { stageKey: "starting_up" };
@@ -103,10 +133,11 @@ function useResolveStage(): (
status: string | undefined,
taskMessages: readonly TaskMessagePayload[],
availability: AgentAvailability | undefined,
elapsedSecs: number,
) => Stage {
const { t } = useT("chat");
return (status, taskMessages, availability) => {
const decision = pickStageKeys(status, taskMessages, availability);
return (status, taskMessages, availability, elapsedSecs) => {
const decision = pickStageKeys(status, taskMessages, availability, elapsedSecs);
if (decision.toolKey) {
return {
label: t(($) => $.status_pill.tools[decision.toolKey!]),
@@ -151,7 +182,7 @@ export function TaskStatusPill({
// running; we trust that observation over a stale cache.
const status = taskMessages.length > 0 ? "running" : pendingTask.status;
const elapsedSecs = Math.max(0, Math.floor((now - anchor) / 1000));
const stage = resolveStage(status, taskMessages, availability);
const stage = resolveStage(status, taskMessages, availability, elapsedSecs);
return (
<div

View File

@@ -92,13 +92,16 @@
"offline_banner": {
"fallback_name": "the agent",
"unstable": "{{name}}'s connection is unstable — replies may be delayed.",
"offline": "{{name}} is offline — your message will be delivered when they're back."
"offline": "{{name}} is offline — your message will be delivered when they're back.",
"diagnose_hint_prefix": "Local daemon down? Run ",
"diagnose_hint_suffix": " to check."
},
"status_pill": {
"stages": {
"offline": "Offline",
"offline": "Runtime offline",
"reconnecting": "Reconnecting",
"queued": "Queued",
"stuck": "Daemon not responding",
"starting_up": "Starting up",
"thinking": "Thinking",
"typing": "Typing"

View File

@@ -88,13 +88,16 @@
"offline_banner": {
"fallback_name": "智能体",
"unstable": "{{name}} 的连接不稳定——回复可能延迟。",
"offline": "{{name}} 离线——你的消息将在它上线后发送。"
"offline": "{{name}} 离线——你的消息将在它上线后发送。",
"diagnose_hint_prefix": "本地 daemon 没起来?运行 ",
"diagnose_hint_suffix": " 排查。"
},
"status_pill": {
"stages": {
"offline": "离线",
"offline": "Runtime 离线",
"reconnecting": "重连中",
"queued": "排队中",
"stuck": "Daemon 无响应",
"starting_up": "启动中",
"thinking": "思考中",
"typing": "输入中"