fix(integrations/lark): union_id-first self mention strip + token-aware scan + local whitespace cleanup (MUL-2671)

Three review blockers on the mention rewrite from PR review:

1. isBotMention now mirrors containsMention's union_id-first policy.
   When the installation row knows our union_id, we trust it
   exclusively (open_id is structurally inverted in multi-bot
   groups — matching on it would re-introduce the routing bug we
   fixed two commits ago). open_id fallback fires only when
   union_id is absent. New tests: @-ing both bots in one message
   correctly strips only self and renders the sibling as @<name>;
   open_id-matches-but-union_id-differs does NOT strip.

2. resolveMentions no longer collapses or trims whitespace globally.
   Indentation, tabs, code blocks, tables — all preserved verbatim.
   When the self mention is removed we eat exactly one adjacent
   horizontal space (the one after the placeholder, or, when the
   mention sits at end-of-input, a single space already emitted
   right before it). New test exercises a multi-line indented +
   tabbed body and asserts the whole shape survives.

3. Prefix-collision-safe replacement. A chat with 11+ participants
   exposes both `@_user_1` and `@_user_10`; naive ReplaceAll for
   `@_user_1` would mangle the substring of `@_user_10`. The
   resolver now does a single-pass token scan with the mention
   list sorted longest-key-first, so the longer placeholder always
   wins at any scan position. New test covers the @_user_1 /
   @_user_10 case explicitly.

Also drops the temporary INFO-level diag logging the previous
commit added — root cause was confirmed (union_id swap in the
manual backfill; not a decoder bug).

Co-authored-by: multica-agent <github@multica.ai>
This commit is contained in:
J
2026-06-03 02:04:59 +08:00
parent 6321d3d038
commit 9b741cdcdc
2 changed files with 175 additions and 54 deletions

View File

@@ -4,6 +4,7 @@ import (
"encoding/json"
"errors"
"fmt"
"sort"
"strings"
db "github.com/multica-ai/multica/server/pkg/db/generated"
@@ -153,68 +154,93 @@ type larkMention struct {
// when name is empty (defensive — Lark always populates it in
// practice).
//
// Whitespace cleanup: runs of horizontal whitespace introduced by
// stripping the bot mention are collapsed to a single space, and
// leading/trailing horizontal whitespace per line is trimmed. Line
// breaks in the original message are preserved.
// Replacement is a single-pass token scan, not naive ReplaceAll. Two
// reasons:
//
// - Prefix collision: a chat with eleven @-mentions exposes keys
// `@_user_1` and `@_user_10`; ReplaceAll for `@_user_1` would
// mangle the substring of `@_user_10`. We sort keys by length
// DESC and try the longest match at each scan position so the
// longer placeholder always wins.
//
// - Whitespace fidelity: when we strip the bot mention we only
// touch a single space immediately adjacent to it — either the
// space after the placeholder, or, if there is none, a single
// trailing space already in the output. Tabs, indentation, code
// blocks, table pipes, and any other intentional whitespace in
// the user's message are preserved verbatim.
func resolveMentions(text string, mentions []larkMention, botOpenID, botUnionID string) string {
if text == "" || len(mentions) == 0 {
return text
}
// Filter empty keys and sort longest first so `@_user_10` is
// matched before `@_user_1` at any scan position.
sorted := make([]larkMention, 0, len(mentions))
for _, m := range mentions {
if m.Key == "" {
continue
if m.Key != "" {
sorted = append(sorted, m)
}
var rep string
switch {
case isBotMention(m, botOpenID, botUnionID):
rep = ""
case m.Name != "":
rep = "@" + m.Name
default:
continue
}
text = strings.ReplaceAll(text, m.Key, rep)
}
return tidyMentionWhitespace(text)
}
sort.SliceStable(sorted, func(i, j int) bool {
return len(sorted[i].Key) > len(sorted[j].Key)
})
func isBotMention(m larkMention, botOpenID, botUnionID string) bool {
if botUnionID != "" && m.ID.UnionID == botUnionID {
return true
}
if botOpenID != "" && m.ID.OpenID == botOpenID {
return true
}
return false
}
// tidyMentionWhitespace collapses runs of spaces/tabs and trims
// horizontal whitespace at the start/end of each line. Newlines are
// preserved so multi-line user messages survive intact.
func tidyMentionWhitespace(s string) string {
if s == "" {
return s
}
lines := strings.Split(s, "\n")
for i, line := range lines {
var b strings.Builder
b.Grow(len(line))
prevSpace := false
for _, r := range line {
if r == ' ' || r == '\t' {
if !prevSpace {
b.WriteByte(' ')
}
prevSpace = true
continue
out := make([]byte, 0, len(text))
i := 0
for i < len(text) {
var matched *larkMention
for idx := range sorted {
if strings.HasPrefix(text[i:], sorted[idx].Key) {
matched = &sorted[idx]
break
}
prevSpace = false
b.WriteRune(r)
}
lines[i] = strings.TrimSpace(b.String())
if matched == nil {
out = append(out, text[i])
i++
continue
}
end := i + len(matched.Key)
switch {
case isBotMention(*matched, botOpenID, botUnionID):
// Strip: eat one adjacent space (after the placeholder
// preferred; else backtrack one space we already emitted)
// so the seam is not left with a double space or a
// dangling leading space. Tabs / newlines / other chars
// are untouched.
if end < len(text) && text[end] == ' ' {
end++
} else if n := len(out); n > 0 && out[n-1] == ' ' {
out = out[:n-1]
}
case matched.Name != "":
out = append(out, '@')
out = append(out, matched.Name...)
default:
// Unknown mention — leave the placeholder intact so the
// agent at least sees a stable token.
out = append(out, matched.Key...)
}
i = end
}
return strings.Join(lines, "\n")
return string(out)
}
// isBotMention identifies whether a payload mention refers to THIS
// bot. Stays in lockstep with containsMention: when union_id is
// known we trust it exclusively (open_id is structurally inverted
// in multi-bot groups — matching on it would re-introduce the
// MUL-2671 routing bug). Only when union_id is missing do we fall
// back to open_id, which is correct in single-bot installs and the
// best we can do in pre-backfill rows.
func isBotMention(m larkMention, botOpenID, botUnionID string) bool {
if botUnionID != "" {
return m.ID.UnionID == botUnionID
}
if botOpenID == "" {
return false
}
return m.ID.OpenID == botOpenID
}
func extractTextBody(content string) string {

View File

@@ -248,7 +248,11 @@ func TestLarkJSONFrameDecoderMentionPlaceholderRewrite(t *testing.T) {
}
})
t.Run("preserves newlines and collapses spaces from stripped mention", func(t *testing.T) {
t.Run("preserves newlines after stripped mention", func(t *testing.T) {
// Strip the bot mention + one adjacent space; the newline that
// follows stays put so the rest of the message keeps its
// shape. User-typed extra spaces (the double space here) are
// preserved verbatim — we do not globally collapse whitespace.
inst := db.LarkInstallation{
BotOpenID: "ou_bot",
BotUnionID: pgText("on_bot"),
@@ -258,8 +262,8 @@ func TestLarkJSONFrameDecoderMentionPlaceholderRewrite(t *testing.T) {
if err != nil || !ok {
t.Fatalf("ok=%v err=%v", ok, err)
}
if msg.Body != "first line\nsecond line" {
t.Errorf("Body = %q; want %q", msg.Body, "first line\nsecond line")
if msg.Body != " first line\nsecond line" {
t.Errorf("Body = %q; want %q", msg.Body, " first line\nsecond line")
}
})
@@ -276,6 +280,97 @@ func TestLarkJSONFrameDecoderMentionPlaceholderRewrite(t *testing.T) {
t.Errorf("Body = %q; want %q", msg.Body, "just a normal message")
}
})
t.Run("preserves indentation and tabs around stripped mention", func(t *testing.T) {
// Code-block / indented messages: stripping the bot mention
// must not eat the surrounding indent, tabs, or any internal
// whitespace the user intentionally typed. We only consume a
// single space directly adjacent to the placeholder.
inst := db.LarkInstallation{
BotOpenID: "ou_bot",
BotUnionID: pgText("on_bot"),
}
mentions := `[{"key":"@_user_1","name":"My Bot","id":{"open_id":"ou_bot_wire","union_id":"on_bot"}}]`
raw := " @_user_1 review this snippet:\n\tfunc add(a, b int) int {\n\t\treturn a + b\n\t}"
want := " review this snippet:\n\tfunc add(a, b int) int {\n\t\treturn a + b\n\t}"
msg, ok, err := d.Decode(mkRaw(raw, mentions), inst)
if err != nil || !ok {
t.Fatalf("ok=%v err=%v", ok, err)
}
if msg.Body != want {
t.Errorf("Body = %q; want %q", msg.Body, want)
}
})
t.Run("avoids @_user_1 / @_user_10 prefix collision", func(t *testing.T) {
// Lark assigns mention keys positionally; a chat with eleven+
// participants exposes both `@_user_1` and `@_user_10`. Naive
// ReplaceAll for `@_user_1` would mangle `@_user_10`, so we
// match longest-first.
inst := db.LarkInstallation{
BotOpenID: "ou_bot",
BotUnionID: pgText("on_bot"),
}
mentions := `[
{"key":"@_user_1","name":"My Bot","id":{"open_id":"ou_bot_wire","union_id":"on_bot"}},
{"key":"@_user_10","name":"Alice","id":{"open_id":"ou_alice","union_id":"on_alice"}}
]`
raw := "@_user_1 forward this to @_user_10 please"
want := "forward this to @Alice please"
msg, ok, err := d.Decode(mkRaw(raw, mentions), inst)
if err != nil || !ok {
t.Fatalf("ok=%v err=%v", ok, err)
}
if msg.Body != want {
t.Errorf("Body = %q; want %q", msg.Body, want)
}
})
t.Run("@-ing both bots in one message strips only self, renders other by name", func(t *testing.T) {
// Multi-bot group chat where the user @-mentions BOTH bots in
// the same message. From this WS's perspective only the self
// mention should be stripped; the sibling bot renders as
// @<displayName> so the agent receives a faithful transcript
// of the user intent.
inst := db.LarkInstallation{
BotOpenID: "ou_self_canonical",
BotUnionID: pgText("on_self_union"),
}
mentions := `[
{"key":"@_user_1","name":"Self Bot","id":{"open_id":"ou_self_wire","union_id":"on_self_union"}},
{"key":"@_user_2","name":"Sibling Bot","id":{"open_id":"ou_sibling_wire","union_id":"on_sibling_union"}}
]`
raw := "@_user_1 @_user_2 please coordinate"
want := "@Sibling Bot please coordinate"
msg, ok, err := d.Decode(mkRaw(raw, mentions), inst)
if err != nil || !ok {
t.Fatalf("ok=%v err=%v", ok, err)
}
if msg.Body != want {
t.Errorf("Body = %q; want %q", msg.Body, want)
}
})
t.Run("open_id match does NOT strip when union_id known but differs", func(t *testing.T) {
// Mirror of containsMention's union_id-first rule: when we
// know our union_id, an open_id-only match means the mention
// is for the OTHER bot (the inverse-mapping quirk), so we
// must render it as @<name>, not strip it.
inst := db.LarkInstallation{
BotOpenID: "ou_self_canonical",
BotUnionID: pgText("on_self_union"),
}
mentions := `[{"key":"@_user_1","name":"Sibling Bot","id":{"open_id":"ou_self_canonical","union_id":"on_sibling_union"}}]`
raw := "@_user_1 hi"
want := "@Sibling Bot hi"
msg, ok, err := d.Decode(mkRaw(raw, mentions), inst)
if err != nil || !ok {
t.Fatalf("ok=%v err=%v", ok, err)
}
if msg.Body != want {
t.Errorf("Body = %q; want %q", msg.Body, want)
}
})
}
func TestLarkJSONFrameDecoderDropsHeartbeat(t *testing.T) {