// trailingWhitespaceLen returns the number of bytes (not runes) of whitespace
// at the end of s, where whitespace is defined by unicode.IsSpace.
//
// The string is examined rune by rune rather than byte by byte. Converting
// individual bytes to runes misclassifies continuation bytes of multi-byte
// UTF-8 sequences: for example, the final byte of "à" (0xC3 0xA0) would be
// read as U+00A0 (no-break space), and the reported length would split the
// rune in half.
//
// Invalid UTF-8 at the end of s decodes to U+FFFD, which is not whitespace,
// so scanning stops there and the invalid bytes are never counted.
//
// It returns 0 for the empty string and len(s) when s is entirely whitespace.
func trailingWhitespaceLen(s string) int {
	// TrimRightFunc walks backwards over whole runes, so the difference in
	// length is exactly the byte width of the trailing whitespace runes.
	return len(s) - len(strings.TrimRightFunc(s, unicode.IsSpace))
}
+ input: "مرحبا", + wantEvents: []qwenEvent{ + qwenEventContent{content: "مرحبا"}, + }, + }, + { + input: " \n", + wantEvents: []qwenEvent{}, + }, + { + input: "世界", + wantEvents: []qwenEvent{ + qwenEventContent{content: " \n世界"}, + }, + }, + }, + }, + { + desc: "non-breaking space withheld across chunks", + steps: []step{ + { + input: "Hello\u00a0", + wantEvents: []qwenEvent{ + qwenEventContent{content: "Hello"}, + }, + }, + { + input: "world", + wantEvents: []qwenEvent{ + qwenEventContent{content: "\u00a0world"}, + }, + }, + }, + }, + { + desc: "ideographic space before partial tool", + steps: []step{ + { + input: "Hello\u3000abc", + wantEvents: []qwenEvent{}, + }, + { + input: "def", + wantEvents: []qwenEvent{ + qwenEventRawToolCall{raw: "abc"}, + qwenEventContent{content: "def"}, + }, + }, + }, + }, + { + desc: "ideographic space before partial tool fakeout", + steps: []step{ + { + input: "Hello\u3000abc", + wantEvents: []qwenEvent{ + qwenEventContent{content: "\u3000abc"}, + }, + }, + }, + }, + { + desc: "unicode with partial tool tag", + steps: []step{ + { + input: "测试🎯 b and a < b" }, }, }, + { + name: "unicode in function names and parameters", + tools: []api.Tool{}, + rawToolCall: ` + +北京 + + +Hello! 你好! 🌟 مرحبا + +`, + wantToolCall: api.ToolCall{ + Function: api.ToolCallFunction{ + Name: "获取天气", + Arguments: map[string]any{ + "城市": "北京", + "message": "Hello! 你好! 
🌟 مرحبا", + }, + }, + }, + }, } for i, step := range steps { @@ -360,6 +512,42 @@ ls && echo "a > b and a < b" } } +func TestTrailingWhitespaceLenUnicode(t *testing.T) { + cases := []struct { + name string + input string + want int + }{ + { + name: "ascii space", + input: "Hello ", + want: 1, + }, + { + name: "non-breaking space", + input: "Hello\u00a0", + want: 2, + }, + { + name: "ideographic space", + input: "Hello\u3000", + want: 3, + }, + { + name: "multiple runes of whitespace", + input: "Hi\u00a0\u3000", + want: 5, + }, + } + + for _, tc := range cases { + got := trailingWhitespaceLen(tc.input) + if got != tc.want { + t.Errorf("%s: trailingWhitespaceLen(%q) = %d, want %d", tc.name, tc.input, got, tc.want) + } + } +} + func TestQwenToolCallValueParsing(t *testing.T) { cases := []struct { desc string @@ -867,6 +1055,8 @@ func TestTrailingWhitespaceLen(t *testing.T) { {desc: "trailing whitespace with newlines", s: "abc \n", want: 2}, {desc: "only whitespace", s: " \n ", want: 4}, {desc: "leading whitespace doesn't count", s: " \n abc", want: 0}, + {desc: "unicode with trailing space", s: "测试🎯 ", want: 1}, + {desc: "unicode with trailing tab and newline", s: "مرحبا\t\n", want: 2}, } for _, tc := range cases { @@ -876,3 +1066,30 @@ func TestTrailingWhitespaceLen(t *testing.T) { } } } + +func TestOverlapFunction(t *testing.T) { + cases := []struct { + desc string + s string + delim string + want int + }{ + {desc: "no overlap", s: "hello", delim: "", want: 5}, + {desc: "partial overlap", s: "hello", want: 3}, + {desc: "unicode with partial overlap", s: "测试🎯", want: 3}, + {desc: "unicode string with no overlap", s: "مرحبا", delim: "", want: 0}, + {desc: "unicode at boundary", s: "世界<", delim: "", want: 1}, + {desc: "unicode delimiter single rune", s: "hello🔧", delim: "🔧工具", want: len("🔧")}, + {desc: "unicode delimiter multiple runes", s: "hello🔧工", delim: "🔧工具", want: len("🔧工")}, + } + + for _, tc := range cases { + t.Run(tc.desc, func(t *testing.T) { + got := 
overlap(tc.s, tc.delim) + if got != tc.want { + t.Errorf("overlap(%q, %q) = %d, want %d", tc.s, tc.delim, got, tc.want) + } + }) + } +}