mirror of
https://github.com/ollama/ollama.git
synced 2025-11-11 09:57:01 +01:00
parsers: fix unicode handling for qwen3-coder
When trimming whitespace at the end of every chunk, we were iterating backwards over the string byte-by-byte instead of rune-by-rune. As an example of how this can cause corruption, suppose we have the multi-byte character ✅ (`"\u2705"`), which is represented in utf-8 as the three bytes `0xE2 0x9C 0x85`. It happens that `0x85` is NEL, which passes `unicode.IsSpace()`. Because we were iterating byte-by-byte, this caused us to mistakenly slice in the middle of the rune, removing `0x85` and leaving `0xE2 0x9C`, which beyond being the incorrect place to slice, is not even a valid utf-8 character. `trailingWhitespaceLen()` was modified to count from the end in a rune-aware way. Tests with various multibyte unicode characters were also added. Fixes: #12414
This commit is contained in:
@@ -11,6 +11,7 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/logutil"
|
||||
@@ -204,12 +205,21 @@ func overlap(s, delim string) int {
|
||||
}
|
||||
|
||||
func trailingWhitespaceLen(s string) int {
|
||||
for i := len(s) - 1; i >= 0; i-- {
|
||||
if !unicode.IsSpace(rune(s[i])) {
|
||||
return len(s) - i - 1
|
||||
remaining := s
|
||||
total := 0
|
||||
for len(remaining) > 0 {
|
||||
r, size := utf8.DecodeLastRuneInString(remaining)
|
||||
// if it's an invalid utf8 rune, assume it isn't whitespace
|
||||
if r == utf8.RuneError && size == 1 {
|
||||
break
|
||||
}
|
||||
if !unicode.IsSpace(r) {
|
||||
break
|
||||
}
|
||||
total += size
|
||||
remaining = remaining[:len(remaining)-size]
|
||||
}
|
||||
return len(s)
|
||||
return total
|
||||
}
|
||||
|
||||
type XMLFunctionCall struct {
|
||||
|
||||
Reference in New Issue
Block a user