model: fix issues with spm tokenizer for Gemma 3 (#10081)

Jeffrey Morgan
2025-04-02 13:22:56 -07:00
committed by GitHub
parent b42970063d
commit b51e0f397c
5 changed files with 175 additions and 113 deletions


@@ -45,7 +45,6 @@ func newTextModel(c ml.Config) *TextModel {
 	m := TextModel{
 		SentencePieceModel: model.NewSentencePieceModel(
-			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Scores: c.Floats("tokenizer.ggml.scores"),
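For reference, a minimal sketch (not taken from the commit) of how the tokenizer construction might look after this change, assuming the removed line is the pretokenizer regex shown above and that NewSentencePieceModel then accepts only the vocabulary; the config keys and struct fields are copied from the hunk, while the package name and helper function are illustrative:

package gemma3

import (
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
)

// newSPMTokenizer is a hypothetical helper sketching the post-change call,
// assuming NewSentencePieceModel takes only a *model.Vocabulary and no longer
// needs a pretokenizer regex. Only the fields visible in the hunk are set;
// remaining vocabulary fields are elided as in the truncated diff.
func newSPMTokenizer(c ml.Config) model.SentencePieceModel {
	return model.NewSentencePieceModel(
		&model.Vocabulary{
			Values: c.Strings("tokenizer.ggml.tokens"),
			Scores: c.Floats("tokenizer.ggml.scores"),
		},
	)
}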