From 9e125d884cf995dfae7fcd74690d525e4326a517 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Mon, 16 Jun 2025 16:03:16 -0700
Subject: [PATCH] model: treat 'user defined' tokens as special tokens (#11077)

---
 model/vocabulary.go      |  2 +-
 model/vocabulary_test.go | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 model/vocabulary_test.go

diff --git a/model/vocabulary.go b/model/vocabulary.go
index 24adbaca32..a86de58dfa 100644
--- a/model/vocabulary.go
+++ b/model/vocabulary.go
@@ -87,7 +87,7 @@ func (v *Vocabulary) Decode(id int32) string {
 func (v *Vocabulary) SpecialVocabulary() []string {
 	v.specialOnce.Do(func() {
 		for i := range v.Values {
-			if v.Types[i] == TOKEN_TYPE_CONTROL {
+			if v.Types[i] == TOKEN_TYPE_CONTROL || v.Types[i] == TOKEN_TYPE_USER_DEFINED {
 				v.special = append(v.special, v.Values[i])
 			}
 		}
diff --git a/model/vocabulary_test.go b/model/vocabulary_test.go
new file mode 100644
index 0000000000..46f0ead23e
--- /dev/null
+++ b/model/vocabulary_test.go
@@ -0,0 +1,16 @@
+package model
+
+import "testing"
+
+func TestVocabulary_SpecialVocabulary(t *testing.T) {
+	vocab := &Vocabulary{
+		Values: []string{"<|startoftext|>", "<|endoftext|>", "<|tool_call_start|>", "<|tool_call_end|>", "hi"},
+		Types:  []int32{TOKEN_TYPE_CONTROL, TOKEN_TYPE_CONTROL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_NORMAL},
+	}
+
+	specialVocab := vocab.SpecialVocabulary()
+
+	if len(specialVocab) != 4 {
+		t.Errorf("expected 4 special tokens, got %d", len(specialVocab))
+	}
+}