mirror of
https://github.com/ollama/ollama.git
synced 2025-11-10 23:37:17 +01:00
multi-regexp pretokenizer (#12325)
This commit is contained in:
@@ -5,6 +5,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"iter"
|
"iter"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"slices"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/dlclark/regexp2"
|
"github.com/dlclark/regexp2"
|
||||||
@@ -13,16 +14,28 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type BytePairEncoding struct {
|
type BytePairEncoding struct {
|
||||||
pre *regexp2.Regexp
|
|
||||||
vocab *Vocabulary
|
vocab *Vocabulary
|
||||||
|
regexps []*regexp2.Regexp
|
||||||
}
|
}
|
||||||
|
|
||||||
var _ TextProcessor = (*BytePairEncoding)(nil)
|
var _ TextProcessor = (*BytePairEncoding)(nil)
|
||||||
|
|
||||||
func NewBytePairEncoding(pre string, vocab *Vocabulary) BytePairEncoding {
|
func NewBytePairEncoding(vocab *Vocabulary, pretokenizers ...string) BytePairEncoding {
|
||||||
|
if len(pretokenizers) == 0 {
|
||||||
|
// set default byte-level pretokenizer if none provided, e.g.
|
||||||
|
// https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/pre_tokenizers/byte_level.rs#L44
|
||||||
|
pretokenizers = []string{`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`}
|
||||||
|
}
|
||||||
|
|
||||||
return BytePairEncoding{
|
return BytePairEncoding{
|
||||||
pre: regexp2.MustCompile(pre, regexp2.None),
|
|
||||||
vocab: vocab,
|
vocab: vocab,
|
||||||
|
regexps: slices.Collect(func(yield func(*regexp2.Regexp) bool) {
|
||||||
|
for _, p := range pretokenizers {
|
||||||
|
if !yield(regexp2.MustCompile(p, regexp2.RE2)) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -35,13 +48,36 @@ func (bpe BytePairEncoding) Is(id int32, special Special) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
|
func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
|
||||||
return func(yield func(string) bool) {
|
parts := []string{s}
|
||||||
for m, _ := bpe.pre.FindStringMatch(s); m != nil; m, _ = bpe.pre.FindNextMatch(m) {
|
for _, re := range bpe.regexps {
|
||||||
|
parts = slices.Collect(func(yield func(string) bool) {
|
||||||
|
for _, part := range parts {
|
||||||
|
r := []rune(part)
|
||||||
|
var offset int
|
||||||
|
for m, _ := re.FindRunesMatch(r); m != nil; m, _ = re.FindNextMatch(m) {
|
||||||
|
if offset-m.Index != 0 {
|
||||||
|
if !yield(string(r[:m.Index])) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if !yield(m.String()) {
|
if !yield(m.String()) {
|
||||||
break
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
offset = m.Index + m.Length
|
||||||
|
}
|
||||||
|
|
||||||
|
if offset < len(r) {
|
||||||
|
if !yield(string(r[offset:])) {
|
||||||
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
return slices.Values(parts)
|
||||||
}
|
}
|
||||||
|
|
||||||
// fragment is a string fragment and their corresponding token IDs
|
// fragment is a string fragment and their corresponding token IDs
|
||||||
|
|||||||
@@ -59,12 +59,12 @@ func llama(t testing.TB) BytePairEncoding {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return NewBytePairEncoding(
|
return NewBytePairEncoding(
|
||||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
|
||||||
&Vocabulary{
|
&Vocabulary{
|
||||||
Values: tokens,
|
Values: tokens,
|
||||||
Types: types,
|
Types: types,
|
||||||
Merges: merges,
|
Merges: merges,
|
||||||
},
|
},
|
||||||
|
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -282,3 +282,41 @@ func BenchmarkBytePairEncoding(b *testing.B) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSplit(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
patterns,
|
||||||
|
want []string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "default",
|
||||||
|
want: []string{"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?", " 123", " 一二三"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "unicode",
|
||||||
|
patterns: []string{
|
||||||
|
"\\p{N}{1,3}",
|
||||||
|
`[一-龥-ゟ゠-ヿ]+`,
|
||||||
|
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
},
|
||||||
|
want: []string{"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?", " ", "123", " ", "一二三"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "individual digits",
|
||||||
|
patterns: []string{
|
||||||
|
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
},
|
||||||
|
want: []string{"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?", " ", "1", "2", "3", " 一二三"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tt := range cases {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
tokenizer := NewBytePairEncoding(nil, tt.patterns...)
|
||||||
|
if diff := cmp.Diff(tt.want, slices.Collect(tokenizer.split("Hello, WORLD!! How's it going? 123 一二三"))); diff != "" {
|
||||||
|
t.Errorf("no match (-theirs +ours):\n%s", diff)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -227,17 +227,6 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
m := Transformer{
|
m := Transformer{
|
||||||
TransformerBlocks: make([]TransformerBlock, c.Uint("block_count")),
|
TransformerBlocks: make([]TransformerBlock, c.Uint("block_count")),
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer",
|
|
||||||
strings.Join([]string{
|
|
||||||
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
|
|
||||||
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
|
|
||||||
`\p{N}{1,3}`,
|
|
||||||
` ?[^\s\p{L}\p{N}]+[\r\n/]*`,
|
|
||||||
`\s*[\r\n]+`,
|
|
||||||
`\s+(?!\S)`,
|
|
||||||
`\s+`,
|
|
||||||
}, "|"),
|
|
||||||
),
|
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
@@ -250,6 +239,15 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
strings.Join([]string{
|
||||||
|
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
|
||||||
|
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
|
||||||
|
`\p{N}{1,3}`,
|
||||||
|
` ?[^\s\p{L}\p{N}]+[\r\n/]*`,
|
||||||
|
`\s*[\r\n]+`,
|
||||||
|
`\s+(?!\S)`,
|
||||||
|
`\s+`,
|
||||||
|
}, "|"),
|
||||||
),
|
),
|
||||||
Options: Options{
|
Options: Options{
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
|
|||||||
@@ -54,10 +54,30 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
}
|
}
|
||||||
switch c.String("tokenizer.ggml.model") {
|
switch c.String("tokenizer.ggml.model") {
|
||||||
case "gpt2":
|
case "gpt2":
|
||||||
processor = model.NewBytePairEncoding(
|
var pretokenizers []string
|
||||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
switch c.String("tokenizer.ggml.pre") {
|
||||||
&vocabulary,
|
case "default":
|
||||||
)
|
// no-op use the default bpe pretokenizer
|
||||||
|
case "qwen2":
|
||||||
|
pretokenizers = []string{
|
||||||
|
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
}
|
||||||
|
case "refact":
|
||||||
|
pretokenizers = []string{
|
||||||
|
`\p{N}`,
|
||||||
|
`'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+`,
|
||||||
|
}
|
||||||
|
case "tekken":
|
||||||
|
pretokenizers = []string{
|
||||||
|
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
// use a llama-style pretokenizer
|
||||||
|
pretokenizers = []string{
|
||||||
|
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
processor = model.NewBytePairEncoding(&vocabulary, pretokenizers...)
|
||||||
case "llama":
|
case "llama":
|
||||||
processor = model.NewSentencePiece(&vocabulary)
|
processor = model.NewSentencePiece(&vocabulary)
|
||||||
default:
|
default:
|
||||||
|
|||||||
@@ -34,8 +34,6 @@ func (p *Projector) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor {
|
|||||||
func New(c fs.Config) (model.Model, error) {
|
func New(c fs.Config) (model.Model, error) {
|
||||||
m := Model{
|
m := Model{
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer",
|
|
||||||
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
@@ -48,6 +46,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||||
),
|
),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
VisionModel: newVisionModel(c),
|
VisionModel: newVisionModel(c),
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ var _ model.TextProcessor = (*Model)(nil)
|
|||||||
func New(c fs.Config) (model.Model, error) {
|
func New(c fs.Config) (model.Model, error) {
|
||||||
m := &Model{
|
m := &Model{
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
@@ -46,6 +45,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||||
),
|
),
|
||||||
TextModel: newTextModel(c),
|
TextModel: newTextModel(c),
|
||||||
VisionModel: newVisionModel(c),
|
VisionModel: newVisionModel(c),
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ const (
|
|||||||
func New(c fs.Config) (model.Model, error) {
|
func New(c fs.Config) (model.Model, error) {
|
||||||
m := Model{
|
m := Model{
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
@@ -46,6 +45,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||||
),
|
),
|
||||||
ImageProcessor: newImageProcessor(c),
|
ImageProcessor: newImageProcessor(c),
|
||||||
VisionModel: newVisionModel(c),
|
VisionModel: newVisionModel(c),
|
||||||
|
|||||||
@@ -139,7 +139,6 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
m := Model{
|
m := Model{
|
||||||
Layers: make([]DecoderLayer, c.Uint("block_count")),
|
Layers: make([]DecoderLayer, c.Uint("block_count")),
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
@@ -152,6 +151,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||||
),
|
),
|
||||||
Options: Options{
|
Options: Options{
|
||||||
hiddenSize: int(c.Uint("embedding_length")),
|
hiddenSize: int(c.Uint("embedding_length")),
|
||||||
|
|||||||
@@ -29,7 +29,6 @@ var _ model.MultimodalProcessor = (*Model)(nil)
|
|||||||
func New(c fs.Config) (model.Model, error) {
|
func New(c fs.Config) (model.Model, error) {
|
||||||
m := &Model{
|
m := &Model{
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
@@ -42,6 +41,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||||
),
|
),
|
||||||
TextModel: NewTextModel(c),
|
TextModel: NewTextModel(c),
|
||||||
VisionModel: newVisionModel(c),
|
VisionModel: newVisionModel(c),
|
||||||
|
|||||||
@@ -35,7 +35,6 @@ func newEmbed(c fs.Config) (model.Model, error) {
|
|||||||
}
|
}
|
||||||
m := embedModel{
|
m := embedModel{
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
@@ -48,6 +47,7 @@ func newEmbed(c fs.Config) (model.Model, error) {
|
|||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||||
),
|
),
|
||||||
Model: &Model{
|
Model: &Model{
|
||||||
Layers: layers,
|
Layers: layers,
|
||||||
|
|||||||
@@ -200,7 +200,6 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
|
|
||||||
m := Model{
|
m := Model{
|
||||||
BytePairEncoding: model.NewBytePairEncoding(
|
BytePairEncoding: model.NewBytePairEncoding(
|
||||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||||
@@ -213,6 +212,7 @@ func New(c fs.Config) (model.Model, error) {
|
|||||||
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
c.Ints("tokenizer.ggml.eos_token_ids")...,
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||||
),
|
),
|
||||||
Layers: layers,
|
Layers: layers,
|
||||||
Options: &Options{
|
Options: &Options{
|
||||||
|
|||||||
@@ -82,7 +82,6 @@ func modelHelper(t testing.TB) model.BytePairEncoding {
|
|||||||
merges := make([]string, 0, 1)
|
merges := make([]string, 0, 1)
|
||||||
// Only need vocab for Grammar Test
|
// Only need vocab for Grammar Test
|
||||||
return model.NewBytePairEncoding(
|
return model.NewBytePairEncoding(
|
||||||
``,
|
|
||||||
&model.Vocabulary{
|
&model.Vocabulary{
|
||||||
Values: tokens,
|
Values: tokens,
|
||||||
Types: make([]int32, len(vocab)),
|
Types: make([]int32, len(vocab)),
|
||||||
|
|||||||
Reference in New Issue
Block a user