refactor process text tests

Michael Yang 2025-01-29 15:29:30 -08:00
parent 624bfb0b11
commit 109ad1da0f
12 changed files with 472282 additions and 731 deletions


@@ -29,16 +29,16 @@ type Model struct {
func New(c ml.Config) (model.Model, error) {
return &Model{
BytePairEncoding: model.BytePairEncoding{
Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
Vocabulary: &model.Vocabulary{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
},
},
),
Layers: make([]Layer, c.Uint("block_count")),
Options: &Options{
hiddenSize: int64(c.Uint("embedding_length")),


@@ -8,6 +8,7 @@ import (
type Model struct {
model.Base
model.BytePairEncoding
*VisionModel `gguf:"v,vision"`
*TextModel
@@ -15,14 +16,22 @@ type Model struct {
Projector *nn.Linear `gguf:"mm.0"`
ImageProcessor
TextProcessor
}
func New(c ml.Config) (model.Model, error) {
return &Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
},
),
ImageProcessor: newImageProcessor(c),
VisionModel: newVisionModel(c),
TextProcessor: newTextProcessor(c),
TextModel: newTextModel(c),
}, nil
}


@@ -1,25 +0,0 @@
package mllama
import (
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
)
type TextProcessor struct {
model.BytePairEncoding
}
func newTextProcessor(c ml.Config) TextProcessor {
return TextProcessor{
BytePairEncoding: model.BytePairEncoding{
Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
Vocabulary: &model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
},
},
}
}


@@ -1,87 +0,0 @@
package mllama
import (
"encoding/json"
"errors"
"os"
"path/filepath"
"strconv"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/ollama/ollama/model"
)
func TestProcessText(t *testing.T) {
ours, err := model.New(filepath.Join("testdata", "model.bin"))
if errors.Is(err, os.ErrNotExist) {
t.Skip("no model.bin")
} else if err != nil {
t.Fatal(err)
}
t.Run("decode", func(t *testing.T) {
f, err := os.Open(filepath.Join("testdata", "theirs.json"))
if errors.Is(err, os.ErrNotExist) {
t.Skip("no theirs.json")
} else if err != nil {
t.Fatal(err)
}
defer f.Close()
var theirs [][]byte
if err := json.NewDecoder(f).Decode(&theirs); err != nil {
t.Fatal(err)
}
for id := range theirs {
ids := []int32{int32(id)}
s, err := ours.(model.TextProcessor).Decode(ids)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(string(theirs[id]), s); diff != "" {
t.Errorf("%d no match (-theirs +ours):\n%s", id, diff)
}
}
})
t.Run("encode", func(t *testing.T) {
f, err := os.Open(filepath.Join("..", "testdata", "inputs.json"))
if errors.Is(err, os.ErrNotExist) {
t.Skip("no inputs.json")
} else if err != nil {
t.Fatal(err)
}
defer f.Close()
var inputs []struct {
Values []byte `json:"base64"`
IDs []int32 `json:"ids"`
}
if err := json.NewDecoder(f).Decode(&inputs); err != nil {
t.Fatal(err)
}
for i, input := range inputs {
if i == 45 {
t.Skip("skip 45")
}
t.Run(strconv.Itoa(i), func(t *testing.T) {
ids, err := ours.(model.TextProcessor).Encode(string(input.Values))
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(input.IDs, ids, cmpopts.EquateEmpty()); diff != "" {
t.Errorf("%s: no match (-theirs +ours):\n%s", input.Values, diff)
}
})
}
})
}


@@ -1 +0,0 @@
/Users/michaelyang/git/ollama/library/nltpt/Llama-3.2-11B-Vision-Instruct/merged.gguf

File diff suppressed because one or more lines are too long


@@ -2,6 +2,7 @@ package model
import (
"cmp"
"iter"
"log/slog"
"strings"
"sync"
@@ -99,23 +100,29 @@ func (v *Vocabulary) Merge(left, right string) int {
}
type BytePairEncoding struct {
Pretokenizer string
*Vocabulary
pre *regexp2.Regexp
vocab *Vocabulary
}
func (bpe BytePairEncoding) split(s string) ([]string, error) {
re, err := regexp2.Compile(bpe.Pretokenizer, regexp2.Unicode|regexp2.RE2)
if err != nil {
return nil, err
func NewBytePairEncoding(pre string, vocab *Vocabulary) BytePairEncoding {
return BytePairEncoding{
pre: regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
vocab: vocab,
}
}
var matches []string
for m, _ := re.FindStringMatch(s); m != nil; m, _ = re.FindNextMatch(m) {
matches = append(matches, m.String())
func (bpe BytePairEncoding) Is(id uint32, special Special) bool {
return bpe.vocab.Is(id, special)
}
func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
return func(yield func(string) bool) {
for m, _ := bpe.pre.FindStringMatch(s); m != nil; m, _ = bpe.pre.FindNextMatch(m) {
if !yield(m.String()) {
break
}
}
}
return matches, nil
}
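A note on the refactor above: the pretokenizer regex is now compiled once in NewBytePairEncoding via regexp2.MustCompile, so split no longer returns an error, and it yields matches as an iter.Seq[string] instead of building an intermediate slice. A minimal sketch of consuming such an iterator from inside the model package (slices.Collect is what the new test below uses; bpe here stands for any BytePairEncoding value):

for match := range bpe.split("Hello, world!") {
	_ = match // each pretokenizer match, yielded lazily
}
// or collect every match at once (slices.Collect, Go 1.23+):
parts := slices.Collect(bpe.split("Hello, world!"))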
// fragment is a string fragment and its corresponding token IDs
@@ -138,9 +145,9 @@ type merge struct {
func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
fragments := []fragment{{value: s}}
for _, special := range bpe.Vocabulary.SpecialVocabulary() {
for _, special := range bpe.vocab.SpecialVocabulary() {
// TODO: process special tokens concurrently
id := bpe.Vocabulary.Encode(special)
id := bpe.vocab.Encode(special)
for i := 0; i < len(fragments); i++ {
frag := fragments[i]
if len(frag.ids) > 0 {
@@ -173,13 +180,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
continue
}
// split fragment using pretokenizer
splits, err := bpe.split(frag.value)
if err != nil {
return nil, err
}
for _, split := range splits {
for split := range bpe.split(frag.value) {
// TODO: process splits concurrently
var sb strings.Builder
for _, b := range []byte(split) {
@@ -197,7 +198,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
}
// short circuit if the fragment is in the vocabulary
if id := bpe.Vocabulary.Encode(sb.String()); id >= 0 {
if id := bpe.vocab.Encode(sb.String()); id >= 0 {
ids = append(ids, id)
slog.Debug("encoded", "text", sb.String(), "ids", []int32{id})
continue
@@ -219,7 +220,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
}
left, right := string(merges[a].runes), string(merges[b].runes)
rank := bpe.Vocabulary.Merge(left, right)
rank := bpe.vocab.Merge(left, right)
if rank < 0 {
return nil
}
@@ -271,7 +272,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
for _, merge := range merges {
if len(merge.runes) > 0 {
// TODO: handle the edge case where the rune isn't in the vocabulary
if id := bpe.Vocabulary.Encode(string(merge.runes)); id >= 0 {
if id := bpe.vocab.Encode(string(merge.runes)); id >= 0 {
ids = append(ids, id)
slog.Debug("encoded", "text", string(merge.runes), "ids", []int32{id})
}
@@ -286,7 +287,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
var sb strings.Builder
for _, id := range ids {
for _, r := range bpe.Vocabulary.Decode(id) {
for _, r := range bpe.vocab.Decode(id) {
switch {
case r == 0x0100:
// this produces 0x00 aka NULL
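For context, a minimal sketch of how a caller outside the package uses the refactored API shown above (it imports github.com/ollama/ollama/model; the helper name is illustrative, and tokens, types, and merges are placeholders for real vocabulary data):

func tokenizeRoundTrip(tokens []string, types []uint32, merges []string, text string) ([]int32, string, error) {
	bpe := model.NewBytePairEncoding(
		`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
		&model.Vocabulary{Values: tokens, Types: types, Merges: merges},
	)

	ids, err := bpe.Encode(text) // text -> token IDs
	if err != nil {
		return nil, "", err
	}

	s, err := bpe.Decode(ids) // token IDs -> text
	return ids, s, err
}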

model/process_text_test.go (new file, 247 lines)

@@ -0,0 +1,247 @@
package model
import (
"bufio"
"encoding/json"
"math"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
func llama(t testing.TB) BytePairEncoding {
t.Helper()
f, err := os.Open(filepath.Join("testdata", "llama3.2", "encoder.json"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
vocab := make(map[string]int32)
if err := json.NewDecoder(f).Decode(&vocab); err != nil {
t.Fatal(err)
}
types := make([]uint32, len(vocab))
tokens := make([]string, len(vocab))
for token, id := range vocab {
tokens[id] = token
types[id] = 1
}
for _, token := range []string{"<|begin_of_text|>", "<|end_of_text|>"} {
if _, ok := vocab[token]; !ok {
tokens = append(tokens, token) //nolint:makezero
types = append(types, 3) //nolint:makezero
vocab[token] = int32(len(vocab))
}
}
f, err = os.Open(filepath.Join("testdata", "llama3.2", "vocab.bpe"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
merges := make([]string, 0, 50000)
scanner := bufio.NewScanner(f)
for scanner.Scan() {
if !strings.HasPrefix(scanner.Text(), "#") {
merges = append(merges, scanner.Text())
}
}
return NewBytePairEncoding(
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
&Vocabulary{
Values: tokens,
Types: types,
Merges: merges,
},
)
}
func TestLlama(t *testing.T) {
tokenizer := llama(t)
t.Run("simple", func(t *testing.T) {
t.Parallel()
ids, err := tokenizer.Encode("hello world")
if err != nil {
t.Error(err)
}
if diff := cmp.Diff([]int32{15339, 1917}, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
s, err := tokenizer.Decode([]int32{15339, 1917})
if err != nil {
t.Fatal(err)
}
if s != "hello world" {
t.Errorf("got %q, want hello world", s)
}
ids, err = tokenizer.Encode("hello <|end_of_text|>")
if err != nil {
t.Error(err)
}
if diff := cmp.Diff([]int32{15339, 220, 128001}, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
})
t.Run("simple repeated", func(t *testing.T) {
t.Parallel()
cases := map[string][]int32{
strings.Repeat("0", 1): {15},
strings.Repeat("0", 2): {410},
strings.Repeat("0", 3): {931},
strings.Repeat("0", 4): {931, 15},
strings.Repeat("0", 5): {931, 410},
strings.Repeat("0", 6): {931, 931},
strings.Repeat("0", 7): {931, 931, 15},
strings.Repeat("0", 8): {931, 931, 410},
strings.Repeat("0", 9): {931, 931, 931},
strings.Repeat("0", 10): {931, 931, 931, 15},
strings.Repeat("0", 11): {931, 931, 931, 410},
strings.Repeat("0", 12): {931, 931, 931, 931},
strings.Repeat("0", 13): {931, 931, 931, 931, 15},
strings.Repeat("0", 14): {931, 931, 931, 931, 410},
strings.Repeat("0", 15): {931, 931, 931, 931, 931},
strings.Repeat("0", 16): {931, 931, 931, 931, 931, 15},
strings.Repeat("0", 17): {931, 931, 931, 931, 931, 410},
}
for s, want := range cases {
ids, err := tokenizer.Encode(s)
if err != nil {
t.Error(err)
}
if diff := cmp.Diff(want, ids); diff != "" {
t.Errorf("%q no match (-theirs +ours):\n%s", s, diff)
}
}
})
t.Run("basic roundtrip", func(t *testing.T) {
t.Parallel()
cases := []string{
"hello",
"hello ",
"hello ",
" hello",
" hello ",
" hello ",
"hello world",
"请考试我的软件12345",
}
for _, want := range cases {
ids, err := tokenizer.Encode(want)
if err != nil {
t.Error(err)
}
if got, err := tokenizer.Decode(ids); err != nil {
t.Fatal(err)
} else if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
})
t.Run("special", func(t *testing.T) {
t.Parallel()
cases := map[string][]int32{
"<|begin_of_text|>A B!": {128000, 32, 426, 0},
"<|begin_of_text|>A<|end_of_text|>B!": {128000, 32, 128001, 33, 0},
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!": {128000, 32, 128001, 33, 128000, 0},
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!<|end_of_text|>": {128000, 32, 128001, 33, 128000, 0, 128001},
}
for s, want := range cases {
ids, err := tokenizer.Encode(s)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(want, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
}
})
t.Run("split", func(t *testing.T) {
t.Parallel()
cases := map[string][]string{
"Hello World!": {"Hello", " World", "!"},
"I'm don't won't": {"I", "'m", " don", "'t", " won", "'t"},
"In 2024 there are 366 days": {"In", " ", "202", "4", " there", " are", " ", "366", " days"},
"Hello!! ...world": {"Hello", "!!", " ...", "world"},
"Hello World": {"Hello", " ", " World"},
"Hello\nWorld": {"Hello", "\n", "World"},
"Hello, WORLD!! How's it going?": {"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?"},
}
for s, want := range cases {
got := slices.Collect(tokenizer.split(s))
if diff := cmp.Diff(want, got); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
}
})
}
func Benchmark(b *testing.B) {
tokenizer := llama(b)
bts, err := os.ReadFile(filepath.Join("testdata", "war-and-peace.txt"))
if err != nil {
b.Fatal(err)
}
for i := range 8 {
n := min(int(math.Pow10(i)), len(bts))
bts := bts[:n]
b.Run("encode"+strconv.Itoa(n), func(b *testing.B) {
b.ResetTimer()
for range b.N {
_, err := tokenizer.Encode(string(bts))
if err != nil {
b.Fatal(err)
}
}
})
b.Run("decode"+strconv.Itoa(n), func(b *testing.B) {
ids, err := tokenizer.Encode(string(bts))
if err != nil {
b.Fatal(err)
}
b.ResetTimer()
for range b.N {
_, err := tokenizer.Decode(ids)
if err != nil {
b.Fatal(err)
}
}
})
}
}


@@ -1,586 +0,0 @@
[
{
"base64": "aWVkIDQgwr0gbW9udGhz",
"ids": [
1142,
220,
19,
220,
27154,
4038
]
},
{
"base64": "RsO8aHJlcg==",
"ids": [
37,
51853,
261
]
},
{
"base64": "",
"ids": []
},
{
"base64": "IA==",
"ids": [
220
]
},
{
"base64": "ICA=",
"ids": [
256
]
},
{
"base64": "ICAg",
"ids": [
262
]
},
{
"base64": "CQ==",
"ids": [
197
]
},
{
"base64": "Cg==",
"ids": [
198
]
},
{
"base64": "Cgo=",
"ids": [
271
]
},
{
"base64": "CgoK",
"ids": [
1432
]
},
{
"base64": "CQo=",
"ids": [
1602
]
},
{
"base64": "SGVsbG8gd29ybGQ=",
"ids": [
9906,
1917
]
},
{
"base64": "IEhlbGxvIHdvcmxk",
"ids": [
22691,
1917
]
},
{
"base64": "SGVsbG8gV29ybGQ=",
"ids": [
9906,
4435
]
},
{
"base64": "IEhlbGxvIFdvcmxk",
"ids": [
22691,
4435
]
},
{
"base64": "IEhlbGxvIFdvcmxkIQ==",
"ids": [
22691,
4435,
0
]
},
{
"base64": "SGVsbG8sIHdvcmxkIQ==",
"ids": [
9906,
11,
1917,
0
]
},
{
"base64": "IEhlbGxvLCB3b3JsZCE=",
"ids": [
22691,
11,
1917,
0
]
},
{
"base64": "IHRoaXMgaXMg8J+mmS5jcHA=",
"ids": [
420,
374,
11410,
99,
247,
13,
11055
]
},
{
"base64": "dzA0OCA3dHVpamsgZHNkZmh1",
"ids": [
86,
23904,
220,
22,
83,
2005,
42908,
11729,
3013,
17156
]
},
{
"base64": "0L3QtdGJ0L4g0L3QsCDQkdGK0LvQs9Cw0YDRgdC60Lg=",
"ids": [
79862,
102118,
13373,
64571,
34694,
3114,
112203,
80112
]
},
{
"base64": "4Z6A4Z624Z6T4Z+L4Z6P4Z+C4Z6W4Z634Z6f4Z+B4Z6f4Z6i4Z624Z6F4Z6B4Z6b4Z6F4Z+B4Z6J",
"ids": [
21549,
222,
98629,
241,
45358,
233,
21549,
237,
45358,
224,
21549,
244,
21549,
115,
21549,
253,
45358,
223,
21549,
253,
21549,
95,
98629,
227,
21549,
223,
21549,
249,
21549,
227,
45358,
223,
21549,
231
]
},
{
"base64": "8J+agCAobm9ybWFsKSDwn5i24oCN8J+Mq++4jyAobXVsdGlwbGUgZW1vamlzIGNvbmNhdGVuYXRlZCkg4pyFIChvbmx5IGVtb2ppIHRoYXQgaGFzIGl0cyBvd24gdG9rZW4p",
"ids": [
9468,
248,
222,
320,
8416,
8,
27623,
114,
102470,
9468,
234,
104,
31643,
320,
36773,
100166,
98634,
8,
26602,
227,
320,
3323,
43465,
430,
706,
1202,
1866,
4037,
8
]
},
{
"base64": "SGVsbG8=",
"ids": [
9906
]
},
{
"base64": "IEhlbGxv",
"ids": [
22691
]
},
{
"base64": "ICBIZWxsbw==",
"ids": [
220,
22691
]
},
{
"base64": "ICAgSGVsbG8=",
"ids": [
256,
22691
]
},
{
"base64": "ICAgIEhlbGxv",
"ids": [
262,
22691
]
},
{
"base64": "ICAgIEhlbGxvCiAgICBIZWxsbw==",
"ids": [
262,
22691,
198,
262,
22691
]
},
{
"base64": "ICg=",
"ids": [
320
]
},
{
"base64": "CiA9",
"ids": [
198,
284
]
},
{
"base64": "JyBlcmE=",
"ids": [
6,
11639
]
},
{
"base64": "SGVsbG8sIHknYWxsISBIb3cgYXJlIHlvdSDwn5iBID/miJHmg7PlnKhhcHBsZeW3peS9nDEzMTQxNTHlpKnvvZ4=",
"ids": [
9906,
11,
379,
65948,
0,
2650,
527,
499,
27623,
223,
949,
37046,
101067,
19000,
23182,
102301,
9263,
18136,
16,
36827,
21909
]
},
{
"base64": "ISEhISEh",
"ids": [
17523,
3001
]
},
{
"base64": "Mw==",
"ids": [
18
]
},
{
"base64": "MzM=",
"ids": [
1644
]
},
{
"base64": "MzMz",
"ids": [
8765
]
},
{
"base64": "MzMzMw==",
"ids": [
8765,
18
]
},
{
"base64": "MzMzMzM=",
"ids": [
8765,
1644
]
},
{
"base64": "MzMzMzMz",
"ids": [
8765,
8765
]
},
{
"base64": "MzMzMzMzMw==",
"ids": [
8765,
8765,
18
]
},
{
"base64": "MzMzMzMzMzM=",
"ids": [
8765,
8765,
1644
]
},
{
"base64": "MzMzMzMzMzMz",
"ids": [
8765,
8765,
8765
]
},
{
"base64": "Q+G7rWEgVmnhu4d0",
"ids": [
34,
91163,
101798
]
},
{
"base64": "IGRpc2NhcmRz",
"ids": [
2624,
2402
]
},
{
"base64": "CiAKCiAKCgogCSAJCSAJCiAgCiAgIAogICAgCiAgICAgCvCfmoAgKG5vcm1hbCkg8J+YtuKAjfCfjKvvuI8gKG11bHRpcGxlIGVtb2ppcyBjb25jYXRlbmF0ZWQpIOKchSDwn6aZ8J+mmSAzIDMzIDMzMyAzMzMzIDMzMzMzIDMzMzMzMyAzMzMzMzMzIDMzMzMzMzMzIDMuMyAzLi4zIDMuLi4zIOGegOGetuGek+Gfi+Gej+GfguGeluGet+Gen+GfgeGen+GeouGetuGehfCfmIEgP+aIkeaDs+WcqGFwcGxl5bel5L2cMTMxNDE1MeWkqe+9niAtLS0tLS09PT09PT09INC90LXRidC+INC90LAg0JHRitC70LPQsNGA0YHQutC4ICcnJycnJ2BgYGBgYGAiIiIiLi4uLi4uISEhISEhPz8/Pz8/IEkndmUgYmVlbiAndG9sZCBoZSdzIHRoZXJlLCAnUkUgeW91IHN1cmU/ICdNIG5vdCBzdXJlIEknbGwgbWFrZSBpdCwgJ0QgeW91IGxpa2Ugc29tZSB0ZWE/IFdlJ1ZlIGEnbEw=",
"ids": [
198,
4815,
15073,
66597,
8004,
1602,
2355,
79772,
11187,
9468,
248,
222,
320,
8416,
8,
27623,
114,
102470,
9468,
234,
104,
31643,
320,
36773,
100166,
98634,
8,
26602,
227,
11410,
99,
247,
9468,
99,
247,
220,
18,
220,
1644,
220,
8765,
220,
8765,
18,
220,
8765,
1644,
220,
8765,
8765,
220,
8765,
8765,
18,
220,
8765,
8765,
1644,
220,
18,
13,
18,
220,
18,
497,
18,
220,
18,
1131,
18,
220,
21549,
222,
98629,
241,
45358,
233,
21549,
237,
45358,
224,
21549,
244,
21549,
115,
21549,
253,
45358,
223,
21549,
253,
21549,
95,
98629,
227,
76460,
223,
949,
37046,
101067,
19000,
23182,
102301,
9263,
18136,
16,
36827,
21909,
56560,
54337,
19175,
102118,
13373,
64571,
34694,
3114,
112203,
80112,
3436,
106451,
14196,
14196,
74694,
3089,
3089,
29249,
17523,
3001,
27708,
7801,
358,
3077,
1027,
364,
83,
820,
568,
596,
1070,
11,
364,
793,
499,
2771,
30,
364,
44,
539,
2771,
358,
3358,
1304,
433,
11,
364,
35,
499,
1093,
1063,
15600,
30,
1226,
6,
43712,
264,
64966,
43
]
}
]

model/testdata/llama3.2/encoder.json (new vendored file, 128002 lines)

File diff suppressed because it is too large

model/testdata/llama3.2/vocab.bpe (new vendored file, 280147 lines)

File diff suppressed because it is too large

model/testdata/war-and-peace.txt (new vendored file, 63845 lines)

File diff suppressed because it is too large