mirror of
https://github.com/ollama/ollama.git
synced 2025-04-21 22:15:04 +02:00
refactor process text tests
This commit is contained in:
parent
624bfb0b11
commit
109ad1da0f
@ -29,16 +29,16 @@ type Model struct {
|
||||
|
||||
func New(c ml.Config) (model.Model, error) {
|
||||
return &Model{
|
||||
BytePairEncoding: model.BytePairEncoding{
|
||||
Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
Vocabulary: &model.Vocabulary{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Uints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
|
||||
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
|
||||
},
|
||||
},
|
||||
),
|
||||
Layers: make([]Layer, c.Uint("block_count")),
|
||||
Options: &Options{
|
||||
hiddenSize: int64(c.Uint("embedding_length")),
|
||||
|
@ -8,6 +8,7 @@ import (
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
*VisionModel `gguf:"v,vision"`
|
||||
*TextModel
|
||||
@ -15,14 +16,22 @@ type Model struct {
|
||||
Projector *nn.Linear `gguf:"mm.0"`
|
||||
|
||||
ImageProcessor
|
||||
TextProcessor
|
||||
}
|
||||
|
||||
func New(c ml.Config) (model.Model, error) {
|
||||
return &Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Uints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
|
||||
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
|
||||
},
|
||||
),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
VisionModel: newVisionModel(c),
|
||||
TextProcessor: newTextProcessor(c),
|
||||
TextModel: newTextModel(c),
|
||||
}, nil
|
||||
}
|
||||
|
@ -1,25 +0,0 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type TextProcessor struct {
|
||||
model.BytePairEncoding
|
||||
}
|
||||
|
||||
func newTextProcessor(c ml.Config) TextProcessor {
|
||||
return TextProcessor{
|
||||
BytePairEncoding: model.BytePairEncoding{
|
||||
Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
|
||||
Vocabulary: &model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Uints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
|
||||
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
@ -1,87 +0,0 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
func TestProcessText(t *testing.T) {
|
||||
ours, err := model.New(filepath.Join("testdata", "model.bin"))
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
t.Skip("no model.bin")
|
||||
} else if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
t.Run("decode", func(t *testing.T) {
|
||||
f, err := os.Open(filepath.Join("testdata", "theirs.json"))
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
t.Skip("no theirs.json")
|
||||
} else if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var theirs [][]byte
|
||||
if err := json.NewDecoder(f).Decode(&theirs); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for id := range theirs {
|
||||
ids := []int32{int32(id)}
|
||||
s, err := ours.(model.TextProcessor).Decode(ids)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(string(theirs[id]), s); diff != "" {
|
||||
t.Errorf("%d no match (-theirs +ours):\n%s", id, diff)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("encode", func(t *testing.T) {
|
||||
f, err := os.Open(filepath.Join("..", "testdata", "inputs.json"))
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
t.Skip("no inputs.json")
|
||||
} else if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var inputs []struct {
|
||||
Values []byte `json:"base64"`
|
||||
IDs []int32 `json:"ids"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(f).Decode(&inputs); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for i, input := range inputs {
|
||||
if i == 45 {
|
||||
t.Skip("skip 45")
|
||||
}
|
||||
|
||||
t.Run(strconv.Itoa(i), func(t *testing.T) {
|
||||
ids, err := ours.(model.TextProcessor).Encode(string(input.Values))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(input.IDs, ids, cmpopts.EquateEmpty()); diff != "" {
|
||||
t.Errorf("%s: no match (-theirs +ours):\n%s", input.Values, diff)
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
}
|
1
model/mllama/testdata/model.bin
vendored
1
model/mllama/testdata/model.bin
vendored
@ -1 +0,0 @@
|
||||
/Users/michaelyang/git/ollama/library/nltpt/Llama-3.2-11B-Vision-Instruct/merged.gguf
|
1
model/mllama/testdata/theirs.json
vendored
1
model/mllama/testdata/theirs.json
vendored
File diff suppressed because one or more lines are too long
@ -2,6 +2,7 @@ package model
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"iter"
|
||||
"log/slog"
|
||||
"strings"
|
||||
"sync"
|
||||
@ -99,23 +100,29 @@ func (v *Vocabulary) Merge(left, right string) int {
|
||||
}
|
||||
|
||||
type BytePairEncoding struct {
|
||||
Pretokenizer string
|
||||
|
||||
*Vocabulary
|
||||
pre *regexp2.Regexp
|
||||
vocab *Vocabulary
|
||||
}
|
||||
|
||||
func (bpe BytePairEncoding) split(s string) ([]string, error) {
|
||||
re, err := regexp2.Compile(bpe.Pretokenizer, regexp2.Unicode|regexp2.RE2)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
func NewBytePairEncoding(pre string, vocab *Vocabulary) BytePairEncoding {
|
||||
return BytePairEncoding{
|
||||
pre: regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
|
||||
vocab: vocab,
|
||||
}
|
||||
}
|
||||
|
||||
var matches []string
|
||||
for m, _ := re.FindStringMatch(s); m != nil; m, _ = re.FindNextMatch(m) {
|
||||
matches = append(matches, m.String())
|
||||
func (bpe BytePairEncoding) Is(id uint32, special Special) bool {
|
||||
return bpe.vocab.Is(id, special)
|
||||
}
|
||||
|
||||
func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
|
||||
return func(yield func(string) bool) {
|
||||
for m, _ := bpe.pre.FindStringMatch(s); m != nil; m, _ = bpe.pre.FindNextMatch(m) {
|
||||
if !yield(m.String()) {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return matches, nil
|
||||
}
|
||||
|
||||
// fragment is a string fragment and their corresponding token IDs
|
||||
@ -138,9 +145,9 @@ type merge struct {
|
||||
|
||||
func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
|
||||
fragments := []fragment{{value: s}}
|
||||
for _, special := range bpe.Vocabulary.SpecialVocabulary() {
|
||||
for _, special := range bpe.vocab.SpecialVocabulary() {
|
||||
// TODO: process special tokens concurrently
|
||||
id := bpe.Vocabulary.Encode(special)
|
||||
id := bpe.vocab.Encode(special)
|
||||
for i := 0; i < len(fragments); i++ {
|
||||
frag := fragments[i]
|
||||
if len(frag.ids) > 0 {
|
||||
@ -173,13 +180,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
|
||||
continue
|
||||
}
|
||||
|
||||
// split fragment using pretokenizer
|
||||
splits, err := bpe.split(frag.value)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, split := range splits {
|
||||
for split := range bpe.split(frag.value) {
|
||||
// TODO: process splits concurrently
|
||||
var sb strings.Builder
|
||||
for _, b := range []byte(split) {
|
||||
@ -197,7 +198,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
|
||||
}
|
||||
|
||||
// short circuit if the fragment is in the vocabulary
|
||||
if id := bpe.Vocabulary.Encode(sb.String()); id >= 0 {
|
||||
if id := bpe.vocab.Encode(sb.String()); id >= 0 {
|
||||
ids = append(ids, id)
|
||||
slog.Debug("encoded", "text", sb.String(), "ids", []int32{id})
|
||||
continue
|
||||
@ -219,7 +220,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
|
||||
}
|
||||
|
||||
left, right := string(merges[a].runes), string(merges[b].runes)
|
||||
rank := bpe.Vocabulary.Merge(left, right)
|
||||
rank := bpe.vocab.Merge(left, right)
|
||||
if rank < 0 {
|
||||
return nil
|
||||
}
|
||||
@ -271,7 +272,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
|
||||
for _, merge := range merges {
|
||||
if len(merge.runes) > 0 {
|
||||
// TODO: handle the edge case where the rune isn't in the vocabulary
|
||||
if id := bpe.Vocabulary.Encode(string(merge.runes)); id >= 0 {
|
||||
if id := bpe.vocab.Encode(string(merge.runes)); id >= 0 {
|
||||
ids = append(ids, id)
|
||||
slog.Debug("encoded", "text", string(merge.runes), "ids", []int32{id})
|
||||
}
|
||||
@ -286,7 +287,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
|
||||
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
|
||||
var sb strings.Builder
|
||||
for _, id := range ids {
|
||||
for _, r := range bpe.Vocabulary.Decode(id) {
|
||||
for _, r := range bpe.vocab.Decode(id) {
|
||||
switch {
|
||||
case r == 0x0100:
|
||||
// this produces 0x00 aka NULL
|
||||
|
247
model/process_text_test.go
Normal file
247
model/process_text_test.go
Normal file
@ -0,0 +1,247 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
func llama(t testing.TB) BytePairEncoding {
|
||||
t.Helper()
|
||||
|
||||
f, err := os.Open(filepath.Join("testdata", "llama3.2", "encoder.json"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
vocab := make(map[string]int32)
|
||||
if err := json.NewDecoder(f).Decode(&vocab); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
types := make([]uint32, len(vocab))
|
||||
tokens := make([]string, len(vocab))
|
||||
for token, id := range vocab {
|
||||
tokens[id] = token
|
||||
types[id] = 1
|
||||
}
|
||||
|
||||
for _, token := range []string{"<|begin_of_text|>", "<|end_of_text|>"} {
|
||||
if _, ok := vocab[token]; !ok {
|
||||
tokens = append(tokens, token) //nolint:makezero
|
||||
types = append(types, 3) //nolint:makezero
|
||||
vocab[token] = int32(len(vocab))
|
||||
}
|
||||
}
|
||||
|
||||
f, err = os.Open(filepath.Join("testdata", "llama3.2", "vocab.bpe"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
merges := make([]string, 0, 50000)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
if !strings.HasPrefix(scanner.Text(), "#") {
|
||||
merges = append(merges, scanner.Text())
|
||||
}
|
||||
}
|
||||
|
||||
return NewBytePairEncoding(
|
||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||
&Vocabulary{
|
||||
Values: tokens,
|
||||
Types: types,
|
||||
Merges: merges,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
func TestLlama(t *testing.T) {
|
||||
tokenizer := llama(t)
|
||||
|
||||
t.Run("simple", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ids, err := tokenizer.Encode("hello world")
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff([]int32{15339, 1917}, ids); diff != "" {
|
||||
t.Errorf("no match (-theirs +ours):\n%s", diff)
|
||||
}
|
||||
|
||||
s, err := tokenizer.Decode([]int32{15339, 1917})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if s != "hello world" {
|
||||
t.Errorf("got %q, want hello world", s)
|
||||
}
|
||||
|
||||
ids, err = tokenizer.Encode("hello <|end_of_text|>")
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff([]int32{15339, 220, 128001}, ids); diff != "" {
|
||||
t.Errorf("no match (-theirs +ours):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("simple repeated", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := map[string][]int32{
|
||||
strings.Repeat("0", 1): {15},
|
||||
strings.Repeat("0", 2): {410},
|
||||
strings.Repeat("0", 3): {931},
|
||||
strings.Repeat("0", 4): {931, 15},
|
||||
strings.Repeat("0", 5): {931, 410},
|
||||
strings.Repeat("0", 6): {931, 931},
|
||||
strings.Repeat("0", 7): {931, 931, 15},
|
||||
strings.Repeat("0", 8): {931, 931, 410},
|
||||
strings.Repeat("0", 9): {931, 931, 931},
|
||||
strings.Repeat("0", 10): {931, 931, 931, 15},
|
||||
strings.Repeat("0", 11): {931, 931, 931, 410},
|
||||
strings.Repeat("0", 12): {931, 931, 931, 931},
|
||||
strings.Repeat("0", 13): {931, 931, 931, 931, 15},
|
||||
strings.Repeat("0", 14): {931, 931, 931, 931, 410},
|
||||
strings.Repeat("0", 15): {931, 931, 931, 931, 931},
|
||||
strings.Repeat("0", 16): {931, 931, 931, 931, 931, 15},
|
||||
strings.Repeat("0", 17): {931, 931, 931, 931, 931, 410},
|
||||
}
|
||||
|
||||
for s, want := range cases {
|
||||
ids, err := tokenizer.Encode(s)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(want, ids); diff != "" {
|
||||
t.Errorf("%q no match (-theirs +ours):\n%s", s, diff)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("basic roundtrip", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []string{
|
||||
"hello",
|
||||
"hello ",
|
||||
"hello ",
|
||||
" hello",
|
||||
" hello ",
|
||||
" hello ",
|
||||
"hello world",
|
||||
"请考试我的软件!12345",
|
||||
}
|
||||
|
||||
for _, want := range cases {
|
||||
ids, err := tokenizer.Encode(want)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if got, err := tokenizer.Decode(ids); err != nil {
|
||||
t.Fatal(err)
|
||||
} else if got != want {
|
||||
t.Errorf("got %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("special", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := map[string][]int32{
|
||||
"<|begin_of_text|>A B!": {128000, 32, 426, 0},
|
||||
"<|begin_of_text|>A<|end_of_text|>B!": {128000, 32, 128001, 33, 0},
|
||||
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!": {128000, 32, 128001, 33, 128000, 0},
|
||||
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!<|end_of_text|>": {128000, 32, 128001, 33, 128000, 0, 128001},
|
||||
}
|
||||
|
||||
for s, want := range cases {
|
||||
ids, err := tokenizer.Encode(s)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if diff := cmp.Diff(want, ids); diff != "" {
|
||||
t.Errorf("no match (-theirs +ours):\n%s", diff)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("split", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := map[string][]string{
|
||||
"Hello World!": {"Hello", " World", "!"},
|
||||
"I'm don't won't": {"I", "'m", " don", "'t", " won", "'t"},
|
||||
"In 2024 there are 366 days": {"In", " ", "202", "4", " there", " are", " ", "366", " days"},
|
||||
"Hello!! ...world": {"Hello", "!!", " ...", "world"},
|
||||
"Hello World": {"Hello", " ", " World"},
|
||||
"Hello\nWorld": {"Hello", "\n", "World"},
|
||||
"Hello, WORLD!! How's it going?": {"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?"},
|
||||
}
|
||||
|
||||
for s, want := range cases {
|
||||
got := slices.Collect(tokenizer.split(s))
|
||||
if diff := cmp.Diff(want, got); diff != "" {
|
||||
t.Errorf("no match (-theirs +ours):\n%s", diff)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func Benchmark(b *testing.B) {
|
||||
tokenizer := llama(b)
|
||||
bts, err := os.ReadFile(filepath.Join("testdata", "war-and-peace.txt"))
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
for i := range 8 {
|
||||
n := min(int(math.Pow10(i)), len(bts))
|
||||
bts := bts[:n]
|
||||
b.Run("encode"+strconv.Itoa(n), func(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_, err := tokenizer.Encode(string(bts))
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("decode"+strconv.Itoa(n), func(b *testing.B) {
|
||||
ids, err := tokenizer.Encode(string(bts))
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_, err := tokenizer.Decode(ids)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
586
model/testdata/inputs.json
vendored
586
model/testdata/inputs.json
vendored
@ -1,586 +0,0 @@
|
||||
[
|
||||
{
|
||||
"base64": "aWVkIDQgwr0gbW9udGhz",
|
||||
"ids": [
|
||||
1142,
|
||||
220,
|
||||
19,
|
||||
220,
|
||||
27154,
|
||||
4038
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "RsO8aHJlcg==",
|
||||
"ids": [
|
||||
37,
|
||||
51853,
|
||||
261
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "",
|
||||
"ids": []
|
||||
},
|
||||
{
|
||||
"base64": "IA==",
|
||||
"ids": [
|
||||
220
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICA=",
|
||||
"ids": [
|
||||
256
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICAg",
|
||||
"ids": [
|
||||
262
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CQ==",
|
||||
"ids": [
|
||||
197
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "Cg==",
|
||||
"ids": [
|
||||
198
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "Cgo=",
|
||||
"ids": [
|
||||
271
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CgoK",
|
||||
"ids": [
|
||||
1432
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CQo=",
|
||||
"ids": [
|
||||
1602
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8gd29ybGQ=",
|
||||
"ids": [
|
||||
9906,
|
||||
1917
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxvIHdvcmxk",
|
||||
"ids": [
|
||||
22691,
|
||||
1917
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8gV29ybGQ=",
|
||||
"ids": [
|
||||
9906,
|
||||
4435
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxvIFdvcmxk",
|
||||
"ids": [
|
||||
22691,
|
||||
4435
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxvIFdvcmxkIQ==",
|
||||
"ids": [
|
||||
22691,
|
||||
4435,
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8sIHdvcmxkIQ==",
|
||||
"ids": [
|
||||
9906,
|
||||
11,
|
||||
1917,
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxvLCB3b3JsZCE=",
|
||||
"ids": [
|
||||
22691,
|
||||
11,
|
||||
1917,
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IHRoaXMgaXMg8J+mmS5jcHA=",
|
||||
"ids": [
|
||||
420,
|
||||
374,
|
||||
11410,
|
||||
99,
|
||||
247,
|
||||
13,
|
||||
11055
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "dzA0OCA3dHVpamsgZHNkZmh1",
|
||||
"ids": [
|
||||
86,
|
||||
23904,
|
||||
220,
|
||||
22,
|
||||
83,
|
||||
2005,
|
||||
42908,
|
||||
11729,
|
||||
3013,
|
||||
17156
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "0L3QtdGJ0L4g0L3QsCDQkdGK0LvQs9Cw0YDRgdC60Lg=",
|
||||
"ids": [
|
||||
79862,
|
||||
102118,
|
||||
13373,
|
||||
64571,
|
||||
34694,
|
||||
3114,
|
||||
112203,
|
||||
80112
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "4Z6A4Z624Z6T4Z+L4Z6P4Z+C4Z6W4Z634Z6f4Z+B4Z6f4Z6i4Z624Z6F4Z6B4Z6b4Z6F4Z+B4Z6J",
|
||||
"ids": [
|
||||
21549,
|
||||
222,
|
||||
98629,
|
||||
241,
|
||||
45358,
|
||||
233,
|
||||
21549,
|
||||
237,
|
||||
45358,
|
||||
224,
|
||||
21549,
|
||||
244,
|
||||
21549,
|
||||
115,
|
||||
21549,
|
||||
253,
|
||||
45358,
|
||||
223,
|
||||
21549,
|
||||
253,
|
||||
21549,
|
||||
95,
|
||||
98629,
|
||||
227,
|
||||
21549,
|
||||
223,
|
||||
21549,
|
||||
249,
|
||||
21549,
|
||||
227,
|
||||
45358,
|
||||
223,
|
||||
21549,
|
||||
231
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "8J+agCAobm9ybWFsKSDwn5i24oCN8J+Mq++4jyAobXVsdGlwbGUgZW1vamlzIGNvbmNhdGVuYXRlZCkg4pyFIChvbmx5IGVtb2ppIHRoYXQgaGFzIGl0cyBvd24gdG9rZW4p",
|
||||
"ids": [
|
||||
9468,
|
||||
248,
|
||||
222,
|
||||
320,
|
||||
8416,
|
||||
8,
|
||||
27623,
|
||||
114,
|
||||
102470,
|
||||
9468,
|
||||
234,
|
||||
104,
|
||||
31643,
|
||||
320,
|
||||
36773,
|
||||
100166,
|
||||
98634,
|
||||
8,
|
||||
26602,
|
||||
227,
|
||||
320,
|
||||
3323,
|
||||
43465,
|
||||
430,
|
||||
706,
|
||||
1202,
|
||||
1866,
|
||||
4037,
|
||||
8
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8=",
|
||||
"ids": [
|
||||
9906
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxv",
|
||||
"ids": [
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICBIZWxsbw==",
|
||||
"ids": [
|
||||
220,
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICAgSGVsbG8=",
|
||||
"ids": [
|
||||
256,
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICAgIEhlbGxv",
|
||||
"ids": [
|
||||
262,
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICAgIEhlbGxvCiAgICBIZWxsbw==",
|
||||
"ids": [
|
||||
262,
|
||||
22691,
|
||||
198,
|
||||
262,
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICg=",
|
||||
"ids": [
|
||||
320
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CiA9",
|
||||
"ids": [
|
||||
198,
|
||||
284
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "JyBlcmE=",
|
||||
"ids": [
|
||||
6,
|
||||
11639
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8sIHknYWxsISBIb3cgYXJlIHlvdSDwn5iBID/miJHmg7PlnKhhcHBsZeW3peS9nDEzMTQxNTHlpKnvvZ4=",
|
||||
"ids": [
|
||||
9906,
|
||||
11,
|
||||
379,
|
||||
65948,
|
||||
0,
|
||||
2650,
|
||||
527,
|
||||
499,
|
||||
27623,
|
||||
223,
|
||||
949,
|
||||
37046,
|
||||
101067,
|
||||
19000,
|
||||
23182,
|
||||
102301,
|
||||
9263,
|
||||
18136,
|
||||
16,
|
||||
36827,
|
||||
21909
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ISEhISEh",
|
||||
"ids": [
|
||||
17523,
|
||||
3001
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "Mw==",
|
||||
"ids": [
|
||||
18
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzM=",
|
||||
"ids": [
|
||||
1644
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMz",
|
||||
"ids": [
|
||||
8765
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMw==",
|
||||
"ids": [
|
||||
8765,
|
||||
18
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzM=",
|
||||
"ids": [
|
||||
8765,
|
||||
1644
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzMz",
|
||||
"ids": [
|
||||
8765,
|
||||
8765
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzMzMw==",
|
||||
"ids": [
|
||||
8765,
|
||||
8765,
|
||||
18
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzMzMzM=",
|
||||
"ids": [
|
||||
8765,
|
||||
8765,
|
||||
1644
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzMzMzMz",
|
||||
"ids": [
|
||||
8765,
|
||||
8765,
|
||||
8765
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "Q+G7rWEgVmnhu4d0",
|
||||
"ids": [
|
||||
34,
|
||||
91163,
|
||||
101798
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IGRpc2NhcmRz",
|
||||
"ids": [
|
||||
2624,
|
||||
2402
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CiAKCiAKCgogCSAJCSAJCiAgCiAgIAogICAgCiAgICAgCvCfmoAgKG5vcm1hbCkg8J+YtuKAjfCfjKvvuI8gKG11bHRpcGxlIGVtb2ppcyBjb25jYXRlbmF0ZWQpIOKchSDwn6aZ8J+mmSAzIDMzIDMzMyAzMzMzIDMzMzMzIDMzMzMzMyAzMzMzMzMzIDMzMzMzMzMzIDMuMyAzLi4zIDMuLi4zIOGegOGetuGek+Gfi+Gej+GfguGeluGet+Gen+GfgeGen+GeouGetuGehfCfmIEgP+aIkeaDs+WcqGFwcGxl5bel5L2cMTMxNDE1MeWkqe+9niAtLS0tLS09PT09PT09INC90LXRidC+INC90LAg0JHRitC70LPQsNGA0YHQutC4ICcnJycnJ2BgYGBgYGAiIiIiLi4uLi4uISEhISEhPz8/Pz8/IEkndmUgYmVlbiAndG9sZCBoZSdzIHRoZXJlLCAnUkUgeW91IHN1cmU/ICdNIG5vdCBzdXJlIEknbGwgbWFrZSBpdCwgJ0QgeW91IGxpa2Ugc29tZSB0ZWE/IFdlJ1ZlIGEnbEw=",
|
||||
"ids": [
|
||||
198,
|
||||
4815,
|
||||
15073,
|
||||
66597,
|
||||
8004,
|
||||
1602,
|
||||
2355,
|
||||
79772,
|
||||
11187,
|
||||
9468,
|
||||
248,
|
||||
222,
|
||||
320,
|
||||
8416,
|
||||
8,
|
||||
27623,
|
||||
114,
|
||||
102470,
|
||||
9468,
|
||||
234,
|
||||
104,
|
||||
31643,
|
||||
320,
|
||||
36773,
|
||||
100166,
|
||||
98634,
|
||||
8,
|
||||
26602,
|
||||
227,
|
||||
11410,
|
||||
99,
|
||||
247,
|
||||
9468,
|
||||
99,
|
||||
247,
|
||||
220,
|
||||
18,
|
||||
220,
|
||||
1644,
|
||||
220,
|
||||
8765,
|
||||
220,
|
||||
8765,
|
||||
18,
|
||||
220,
|
||||
8765,
|
||||
1644,
|
||||
220,
|
||||
8765,
|
||||
8765,
|
||||
220,
|
||||
8765,
|
||||
8765,
|
||||
18,
|
||||
220,
|
||||
8765,
|
||||
8765,
|
||||
1644,
|
||||
220,
|
||||
18,
|
||||
13,
|
||||
18,
|
||||
220,
|
||||
18,
|
||||
497,
|
||||
18,
|
||||
220,
|
||||
18,
|
||||
1131,
|
||||
18,
|
||||
220,
|
||||
21549,
|
||||
222,
|
||||
98629,
|
||||
241,
|
||||
45358,
|
||||
233,
|
||||
21549,
|
||||
237,
|
||||
45358,
|
||||
224,
|
||||
21549,
|
||||
244,
|
||||
21549,
|
||||
115,
|
||||
21549,
|
||||
253,
|
||||
45358,
|
||||
223,
|
||||
21549,
|
||||
253,
|
||||
21549,
|
||||
95,
|
||||
98629,
|
||||
227,
|
||||
76460,
|
||||
223,
|
||||
949,
|
||||
37046,
|
||||
101067,
|
||||
19000,
|
||||
23182,
|
||||
102301,
|
||||
9263,
|
||||
18136,
|
||||
16,
|
||||
36827,
|
||||
21909,
|
||||
56560,
|
||||
54337,
|
||||
19175,
|
||||
102118,
|
||||
13373,
|
||||
64571,
|
||||
34694,
|
||||
3114,
|
||||
112203,
|
||||
80112,
|
||||
3436,
|
||||
106451,
|
||||
14196,
|
||||
14196,
|
||||
74694,
|
||||
3089,
|
||||
3089,
|
||||
29249,
|
||||
17523,
|
||||
3001,
|
||||
27708,
|
||||
7801,
|
||||
358,
|
||||
3077,
|
||||
1027,
|
||||
364,
|
||||
83,
|
||||
820,
|
||||
568,
|
||||
596,
|
||||
1070,
|
||||
11,
|
||||
364,
|
||||
793,
|
||||
499,
|
||||
2771,
|
||||
30,
|
||||
364,
|
||||
44,
|
||||
539,
|
||||
2771,
|
||||
358,
|
||||
3358,
|
||||
1304,
|
||||
433,
|
||||
11,
|
||||
364,
|
||||
35,
|
||||
499,
|
||||
1093,
|
||||
1063,
|
||||
15600,
|
||||
30,
|
||||
1226,
|
||||
6,
|
||||
43712,
|
||||
264,
|
||||
64966,
|
||||
43
|
||||
]
|
||||
}
|
||||
]
|
128002
model/testdata/llama3.2/encoder.json
vendored
Normal file
128002
model/testdata/llama3.2/encoder.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
280147
model/testdata/llama3.2/vocab.bpe
vendored
Normal file
280147
model/testdata/llama3.2/vocab.bpe
vendored
Normal file
File diff suppressed because it is too large
Load Diff
63845
model/testdata/war-and-peace.txt
vendored
Normal file
63845
model/testdata/war-and-peace.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user