refactor process text tests

Michael Yang 2025-01-29 15:29:30 -08:00
parent 624bfb0b11
commit 109ad1da0f
12 changed files with 472282 additions and 731 deletions


@@ -29,16 +29,16 @@ type Model struct {
func New(c ml.Config) (model.Model, error) {
return &Model{
BytePairEncoding: model.BytePairEncoding{
Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
Vocabulary: &model.Vocabulary{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
},
},
),
Layers: make([]Layer, c.Uint("block_count")),
Options: &Options{
hiddenSize: int64(c.Uint("embedding_length")),


@@ -8,6 +8,7 @@ import (
type Model struct {
model.Base
model.BytePairEncoding
*VisionModel `gguf:"v,vision"`
*TextModel
@@ -15,14 +16,22 @@ type Model struct {
Projector *nn.Linear `gguf:"mm.0"`
ImageProcessor
TextProcessor
}
func New(c ml.Config) (model.Model, error) {
return &Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
},
),
ImageProcessor: newImageProcessor(c),
VisionModel: newVisionModel(c),
TextProcessor: newTextProcessor(c),
TextModel: newTextModel(c),
}, nil
}


@@ -1,25 +0,0 @@
package mllama
import (
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
)
type TextProcessor struct {
model.BytePairEncoding
}
func newTextProcessor(c ml.Config) TextProcessor {
return TextProcessor{
BytePairEncoding: model.BytePairEncoding{
Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
Vocabulary: &model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
},
},
}
}


@@ -1,87 +0,0 @@
package mllama
import (
"encoding/json"
"errors"
"os"
"path/filepath"
"strconv"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/ollama/ollama/model"
)
func TestProcessText(t *testing.T) {
ours, err := model.New(filepath.Join("testdata", "model.bin"))
if errors.Is(err, os.ErrNotExist) {
t.Skip("no model.bin")
} else if err != nil {
t.Fatal(err)
}
t.Run("decode", func(t *testing.T) {
f, err := os.Open(filepath.Join("testdata", "theirs.json"))
if errors.Is(err, os.ErrNotExist) {
t.Skip("no theirs.json")
} else if err != nil {
t.Fatal(err)
}
defer f.Close()
var theirs [][]byte
if err := json.NewDecoder(f).Decode(&theirs); err != nil {
t.Fatal(err)
}
for id := range theirs {
ids := []int32{int32(id)}
s, err := ours.(model.TextProcessor).Decode(ids)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(string(theirs[id]), s); diff != "" {
t.Errorf("%d no match (-theirs +ours):\n%s", id, diff)
}
}
})
t.Run("encode", func(t *testing.T) {
f, err := os.Open(filepath.Join("..", "testdata", "inputs.json"))
if errors.Is(err, os.ErrNotExist) {
t.Skip("no inputs.json")
} else if err != nil {
t.Fatal(err)
}
defer f.Close()
var inputs []struct {
Values []byte `json:"base64"`
IDs []int32 `json:"ids"`
}
if err := json.NewDecoder(f).Decode(&inputs); err != nil {
t.Fatal(err)
}
for i, input := range inputs {
if i == 45 {
t.Skip("skip 45")
}
t.Run(strconv.Itoa(i), func(t *testing.T) {
ids, err := ours.(model.TextProcessor).Encode(string(input.Values))
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(input.IDs, ids, cmpopts.EquateEmpty()); diff != "" {
t.Errorf("%s: no match (-theirs +ours):\n%s", input.Values, diff)
}
})
}
})
}


@@ -1 +0,0 @@
/Users/michaelyang/git/ollama/library/nltpt/Llama-3.2-11B-Vision-Instruct/merged.gguf

File diff suppressed because one or more lines are too long


@@ -2,6 +2,7 @@ package model
import (
"cmp"
"iter"
"log/slog"
"strings"
"sync"
@@ -99,23 +100,29 @@ func (v *Vocabulary) Merge(left, right string) int {
}
type BytePairEncoding struct {
Pretokenizer string
*Vocabulary
pre *regexp2.Regexp
vocab *Vocabulary
}
func (bpe BytePairEncoding) split(s string) ([]string, error) {
re, err := regexp2.Compile(bpe.Pretokenizer, regexp2.Unicode|regexp2.RE2)
if err != nil {
return nil, err
func NewBytePairEncoding(pre string, vocab *Vocabulary) BytePairEncoding {
return BytePairEncoding{
pre: regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
vocab: vocab,
}
}
var matches []string
for m, _ := re.FindStringMatch(s); m != nil; m, _ = re.FindNextMatch(m) {
matches = append(matches, m.String())
func (bpe BytePairEncoding) Is(id uint32, special Special) bool {
return bpe.vocab.Is(id, special)
}
func (bpe *BytePairEncoding) split(s string) iter.Seq[string] {
return func(yield func(string) bool) {
for m, _ := bpe.pre.FindStringMatch(s); m != nil; m, _ = bpe.pre.FindNextMatch(m) {
if !yield(m.String()) {
break
}
}
}
return matches, nil
}
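A note on the refactor above: the pretokenizer regex is now compiled once in NewBytePairEncoding via regexp2.MustCompile, so split no longer returns an error, and it yields matches as an iter.Seq[string] instead of building an intermediate slice. A minimal sketch of consuming such an iterator from inside the model package (slices.Collect is what the new test below uses; bpe here stands for any BytePairEncoding value):

for match := range bpe.split("Hello, world!") {
	_ = match // each pretokenizer match, yielded lazily
}
// or collect every match at once (slices.Collect, Go 1.23+):
parts := slices.Collect(bpe.split("Hello, world!"))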
// fragment is a string fragment and its corresponding token IDs
@@ -138,9 +145,9 @@ type merge struct {
func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
fragments := []fragment{{value: s}}
for _, special := range bpe.Vocabulary.SpecialVocabulary() {
for _, special := range bpe.vocab.SpecialVocabulary() {
// TODO: process special tokens concurrently
id := bpe.Vocabulary.Encode(special)
id := bpe.vocab.Encode(special)
for i := 0; i < len(fragments); i++ {
frag := fragments[i]
if len(frag.ids) > 0 {
@@ -173,13 +180,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
continue
}
// split fragment using pretokenizer
splits, err := bpe.split(frag.value)
if err != nil {
return nil, err
}
for _, split := range splits {
for split := range bpe.split(frag.value) {
// TODO: process splits concurrently
var sb strings.Builder
for _, b := range []byte(split) {
@@ -197,7 +198,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
}
// short circuit if the fragment is in the vocabulary
if id := bpe.Vocabulary.Encode(sb.String()); id >= 0 {
if id := bpe.vocab.Encode(sb.String()); id >= 0 {
ids = append(ids, id)
slog.Debug("encoded", "text", sb.String(), "ids", []int32{id})
continue
@@ -219,7 +220,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
}
left, right := string(merges[a].runes), string(merges[b].runes)
rank := bpe.Vocabulary.Merge(left, right)
rank := bpe.vocab.Merge(left, right)
if rank < 0 {
return nil
}
@@ -271,7 +272,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
for _, merge := range merges {
if len(merge.runes) > 0 {
// TODO: handle the edge case where the rune isn't in the vocabulary
if id := bpe.Vocabulary.Encode(string(merge.runes)); id >= 0 {
if id := bpe.vocab.Encode(string(merge.runes)); id >= 0 {
ids = append(ids, id)
slog.Debug("encoded", "text", string(merge.runes), "ids", []int32{id})
}
@@ -286,7 +287,7 @@ func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
var sb strings.Builder
for _, id := range ids {
for _, r := range bpe.Vocabulary.Decode(id) {
for _, r := range bpe.vocab.Decode(id) {
switch {
case r == 0x0100:
// this produces 0x00 aka NULL
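For context, a minimal sketch of how a caller outside the package uses the refactored API shown above (it imports github.com/ollama/ollama/model; the helper name is illustrative, and tokens, types, and merges are placeholders for real vocabulary data):

func tokenizeRoundTrip(tokens []string, types []uint32, merges []string, text string) ([]int32, string, error) {
	bpe := model.NewBytePairEncoding(
		`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
		&model.Vocabulary{Values: tokens, Types: types, Merges: merges},
	)

	ids, err := bpe.Encode(text) // text -> token IDs
	if err != nil {
		return nil, "", err
	}

	s, err := bpe.Decode(ids) // token IDs -> text
	return ids, s, err
}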

model/process_text_test.go (new file, 247 lines)

@@ -0,0 +1,247 @@
package model
import (
"bufio"
"encoding/json"
"math"
"os"
"path/filepath"
"slices"
"strconv"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
func llama(t testing.TB) BytePairEncoding {
t.Helper()
f, err := os.Open(filepath.Join("testdata", "llama3.2", "encoder.json"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
vocab := make(map[string]int32)
if err := json.NewDecoder(f).Decode(&vocab); err != nil {
t.Fatal(err)
}
types := make([]uint32, len(vocab))
tokens := make([]string, len(vocab))
for token, id := range vocab {
tokens[id] = token
types[id] = 1
}
for _, token := range []string{"<|begin_of_text|>", "<|end_of_text|>"} {
if _, ok := vocab[token]; !ok {
tokens = append(tokens, token) //nolint:makezero
types = append(types, 3) //nolint:makezero
vocab[token] = int32(len(vocab))
}
}
f, err = os.Open(filepath.Join("testdata", "llama3.2", "vocab.bpe"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
merges := make([]string, 0, 50000)
scanner := bufio.NewScanner(f)
for scanner.Scan() {
if !strings.HasPrefix(scanner.Text(), "#") {
merges = append(merges, scanner.Text())
}
}
return NewBytePairEncoding(
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
&Vocabulary{
Values: tokens,
Types: types,
Merges: merges,
},
)
}
func TestLlama(t *testing.T) {
tokenizer := llama(t)
t.Run("simple", func(t *testing.T) {
t.Parallel()
ids, err := tokenizer.Encode("hello world")
if err != nil {
t.Error(err)
}
if diff := cmp.Diff([]int32{15339, 1917}, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
s, err := tokenizer.Decode([]int32{15339, 1917})
if err != nil {
t.Fatal(err)
}
if s != "hello world" {
t.Errorf("got %q, want hello world", s)
}
ids, err = tokenizer.Encode("hello <|end_of_text|>")
if err != nil {
t.Error(err)
}
if diff := cmp.Diff([]int32{15339, 220, 128001}, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
})
t.Run("simple repeated", func(t *testing.T) {
t.Parallel()
cases := map[string][]int32{
strings.Repeat("0", 1): {15},
strings.Repeat("0", 2): {410},
strings.Repeat("0", 3): {931},
strings.Repeat("0", 4): {931, 15},
strings.Repeat("0", 5): {931, 410},
strings.Repeat("0", 6): {931, 931},
strings.Repeat("0", 7): {931, 931, 15},
strings.Repeat("0", 8): {931, 931, 410},
strings.Repeat("0", 9): {931, 931, 931},
strings.Repeat("0", 10): {931, 931, 931, 15},
strings.Repeat("0", 11): {931, 931, 931, 410},
strings.Repeat("0", 12): {931, 931, 931, 931},
strings.Repeat("0", 13): {931, 931, 931, 931, 15},
strings.Repeat("0", 14): {931, 931, 931, 931, 410},
strings.Repeat("0", 15): {931, 931, 931, 931, 931},
strings.Repeat("0", 16): {931, 931, 931, 931, 931, 15},
strings.Repeat("0", 17): {931, 931, 931, 931, 931, 410},
}
for s, want := range cases {
ids, err := tokenizer.Encode(s)
if err != nil {
t.Error(err)
}
if diff := cmp.Diff(want, ids); diff != "" {
t.Errorf("%q no match (-theirs +ours):\n%s", s, diff)
}
}
})
t.Run("basic roundtrip", func(t *testing.T) {
t.Parallel()
cases := []string{
"hello",
"hello ",
"hello ",
" hello",
" hello ",
" hello ",
"hello world",
"请考试我的软件12345",
}
for _, want := range cases {
ids, err := tokenizer.Encode(want)
if err != nil {
t.Error(err)
}
if got, err := tokenizer.Decode(ids); err != nil {
t.Fatal(err)
} else if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
})
t.Run("special", func(t *testing.T) {
t.Parallel()
cases := map[string][]int32{
"<|begin_of_text|>A B!": {128000, 32, 426, 0},
"<|begin_of_text|>A<|end_of_text|>B!": {128000, 32, 128001, 33, 0},
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!": {128000, 32, 128001, 33, 128000, 0},
"<|begin_of_text|>A<|end_of_text|>B<|begin_of_text|>!<|end_of_text|>": {128000, 32, 128001, 33, 128000, 0, 128001},
}
for s, want := range cases {
ids, err := tokenizer.Encode(s)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(want, ids); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
}
})
t.Run("split", func(t *testing.T) {
t.Parallel()
cases := map[string][]string{
"Hello World!": {"Hello", " World", "!"},
"I'm don't won't": {"I", "'m", " don", "'t", " won", "'t"},
"In 2024 there are 366 days": {"In", " ", "202", "4", " there", " are", " ", "366", " days"},
"Hello!! ...world": {"Hello", "!!", " ...", "world"},
"Hello World": {"Hello", " ", " World"},
"Hello\nWorld": {"Hello", "\n", "World"},
"Hello, WORLD!! How's it going?": {"Hello", ",", " WORLD", "!!", " How", "'s", " it", " going", "?"},
}
for s, want := range cases {
got := slices.Collect(tokenizer.split(s))
if diff := cmp.Diff(want, got); diff != "" {
t.Errorf("no match (-theirs +ours):\n%s", diff)
}
}
})
}
func Benchmark(b *testing.B) {
tokenizer := llama(b)
bts, err := os.ReadFile(filepath.Join("testdata", "war-and-peace.txt"))
if err != nil {
b.Fatal(err)
}
for i := range 8 {
n := min(int(math.Pow10(i)), len(bts))
bts := bts[:n]
b.Run("encode"+strconv.Itoa(n), func(b *testing.B) {
b.ResetTimer()
for range b.N {
_, err := tokenizer.Encode(string(bts))
if err != nil {
b.Fatal(err)
}
}
})
b.Run("decode"+strconv.Itoa(n), func(b *testing.B) {
ids, err := tokenizer.Encode(string(bts))
if err != nil {
b.Fatal(err)
}
b.ResetTimer()
for range b.N {
_, err := tokenizer.Decode(ids)
if err != nil {
b.Fatal(err)
}
}
})
}
}


@@ -1,586 +0,0 @@
[
{
"base64": "aWVkIDQgwr0gbW9udGhz",
"ids": [
1142,
220,
19,
220,
27154,
4038
]
},
{
"base64": "RsO8aHJlcg==",
"ids": [
37,
51853,
261
]
},
{
"base64": "",
"ids": []
},
{
"base64": "IA==",
"ids": [
220
]
},
{
"base64": "ICA=",
"ids": [
256
]
},
{
"base64": "ICAg",
"ids": [
262
]
},
{
"base64": "CQ==",
"ids": [
197
]
},
{
"base64": "Cg==",
"ids": [
198
]
},
{
"base64": "Cgo=",
"ids": [
271
]
},
{
"base64": "CgoK",
"ids": [
1432
]
},
{
"base64": "CQo=",
"ids": [
1602
]
},
{
"base64": "SGVsbG8gd29ybGQ=",
"ids": [
9906,
1917
]
},
{
"base64": "IEhlbGxvIHdvcmxk",
"ids": [
22691,
1917
]
},
{
"base64": "SGVsbG8gV29ybGQ=",
"ids": [
9906,
4435
]
},
{
"base64": "IEhlbGxvIFdvcmxk",
"ids": [
22691,
4435
]
},
{
"base64": "IEhlbGxvIFdvcmxkIQ==",
"ids": [
22691,
4435,
0
]
},
{
"base64": "SGVsbG8sIHdvcmxkIQ==",
"ids": [
9906,
11,
1917,
0
]
},
{
"base64": "IEhlbGxvLCB3b3JsZCE=",
"ids": [
22691,
11,
1917,
0
]
},
{
"base64": "IHRoaXMgaXMg8J+mmS5jcHA=",
"ids": [
420,
374,
11410,
99,
247,
13,
11055
]
},
{
"base64": "dzA0OCA3dHVpamsgZHNkZmh1",
"ids": [
86,
23904,
220,
22,
83,
2005,
42908,
11729,
3013,
17156
]
},
{
"base64": "0L3QtdGJ0L4g0L3QsCDQkdGK0LvQs9Cw0YDRgdC60Lg=",
"ids": [
79862,
102118,
13373,
64571,
34694,
3114,
112203,
80112
]
},
{
"base64": "4Z6A4Z624Z6T4Z+L4Z6P4Z+C4Z6W4Z634Z6f4Z+B4Z6f4Z6i4Z624Z6F4Z6B4Z6b4Z6F4Z+B4Z6J",
"ids": [
21549,
222,
98629,
241,
45358,
233,
21549,
237,
45358,
224,
21549,
244,
21549,
115,
21549,
253,
45358,
223,
21549,
253,
21549,
95,
98629,
227,
21549,
223,
21549,
249,
21549,
227,
45358,
223,
21549,
231
]
},
{
"base64": "8J+agCAobm9ybWFsKSDwn5i24oCN8J+Mq++4jyAobXVsdGlwbGUgZW1vamlzIGNvbmNhdGVuYXRlZCkg4pyFIChvbmx5IGVtb2ppIHRoYXQgaGFzIGl0cyBvd24gdG9rZW4p",
"ids": [
9468,
248,
222,
320,
8416,
8,
27623,
114,
102470,
9468,
234,
104,
31643,
320,
36773,
100166,
98634,
8,
26602,
227,
320,
3323,
43465,
430,
706,
1202,
1866,
4037,
8
]
},
{
"base64": "SGVsbG8=",
"ids": [
9906
]
},
{
"base64": "IEhlbGxv",
"ids": [
22691
]
},
{
"base64": "ICBIZWxsbw==",
"ids": [
220,
22691
]
},
{
"base64": "ICAgSGVsbG8=",
"ids": [
256,
22691
]
},
{
"base64": "ICAgIEhlbGxv",
"ids": [
262,
22691
]
},
{
"base64": "ICAgIEhlbGxvCiAgICBIZWxsbw==",
"ids": [
262,
22691,
198,
262,
22691
]
},
{
"base64": "ICg=",
"ids": [
320
]
},
{
"base64": "CiA9",
"ids": [
198,
284
]
},
{
"base64": "JyBlcmE=",
"ids": [
6,
11639
]
},
{
"base64": "SGVsbG8sIHknYWxsISBIb3cgYXJlIHlvdSDwn5iBID/miJHmg7PlnKhhcHBsZeW3peS9nDEzMTQxNTHlpKnvvZ4=",
"ids": [
9906,
11,
379,
65948,
0,
2650,
527,
499,
27623,
223,
949,
37046,
101067,
19000,
23182,
102301,
9263,
18136,
16,
36827,
21909
]
},
{
"base64": "ISEhISEh",
"ids": [
17523,
3001
]
},
{
"base64": "Mw==",
"ids": [
18
]
},
{
"base64": "MzM=",
"ids": [
1644
]
},
{
"base64": "MzMz",
"ids": [
8765
]
},
{
"base64": "MzMzMw==",
"ids": [
8765,
18
]
},
{
"base64": "MzMzMzM=",
"ids": [
8765,
1644
]
},
{
"base64": "MzMzMzMz",
"ids": [
8765,
8765
]
},
{
"base64": "MzMzMzMzMw==",
"ids": [
8765,
8765,
18
]
},
{
"base64": "MzMzMzMzMzM=",
"ids": [
8765,
8765,
1644
]
},
{
"base64": "MzMzMzMzMzMz",
"ids": [
8765,
8765,
8765
]
},
{
"base64": "Q+G7rWEgVmnhu4d0",
"ids": [
34,
91163,
101798
]
},
{
"base64": "IGRpc2NhcmRz",
"ids": [
2624,
2402
]
},
{
"base64": "CiAKCiAKCgogCSAJCSAJCiAgCiAgIAogICAgCiAgICAgCvCfmoAgKG5vcm1hbCkg8J+YtuKAjfCfjKvvuI8gKG11bHRpcGxlIGVtb2ppcyBjb25jYXRlbmF0ZWQpIOKchSDwn6aZ8J+mmSAzIDMzIDMzMyAzMzMzIDMzMzMzIDMzMzMzMyAzMzMzMzMzIDMzMzMzMzMzIDMuMyAzLi4zIDMuLi4zIOGegOGetuGek+Gfi+Gej+GfguGeluGet+Gen+GfgeGen+GeouGetuGehfCfmIEgP+aIkeaDs+WcqGFwcGxl5bel5L2cMTMxNDE1MeWkqe+9niAtLS0tLS09PT09PT09INC90LXRidC+INC90LAg0JHRitC70LPQsNGA0YHQutC4ICcnJycnJ2BgYGBgYGAiIiIiLi4uLi4uISEhISEhPz8/Pz8/IEkndmUgYmVlbiAndG9sZCBoZSdzIHRoZXJlLCAnUkUgeW91IHN1cmU/ICdNIG5vdCBzdXJlIEknbGwgbWFrZSBpdCwgJ0QgeW91IGxpa2Ugc29tZSB0ZWE/IFdlJ1ZlIGEnbEw=",
"ids": [
198,
4815,
15073,
66597,
8004,
1602,
2355,
79772,
11187,
9468,
248,
222,
320,
8416,
8,
27623,
114,
102470,
9468,
234,
104,
31643,
320,
36773,
100166,
98634,
8,
26602,
227,
11410,
99,
247,
9468,
99,
247,
220,
18,
220,
1644,
220,
8765,
220,
8765,
18,
220,
8765,
1644,
220,
8765,
8765,
220,
8765,
8765,
18,
220,
8765,
8765,
1644,
220,
18,
13,
18,
220,
18,
497,
18,
220,
18,
1131,
18,
220,
21549,
222,
98629,
241,
45358,
233,
21549,
237,
45358,
224,
21549,
244,
21549,
115,
21549,
253,
45358,
223,
21549,
253,
21549,
95,
98629,
227,
76460,
223,
949,
37046,
101067,
19000,
23182,
102301,
9263,
18136,
16,
36827,
21909,
56560,
54337,
19175,
102118,
13373,
64571,
34694,
3114,
112203,
80112,
3436,
106451,
14196,
14196,
74694,
3089,
3089,
29249,
17523,
3001,
27708,
7801,
358,
3077,
1027,
364,
83,
820,
568,
596,
1070,
11,
364,
793,
499,
2771,
30,
364,
44,
539,
2771,
358,
3358,
1304,
433,
11,
364,
35,
499,
1093,
1063,
15600,
30,
1226,
6,
43712,
264,
64966,
43
]
}
]

model/testdata/llama3.2/encoder.json (new vendored file, 128002 lines)

File diff suppressed because it is too large

model/testdata/llama3.2/vocab.bpe (new vendored file, 280147 lines)

File diff suppressed because it is too large

model/testdata/war-and-peace.txt (new vendored file, 63845 lines)

File diff suppressed because it is too large