sample: temporarily use grammars for constrained generation in new engine (#9586)

This commit is contained in:
Jeffrey Morgan 2025-03-10 16:17:39 +01:00 committed by GitHub
parent a1cda80bcb
commit e093db92c4
10 changed files with 301 additions and 213 deletions

View File

@ -245,6 +245,20 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
return &m, nil
}
func LoadVocabFromFile(path string) (*Vocab, error) {
mp := C.CString(path)
defer C.free(unsafe.Pointer(mp))
v := Vocab{c: C.llama_load_vocab_from_file(mp)}
if v.c == nil {
return nil, fmt.Errorf("unable to load vocab: %s", path)
}
return &v, nil
}
func FreeVocab(vocab *Vocab) {
C.llama_free_vocab(vocab.c)
}
func FreeModel(model *Model) {
C.llama_model_free(model.c)
}
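For reference, a minimal sketch of how the new vocab helpers pair up on the Go side (the path and error handling are illustrative, not part of this diff):

    vocab, err := llama.LoadVocabFromFile("/path/to/model.gguf")
    if err != nil {
        return err
    }
    // FreeVocab releases the C-side allocation, so defer it once the load succeeds
    defer llama.FreeVocab(vocab)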
@ -293,6 +307,10 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
return nil
}
type Vocab struct {
c *C.struct_llama_vocab
}
func (m *Model) Vocab() *C.struct_llama_vocab {
return C.llama_model_get_vocab(m.c)
}
@ -669,3 +687,53 @@ func SchemaToGrammar(schema []byte) []byte {
}
return buf[:n]
}
type Sampler struct {
c *C.struct_llama_sampler
}
func NewGrammarSampler(vocab *Vocab, grammar string) *Sampler {
cGrammar := C.CString(grammar)
cRoot := C.CString("root")
defer C.free(unsafe.Pointer(cGrammar))
defer C.free(unsafe.Pointer(cRoot))
sampler := &Sampler{c: C.llama_sampler_init_grammar(vocab.c, cGrammar, cRoot)}
return sampler
}
func (s *Sampler) Accept(token int32) {
C.llama_sampler_accept(s.c, C.llama_token(token))
}
type TokenData struct {
Id int32
Logit float32
}
func (s *Sampler) Apply(tokens []TokenData) {
tds := make([]C.struct_llama_token_data, len(tokens))
for i, token := range tokens {
tds[i] = C.struct_llama_token_data{
id: C.int32_t(token.Id),
logit: C.float(token.Logit),
p: C.float(0.0),
}
}
tda := &C.llama_token_data_array{
data: (*C.struct_llama_token_data)(unsafe.Pointer(&tds[0])),
size: C.size_t(len(tokens)),
selected: C.int64_t(-1),
sorted: C.bool(false),
}
var pinner runtime.Pinner
pinner.Pin(&tds[0])
defer pinner.Unpin()
C.llama_sampler_apply(s.c, tda)
for i := range tokens {
tokens[i].Logit = float32(tds[i].logit)
}
}
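A sketch of how the wrapper above is meant to be driven, reusing the vocab from the earlier sketch (the grammar string and token values are invented): Apply rewrites the Logit fields in place, with tokens the grammar rejects coming back as negative infinity, and Accept advances the grammar state once a token is committed.

    s := llama.NewGrammarSampler(vocab, `root ::= "yes" | "no"`)
    tds := []llama.TokenData{{Id: 42, Logit: 1.5}, {Id: 7, Logit: 0.2}}
    s.Apply(tds) // rejected tokens now have Logit == -Inf
    if !math.IsInf(float64(tds[0].Logit), -1) {
        s.Accept(tds[0].Id) // commit the chosen token
    }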

View File

@ -2,6 +2,9 @@
#include "sampling.h"
#include "sampling_ext.h"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "llama-model.h"
#include "llama-model-loader.h"
struct common_sampler *common_sampler_cinit(const struct llama_model *model, struct common_sampler_cparams *params) {
try {
@ -64,3 +67,22 @@ int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len)
return 0;
}
}
struct llama_vocab * llama_load_vocab_from_file(const char * fname) {
llama_vocab * vocab = new llama_vocab();
try {
const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
std::vector<std::string> splits = {};
llama_model_loader ml(std::string(fname), splits, false, false, nullptr);
vocab->load(ml, kv);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
return nullptr;
}
return vocab;
}
void llama_free_vocab(struct llama_vocab * vocab) {
delete vocab;
}

View File

@ -35,6 +35,9 @@ extern "C"
int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len);
struct llama_vocab * llama_load_vocab_from_file(const char * fname);
void llama_free_vocab(struct llama_vocab * vocab);
#ifdef __cplusplus
}
#endif

View File

@ -729,29 +729,24 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
}
if len(req.Format) > 0 {
format := string(req.Format)
if format != `null` && format != `""` {
if s.textProcessor != nil {
// New engine handles this on the backend
request["format"] = req.Format
} else {
// old engine
switch format {
case `"json"`:
request["grammar"] = grammarJSON
default:
if req.Format[0] != '{' {
return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
}
// User provided a JSON schema
g := llama.SchemaToGrammar(req.Format)
if g == nil {
return fmt.Errorf("invalid JSON schema in format")
}
request["grammar"] = string(g)
}
switch string(req.Format) {
case `null`, `""`:
// Field was set, but "missing" a value. We accept
// these as "not set".
break
case `"json"`:
request["grammar"] = grammarJSON
default:
if req.Format[0] != '{' {
return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
}
// User provided a JSON schema
g := llama.SchemaToGrammar(req.Format)
if g == nil {
return fmt.Errorf("invalid JSON schema in format")
}
request["grammar"] = string(g)
}
}
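In practice the format field takes three shapes, which the rewritten switch above makes explicit; a sketch with an invented schema (req.Format is treated as raw JSON bytes, as the req.Format[0] check implies):

    req.Format = []byte(`null`)   // set but empty: accepted as "not set", like `""`
    req.Format = []byte(`"json"`) // maps to the built-in JSON grammar (grammarJSON)
    req.Format = []byte(`{"type":"object","properties":{"answer":{"type":"string"}}}`)
    // any object is converted via llama.SchemaToGrammar and sent as request["grammar"]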

View File

@ -254,6 +254,12 @@ type Server struct {
// multimodalHash generates hashes for comparing equality
// of non-text data
multimodalHash maphash.Hash
// vocab is a llama.cpp vocab required for grammar-based
// constrained generation (json mode, structured outputs)
// TODO: this is temporary until Ollama sampling supports
// constrained generation
vocab *sample.Vocab
}
func (s *Server) allNil() bool {
@ -574,18 +580,25 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
var grammar *sample.Grammar
var err error
if req.Grammar != "" {
grammar, err = sample.NewGrammar(s.vocab, req.Grammar)
if err != nil {
http.Error(w, "failed to load model vocabulary required for format", http.StatusInternalServerError)
return
}
}
sampler := sample.NewSampler(
req.Temperature,
req.TopK,
req.TopP,
req.MinP,
req.Seed,
grammar,
)
if req.Grammar != "" {
panic("grammars are not yet supported")
}
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
numPredict: req.NumPredict,
stop: req.Stop,
@ -797,6 +810,8 @@ func (s *Server) loadModel(
panic(err)
}
s.vocab = sample.NewVocab(mpath)
// TODO(jessegross): LoRA loading
if lpath.String() != "" {
panic("loras are not yet implemented")

View File

@ -2,43 +2,88 @@ package sample
import (
"errors"
"math"
"math/rand/v2"
"slices"
"sync"
"github.com/ollama/ollama/llama"
)
// Sampler is not thread-safe. Each goroutine should have its own instance
type Sampler interface {
Sample([]float32) (int32, error)
}
// logit represents information about a single token during sampling
type logit struct {
// token represents information about a single token during sampling
type token struct {
id int32 // The token's unique identifier
value float32 // The raw logit or probability from the model
}
type weighted struct {
type Sampler struct {
rng *rand.Rand
tokens []logit
topK int
topP float32
minP float32
temperature float32
grammar *Grammar
}
func (s *weighted) Sample(logits []float32) (int32, error) {
if len(s.tokens) < len(logits) {
s.tokens = make([]logit, len(logits))
}
tokens := s.tokens[:len(logits)]
for i, v := range logits {
func (s *Sampler) Sample(logits []float32) (int32, error) {
tokens := make([]token, len(logits))
for i := range logits {
tokens[i].id = int32(i)
tokens[i].value = v
tokens[i].value = logits[i]
}
t, err := s.sample(tokens)
if err != nil {
return -1, err
}
if s.grammar != nil {
// optimization: first check if the max logit is accepted by the grammar
// if the max logit is rejected, apply the grammar to all logits (slower)
top := []token{t}
s.grammar.Apply(top)
if !math.IsInf(float64(top[0].value), -1) {
s.grammar.Accept(top[0].id)
return top[0].id, nil
}
// since .sample has side effects of modifying the tokens
// we need to reset them before applying the grammar and
// sampling again
for i := range logits {
tokens[i].id = int32(i)
tokens[i].value = logits[i]
}
s.grammar.Apply(tokens)
t, err = s.sample(tokens)
if err != nil {
return -1, err
}
s.grammar.Accept(t.id)
}
return t.id, nil
}
// greedy returns the highest probability token from the tokens
func greedy(tokens []token) token {
max := tokens[0]
for i := 1; i < len(tokens); i++ {
if tokens[i].value > max.value {
max = tokens[i]
}
}
return max
}
// sample returns the highest probability token from the tokens
// given sampler parameters. It also has side effects of modifying the tokens
func (s *Sampler) sample(tokens []token) (token, error) {
if s.temperature == 0 {
return greedy(tokens), nil
}
// Tokens are sorted by logits in TopK or SortTokens
if s.topK > 0 {
tokens = topK(tokens, s.topK)
} else {
@ -47,12 +92,14 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
tokens = temperature(tokens, s.temperature)
tokens = softmax(tokens)
tokens = topP(tokens, s.topP)
tokens = minP(tokens, s.minP)
// TODO: this should fall back to greedy sampling
// or topP, topK values etc should be such that
// there are always tokens to sample from
if len(tokens) == 0 {
return -1, errors.New("no valid logits found for weighted sampling")
return token{}, errors.New("no tokens to sample from")
}
var r float32
@ -70,48 +117,18 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
}
r *= tokens[len(tokens)-1].value
idx, _ := slices.BinarySearchFunc(tokens, r, func(token logit, target float32) int {
// Compare cumulative probabilities
idx, _ := slices.BinarySearchFunc(tokens, r, func(token token, target float32) int {
if token.value < target {
return -1
}
// First token that exceeds target
return 1
})
if idx >= len(tokens) {
idx = len(tokens) - 1
}
return tokens[idx].id, nil
}
type greedy struct{}
// Greedy sample returns the index of the maximum value in logits.
func (s greedy) Sample(logits []float32) (int32, error) {
if len(logits) == 0 {
return -1, errors.New("no logits provided for greedy sampling")
}
maxIdx := 0
maxVal := logits[0]
for i := 1; i < len(logits); i++ {
if logits[i] > maxVal {
maxVal = logits[i]
maxIdx = i
}
}
return int32(maxIdx), nil
return tokens[idx], nil
}
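The binary search works because, by this point, token values hold cumulative probabilities. A self-contained worked example with made-up numbers:

    // probabilities [0.5, 0.3, 0.2] accumulate to [0.5, 0.8, 1.0]
    cum := []float32{0.5, 0.8, 1.0}
    r := float32(0.65) // uniform draw scaled by the final cumulative value
    idx, _ := slices.BinarySearchFunc(cum, r, func(v, target float32) int {
        if v < target {
            return -1
        }
        return 1 // first value at or above target
    })
    // idx == 1, since 0.5 <= 0.65 < 0.8

Because the comparator never returns 0, BinarySearchFunc reports the insertion point, which is exactly the first cumulative value at or above r.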
// TODO(parthsareen): update sampler interface to use json unmarshal https://github.com/ollama/ollama/issues/9278
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int) Sampler {
if temperature == 0 {
return &greedy{}
}
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *Grammar) Sampler {
var rng *rand.Rand
if seed != -1 {
// PCG requires two parameters: sequence and stream
@ -120,7 +137,9 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
// Use golden ratio hash to generate statistically independent seeds
rng = rand.New(rand.NewPCG(sequence, sequence^0x9E3779B9))
}
temperature = max(temperature, 1)
if temperature < 0.0 {
temperature = 0.0
}
if topP < 0.0 {
topP = 0.0
@ -136,11 +155,73 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
minP = 1.0
}
return &weighted{
return Sampler{
rng: rng,
topK: topK,
topP: topP,
minP: minP,
temperature: temperature,
grammar: grammar,
}
}
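Every caller now supplies the grammar (or nil) at construction time; a minimal sketch of the new signature in use:

    s := sample.NewSampler(0.8, 40, 0.9, 0.05, 42, nil) // plain weighted sampling, no grammar
    id, err := s.Sample(logits)                         // logits: []float32 from the model

Note that temperature 0 now selects greedily inside Sample itself rather than returning a separate greedy sampler type.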
type Grammar struct {
vocab *Vocab
grammar string
sampler *llama.Sampler
}
func NewGrammar(vocab *Vocab, grammar string) (*Grammar, error) {
v, err := vocab.Load()
if err != nil {
return nil, err
}
return &Grammar{
vocab: vocab,
grammar: grammar,
sampler: llama.NewGrammarSampler(v, grammar),
}, nil
}
func (g *Grammar) Apply(tokens []token) {
tds := make([]llama.TokenData, len(tokens))
for i, token := range tokens {
tds[i].Id = token.id
tds[i].Logit = token.value
}
g.sampler.Apply(tds)
for i := range tokens {
tokens[i].value = tds[i].Logit
}
}
func (g *Grammar) Accept(token int32) {
g.sampler.Accept(token)
}
type Vocab struct {
once sync.Once
vocab *llama.Vocab
err error
path string
}
func NewVocab(path string) *Vocab {
return &Vocab{path: path}
}
// Load returns the lazily-loaded vocabulary
func (v *Vocab) Load() (*llama.Vocab, error) {
v.once.Do(func() {
vocab, err := llama.LoadVocabFromFile(v.path)
if err != nil {
v.err = err
return
}
v.vocab = vocab
})
return v.vocab, v.err
}
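Putting the pieces together, the runner-side wiring looks roughly like this (the path and grammar text are invented; the first NewGrammar call is what triggers the one-time vocab load):

    vocab := sample.NewVocab("/path/to/model.gguf")             // cheap: only records the path
    g, err := sample.NewGrammar(vocab, `root ::= "yes" | "no"`) // loads the vocab on first use
    if err != nil {
        return err
    }
    s := sample.NewSampler(0.7, 40, 0.9, 0.0, -1, g)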

View File

@ -16,13 +16,10 @@ func BenchmarkWeightedSampler(b *testing.B) {
logits[i] = float32(rand.Float64()*10 - 5)
}
sampler := NewSampler(0.8, 0, 0, 0, 42)
sampler := NewSampler(0.8, 0, 0, 0, 42, nil)
b.ResetTimer()
for b.Loop() {
_, err := sampler.Sample(logits)
if err != nil {
b.Fatalf("Sampling failed: %v", err)
}
sampler.Sample(logits)
}
})
}
@ -52,30 +49,24 @@ func BenchmarkWeightedSampler(b *testing.B) {
for _, tc := range configs {
b.Run("Config"+tc.name, func(b *testing.B) {
sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed)
sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed, nil)
sampler.Sample(logits)
b.ResetTimer()
for b.Loop() {
_, err := sampler.Sample(logits)
if err != nil {
b.Fatalf("Sampling failed: %v", err)
}
sampler.Sample(logits)
}
})
}
// Test with combined transforms separately - topK influences performance greatly
b.Run("TransformCombined", func(b *testing.B) {
sampler := NewSampler(0.8, 50, 0.9, 0.05, 42)
sampler := NewSampler(0.8, 50, 0.9, 0.05, 42, nil)
b.ResetTimer()
for b.Loop() {
_, err := sampler.Sample(logits)
if err != nil {
b.Fatalf("Sampling failed: %v", err)
}
sampler.Sample(logits)
}
})
}
@ -90,14 +81,11 @@ func BenchmarkGreedySampler(b *testing.B) {
logits[i] = float32(rand.Float64()*10 - 5)
}
sampler := NewSampler(0, -1, 0, 0, -1)
sampler := NewSampler(0, -1, 0, 0, -1, nil)
b.ResetTimer()
for b.Loop() {
_, err := sampler.Sample(logits)
if err != nil {
b.Fatalf("Sampling failed: %v", err)
}
sampler.Sample(logits)
}
})
}

View File

@ -7,7 +7,7 @@ import (
func TestWeighted(t *testing.T) {
logits := []float32{-10, 3, -10, -10}
sampler := NewSampler(0, 0, 0, 0, 0)
sampler := NewSampler(0, 0, 0, 0, 0, nil)
got, err := sampler.Sample(logits)
if err != nil {
t.Error(err)
@ -19,7 +19,7 @@ func TestWeighted(t *testing.T) {
}
logits = []float32{-100, -10, 0, 10}
sampler = NewSampler(0, 0, 0, 0, 0)
sampler = NewSampler(0, 0, 0, 0, 0, nil)
got, err = sampler.Sample(logits)
if err != nil {
t.Error(err)
@ -31,94 +31,10 @@ func TestWeighted(t *testing.T) {
}
}
func TestNewSampler(t *testing.T) {
tests := []struct {
name string
temperature float32
topK int
topP float32
minP float32
seed int
wantGreedy bool // Instead of wantErr, check if we get greedy sampler
}{
{
name: "temperature",
temperature: 0.5,
wantGreedy: false,
},
{
name: "zero temperature - greedy",
temperature: 0,
wantGreedy: true,
},
{
name: "top k",
temperature: 0.1,
topK: 10,
wantGreedy: false,
},
{
name: "top p",
temperature: 0.1,
topP: 0.9,
wantGreedy: false,
},
{
name: "min p",
temperature: 0.1,
minP: 0.2,
wantGreedy: false,
},
{
name: "seed - weighted",
temperature: 0.1,
seed: 42,
wantGreedy: false,
},
{
name: "default values",
temperature: 0.8,
topK: 40,
topP: 0.9,
minP: 0.0,
seed: 0,
wantGreedy: false,
},
{
name: "all zeroes - greedy",
temperature: 0.0,
topK: 0,
topP: 0.0,
minP: 0.0,
seed: 0,
wantGreedy: true,
},
{
name: "all transforms",
temperature: 0.8,
topK: 50,
topP: 0.95,
minP: 0.1,
seed: 42,
wantGreedy: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
sampler := NewSampler(tt.temperature, tt.topK, tt.topP, tt.minP, tt.seed)
_, isGreedy := sampler.(*greedy)
if isGreedy != tt.wantGreedy {
t.Errorf("NewSampler() got greedy = %v, want %v", isGreedy, tt.wantGreedy)
}
})
}
}
func BenchmarkSample(b *testing.B) {
weighted := NewSampler(0.5, 10, 0.9, 0.2, -1)
samplers := map[string]Sampler{
"Greedy": NewSampler(0, 0, 0, 0, 0), // Use NewSampler with temp=0 for greedy
"Weighted": weighted,
"Greedy": NewSampler(0, 0, 0, 0, 0, nil), // Use NewSampler with temp=0 for greedy
"Weighted": NewSampler(0.5, 10, 0.9, 0.2, -1, nil),
}
// Generate random logits for benchmarking
@ -132,7 +48,7 @@ func BenchmarkSample(b *testing.B) {
b.ResetTimer()
for b.Loop() {
if _, err := s.Sample(logits); err != nil {
b.Error(err)
b.Fatalf("error sampling: %v", err)
}
}
})

View File

@ -5,7 +5,7 @@ import (
"slices"
)
func softmax(ts []logit) []logit {
func softmax(ts []token) []token {
var sum float32
for i, v := range ts {
ts[i].value = float32(math.Exp(float64(v.value)))
@ -19,7 +19,7 @@ func softmax(ts []logit) []logit {
return ts
}
func temperature(ti []logit, t float32) []logit {
func temperature(ti []token, t float32) []token {
if t == 1 {
return ti
}
@ -51,7 +51,7 @@ func temperature(ti []logit, t float32) []logit {
// 1. Finds the smallest value between the node and its children
// 2. If the node is not the smallest, swaps it with its smallest child
// 3. Continues this process down the affected path until the min-heap property is restored
func siftDown(data []logit, start, end int) {
func siftDown(data []token, start, end int) {
root := start
for {
child := 2*root + 1
@ -73,7 +73,7 @@ func siftDown(data []logit, start, end int) {
}
// topK limits the number of tokens considered to the k highest logits
func topK(ts []logit, k int) []logit {
func topK(ts []token, k int) []token {
if k >= len(ts) {
return ts
}
@ -99,7 +99,7 @@ func topK(ts []logit, k int) []logit {
}
// topP limits tokens to those with cumulative probability p
func topP(ts []logit, p float32) []logit {
func topP(ts []token, p float32) []token {
if p == 1.0 {
return ts
}
@ -118,7 +118,7 @@ func topP(ts []logit, p float32) []logit {
}
// minP filters out tokens whose probability falls below the fraction p of the most likely token
func minP(ts []logit, p float32) []logit {
func minP(ts []token, p float32) []token {
if p == 1.0 {
return ts
}
@ -146,7 +146,7 @@ func minP(ts []logit, p float32) []logit {
// TODO(parthsareen): possibly replace with simpler implementation https://github.com/ollama/ollama/issues/9584
// Counting sort implementation to sort tokens by logits
func sortLogits(tokens []logit) {
func sortLogits(tokens []token) {
if len(tokens) <= 1 {
return
}
@ -187,7 +187,7 @@ func sortLogits(tokens []logit) {
}
// Second pass: place elements in correct position
output := make([]logit, len(tokens))
output := make([]token, len(tokens))
// Track current positions
countsCopy := counts
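For orientation, the renamed transforms compose in the order Sampler.sample applies them; a sketch with illustrative parameter values:

    tokens = topK(tokens, 40)         // keep the 40 highest logits (also leaves them sorted)
    tokens = temperature(tokens, 0.8) // rescale the surviving logits
    tokens = softmax(tokens)          // logits -> probabilities
    tokens = topP(tokens, 0.9)        // keep the smallest set covering 90% of the mass
    tokens = minP(tokens, 0.05)       // drop tokens far below the most likely one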

View File

@ -7,10 +7,10 @@ import (
)
// Helper to convert float64 slice to token slice
func toLogits(values []float64) []logit {
tokens := make([]logit, len(values))
func toTokens(values []float64) []token {
tokens := make([]token, len(values))
for i, v := range values {
tokens[i] = logit{
tokens[i] = token{
id: int32(i),
value: float32(v),
}
@ -19,7 +19,7 @@ func toLogits(values []float64) []logit {
}
// Helper to compare logit slices
func compareLogits(t *testing.T, name string, want []float64, got []logit) {
func compareLogits(t *testing.T, name string, want []float64, got []token) {
t.Helper()
if len(want) != len(got) {
t.Errorf("%s: length mismatch: want %d, got %d", name, len(want), len(got))
@ -36,13 +36,13 @@ func TestTemperature(t *testing.T) {
input := []float64{2, -1, 4, -3, 1, -2, 0}
want := []float64{-4, -10, 0, -14, -6, -12, -8} // (logit - max logit) / temp
got := temperature(toLogits(input), 0.5)
got := temperature(toTokens(input), 0.5)
compareLogits(t, "Temperature", want, got)
}
func TestSoftmax(t *testing.T) {
input := []float64{-3, -2, -1, 0, 1, 2, 4}
got := softmax(toLogits(input))
got := softmax(toTokens(input))
// Check probabilities sum to 1
var sum float32
@ -65,7 +65,7 @@ func TestTopK(t *testing.T) {
input := []float64{-3, -2, -1, 0, 1, 2, 4}
// Test k=3
got := topK(toLogits(input), 3)
got := topK(toTokens(input), 3)
if len(got) != 3 {
t.Errorf("topK(3): wrong length: want 3, got %d", len(got))
}
@ -74,13 +74,13 @@ func TestTopK(t *testing.T) {
compareLogits(t, "topK(3)", want, got)
// Test k > len
got = topK(toLogits(input), 10)
got = topK(toTokens(input), 10)
compareLogits(t, "topK(10)", input, got)
}
func TestTopP(t *testing.T) {
input := []float64{-3, -2, -1, 0, 1, 2, 4}
tokens := toLogits(input)
tokens := toTokens(input)
// First apply temperature and softmax to get probabilities
tokens = temperature(tokens, 1)
@ -99,7 +99,7 @@ func TestTopP(t *testing.T) {
func TestMinP(t *testing.T) {
input := []float64{-3, -2, -1, 0, 1, 2, 4, 3}
tokens := toLogits(input)
tokens := toTokens(input)
// First apply temperature and softmax
tokens = temperature(tokens, 1)
@ -116,7 +116,7 @@ func TestMinP(t *testing.T) {
func TestSortLogits(t *testing.T) {
input := []float64{3, 1, 4, 2, -1, 0, -2}
tokens := toLogits(input)
tokens := toTokens(input)
sortLogits(tokens)
@ -133,15 +133,15 @@ func TestSortLogits(t *testing.T) {
func BenchmarkTransforms(b *testing.B) {
// Generate random logits
tokens := make([]logit, 1<<16)
tokens := make([]token, 1<<16)
for i := range tokens {
tokens[i] = logit{
tokens[i] = token{
id: int32(i),
value: rand.Float32(),
}
}
tokensCopy := make([]logit, len(tokens))
tokensCopy := make([]token, len(tokens))
b.Run("Temperature", func(b *testing.B) {
b.ResetTimer()