sample: temporarily use grammars for constrained generation in new engine (#9586)

2025-11-10 19:37:31 +01:00 · 2025-03-10 16:17:39 +01:00
parent a1cda80bcb
commit e093db92c4
10 changed files with 301 additions and 213 deletions
--- a/sample/samplers.go
+++ b/sample/samplers.go
@@ -2,43 +2,88 @@ package sample

 import (
 	"errors"
+	"math"
 	"math/rand/v2"
 	"slices"
+	"sync"
+
+	"github.com/ollama/ollama/llama"
 )

-// Sampler is not thread-safe. Each goroutine should have its own instance
-type Sampler interface {
-	Sample([]float32) (int32, error)
-}
-
-// logit represents information about a single token during sampling
-type logit struct {
+// token represents information about a single token during sampling
+type token struct {
 	id    int32   // The token's unique identifier
 	value float32 // The raw logit or probability from the model
 }

-type weighted struct {
+type Sampler struct {
 	rng         *rand.Rand
-	tokens      []logit
 	topK        int
 	topP        float32
 	minP        float32
 	temperature float32
+	grammar     *Grammar
 }

-func (s *weighted) Sample(logits []float32) (int32, error) {
-	if len(s.tokens) < len(logits) {
-		s.tokens = make([]logit, len(logits))
-	}
-
-	tokens := s.tokens[:len(logits)]
-
-	for i, v := range logits {
+func (s *Sampler) Sample(logits []float32) (int32, error) {
+	tokens := make([]token, len(logits))
+	for i := range logits {
 		tokens[i].id = int32(i)
-		tokens[i].value = v
+		tokens[i].value = logits[i]
+	}
+
+	t, err := s.sample(tokens)
+	if err != nil {
+		return -1, err
+	}
+
+	if s.grammar != nil {
+		// optimization: first check if the sampled token is accepted by the grammar
+		// if it is rejected, apply the grammar to all logits and sample again (slower)
+		top := []token{t}
+		s.grammar.Apply(top)
+		if !math.IsInf(float64(top[0].value), -1) {
+			s.grammar.Accept(top[0].id)
+			return top[0].id, nil
+		}
+
+		// since .sample has side effects of modifying the tokens
+		// we need to reset them before applying the grammar and
+		// sampling again
+		for i := range logits {
+			tokens[i].id = int32(i)
+			tokens[i].value = logits[i]
+		}
+		s.grammar.Apply(tokens)
+		t, err = s.sample(tokens)
+		if err != nil {
+			return -1, err
+		}
+		s.grammar.Accept(t.id)
+	}
+
+	return t.id, nil
+}
+
+// greedy returns the token with the highest value (the first one on ties); tokens must be non-empty
+func greedy(tokens []token) token {
+	max := tokens[0]
+	for i := 1; i < len(tokens); i++ {
+		if tokens[i].value > max.value {
+			max = tokens[i]
+		}
+	}
+
+	return max
+}
+
+// sample picks one token per the sampler parameters: the greedy maximum when temperature
+// is 0, otherwise one chosen after the transforms. It has the side effect of modifying tokens
+func (s *Sampler) sample(tokens []token) (token, error) {
+	if s.temperature == 0 {
+		return greedy(tokens), nil
 	}

-	// Tokens are sorted by logits in TopK or SortTokens
 	if s.topK > 0 {
 		tokens = topK(tokens, s.topK)
 	} else {
@@ -47,12 +92,14 @@ func (s *weighted) Sample(logits []float32) (int32, error) {

 	tokens = temperature(tokens, s.temperature)
 	tokens = softmax(tokens)
-
 	tokens = topP(tokens, s.topP)
 	tokens = minP(tokens, s.minP)

+	// TODO: this should fall back to greedy sampling
+	// or topP, topK values etc should be such that
+	// there are always tokens to sample from
 	if len(tokens) == 0 {
-		return -1, errors.New("no valid logits found for weighted sampling")
+		return token{}, errors.New("no tokens to sample from")
 	}

 	var r float32
@@ -70,48 +117,18 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
 	}
 	r *= tokens[len(tokens)-1].value

-	idx, _ := slices.BinarySearchFunc(tokens, r, func(token logit, target float32) int {
-		// Compare cumulative probabilities
+	idx, _ := slices.BinarySearchFunc(tokens, r, func(token token, target float32) int {
 		if token.value < target {
 			return -1
 		}
-		// First token that exceeds target
 		return 1
 	})

-	if idx >= len(tokens) {
-		idx = len(tokens) - 1
-	}
-
-	return tokens[idx].id, nil
-}
-
-type greedy struct{}
-
-// Greedy sample returns the index of the maximum value in logits.
-func (s greedy) Sample(logits []float32) (int32, error) {
-	if len(logits) == 0 {
-		return -1, errors.New("no logits provided for greedy sampling")
-	}
-
-	maxIdx := 0
-	maxVal := logits[0]
-	for i := 1; i < len(logits); i++ {
-		if logits[i] > maxVal {
-			maxVal = logits[i]
-			maxIdx = i
-		}
-	}
-
-	return int32(maxIdx), nil
+	return tokens[idx], nil
 }

 // TODO(parthsareen): update sampler interface to use json unmarshal https://github.com/ollama/ollama/issues/9278
-func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int) Sampler {
-	if temperature == 0 {
-		return &greedy{}
-	}
-
+func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *Grammar) Sampler {
 	var rng *rand.Rand
 	if seed != -1 {
 		// PCG requires two parameters: sequence and stream
@@ -120,7 +137,9 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
 		// Use golden ratio hash to generate statistically independent seeds
 		rng = rand.New(rand.NewPCG(sequence, sequence^0x9E3779B9))
 	}
-	temperature = max(temperature, 1)
+	if temperature < 0.0 {
+		temperature = 0.0
+	}

 	if topP < 0.0 {
 		topP = 0.0
@@ -136,11 +155,73 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
 		minP = 1.0
 	}

-	return &weighted{
+	return Sampler{
 		rng:         rng,
 		topK:        topK,
 		topP:        topP,
 		minP:        minP,
 		temperature: temperature,
+		grammar:     grammar,
 	}
 }
+
+type Grammar struct {
+	vocab   *Vocab
+	grammar string
+	sampler *llama.Sampler
+}
+
+func NewGrammar(vocab *Vocab, grammar string) (*Grammar, error) {
+	v, err := vocab.Load()
+	if err != nil {
+		return nil, err
+	}
+
+	return &Grammar{
+		vocab:   vocab,
+		grammar: grammar,
+		sampler: llama.NewGrammarSampler(v, grammar),
+	}, nil
+}
+
+func (g *Grammar) Apply(tokens []token) {
+	tds := make([]llama.TokenData, len(tokens))
+	for i, token := range tokens {
+		tds[i].Id = token.id
+		tds[i].Logit = token.value
+	}
+
+	g.sampler.Apply(tds)
+
+	for i := range tokens {
+		tokens[i].value = tds[i].Logit
+	}
+}
+
+func (g *Grammar) Accept(token int32) {
+	g.sampler.Accept(token)
+}
+
+type Vocab struct {
+	once  sync.Once
+	vocab *llama.Vocab
+	err   error
+	path  string
+}
+
+func NewVocab(path string) *Vocab {
+	return &Vocab{path: path}
+}
+
+// Load returns the vocabulary, loading it from path on the first call and caching the result or error
+func (v *Vocab) Load() (*llama.Vocab, error) {
+	v.once.Do(func() {
+		vocab, err := llama.LoadVocabFromFile(v.path)
+		if err != nil {
+			v.err = err
+			return
+		}
+		v.vocab = vocab
+	})
+	return v.vocab, v.err
+}
--- a/sample/samplers_benchmark_test.go
+++ b/sample/samplers_benchmark_test.go
@@ -16,13 +16,10 @@ func BenchmarkWeightedSampler(b *testing.B) {
 				logits[i] = float32(rand.Float64()*10 - 5)
 			}

-			sampler := NewSampler(0.8, 0, 0, 0, 42)
+			sampler := NewSampler(0.8, 0, 0, 0, 42, nil)
 			b.ResetTimer()
 			for b.Loop() {
-				_, err := sampler.Sample(logits)
-				if err != nil {
-					b.Fatalf("Sampling failed: %v", err)
-				}
+				sampler.Sample(logits)
 			}
 		})
 	}
@@ -52,30 +49,24 @@ func BenchmarkWeightedSampler(b *testing.B) {

 	for _, tc := range configs {
 		b.Run("Config"+tc.name, func(b *testing.B) {
-			sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed)
+			sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed, nil)
 			sampler.Sample(logits)

 			b.ResetTimer()

 			for b.Loop() {
-				_, err := sampler.Sample(logits)
-				if err != nil {
-					b.Fatalf("Sampling failed: %v", err)
-				}
+				sampler.Sample(logits)
 			}
 		})
 	}

 	// Test with combined transforms separately - topK influences performance greatly
 	b.Run("TransformCombined", func(b *testing.B) {
-		sampler := NewSampler(0.8, 50, 0.9, 0.05, 42)
+		sampler := NewSampler(0.8, 50, 0.9, 0.05, 42, nil)
 		b.ResetTimer()

 		for b.Loop() {
-			_, err := sampler.Sample(logits)
-			if err != nil {
-				b.Fatalf("Sampling failed: %v", err)
-			}
+			sampler.Sample(logits)
 		}
 	})
 }
@@ -90,14 +81,11 @@ func BenchmarkGreedySampler(b *testing.B) {
 				logits[i] = float32(rand.Float64()*10 - 5)
 			}

-			sampler := NewSampler(0, -1, 0, 0, -1)
+			sampler := NewSampler(0, -1, 0, 0, -1, nil)
 			b.ResetTimer()

 			for b.Loop() {
-				_, err := sampler.Sample(logits)
-				if err != nil {
-					b.Fatalf("Sampling failed: %v", err)
-				}
+				sampler.Sample(logits)
 			}
 		})
 	}
--- a/sample/samplers_test.go
+++ b/sample/samplers_test.go
@@ -7,7 +7,7 @@ import (

 func TestWeighted(t *testing.T) {
 	logits := []float32{-10, 3, -10, -10}
-	sampler := NewSampler(0, 0, 0, 0, 0)
+	sampler := NewSampler(0, 0, 0, 0, 0, nil)
 	got, err := sampler.Sample(logits)
 	if err != nil {
 		t.Error(err)
@@ -19,7 +19,7 @@ func TestWeighted(t *testing.T) {
 	}

 	logits = []float32{-100, -10, 0, 10}
-	sampler = NewSampler(0, 0, 0, 0, 0)
+	sampler = NewSampler(0, 0, 0, 0, 0, nil)
 	got, err = sampler.Sample(logits)
 	if err != nil {
 		t.Error(err)
@@ -31,94 +31,10 @@ func TestWeighted(t *testing.T) {
 	}
 }

-func TestNewSampler(t *testing.T) {
-	tests := []struct {
-		name        string
-		temperature float32
-		topK        int
-		topP        float32
-		minP        float32
-		seed        int
-		wantGreedy  bool // Instead of wantErr, check if we get greedy sampler
-	}{
-		{
-			name:        "temperature",
-			temperature: 0.5,
-			wantGreedy:  false,
-		},
-		{
-			name:        "zero temperature - greedy",
-			temperature: 0,
-			wantGreedy:  true,
-		},
-		{
-			name:        "top k",
-			temperature: 0.1,
-			topK:        10,
-			wantGreedy:  false,
-		},
-		{
-			name:        "top p",
-			temperature: 0.1,
-			topP:        0.9,
-			wantGreedy:  false,
-		},
-		{
-			name:        "min p",
-			temperature: 0.1,
-			minP:        0.2,
-			wantGreedy:  false,
-		},
-		{
-			name:        "seed - weighted",
-			temperature: 0.1,
-			seed:        42,
-			wantGreedy:  false,
-		},
-		{
-			name:        "default values",
-			temperature: 0.8,
-			topK:        40,
-			topP:        0.9,
-			minP:        0.0,
-			seed:        0,
-			wantGreedy:  false,
-		},
-		{
-			name:        "all zeroes - greedy",
-			temperature: 0.0,
-			topK:        0,
-			topP:        0.0,
-			minP:        0.0,
-			seed:        0,
-			wantGreedy:  true,
-		},
-		{
-			name:        "all transforms",
-			temperature: 0.8,
-			topK:        50,
-			topP:        0.95,
-			minP:        0.1,
-			seed:        42,
-			wantGreedy:  false,
-		},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			sampler := NewSampler(tt.temperature, tt.topK, tt.topP, tt.minP, tt.seed)
-			_, isGreedy := sampler.(*greedy)
-			if isGreedy != tt.wantGreedy {
-				t.Errorf("NewSampler() got greedy = %v, want %v", isGreedy, tt.wantGreedy)
-			}
-		})
-	}
-}
-
 func BenchmarkSample(b *testing.B) {
-	weighted := NewSampler(0.5, 10, 0.9, 0.2, -1)
 	samplers := map[string]Sampler{
-		"Greedy":   NewSampler(0, 0, 0, 0, 0), // Use NewSampler with temp=0 for greedy
-		"Weighted": weighted,
+		"Greedy":   NewSampler(0, 0, 0, 0, 0, nil), // Use NewSampler with temp=0 for greedy
+		"Weighted": NewSampler(0.5, 10, 0.9, 0.2, -1, nil),
 	}

 	// Generate random logits for benchmarking
@@ -132,7 +48,7 @@ func BenchmarkSample(b *testing.B) {
 			b.ResetTimer()
 			for b.Loop() {
 				if _, err := s.Sample(logits); err != nil {
-					b.Error(err)
+					b.Fatalf("error sampling: %v", err)
 				}
 			}
 		})
--- a/sample/transforms.go
+++ b/sample/transforms.go
@@ -5,7 +5,7 @@ import (
 	"slices"
 )

-func softmax(ts []logit) []logit {
+func softmax(ts []token) []token {
 	var sum float32
 	for i, v := range ts {
 		ts[i].value = float32(math.Exp(float64(v.value)))
@@ -19,7 +19,7 @@ func softmax(ts []logit) []logit {
 	return ts
 }

-func temperature(ti []logit, t float32) []logit {
+func temperature(ti []token, t float32) []token {
 	if t == 1 {
 		return ti
 	}
@@ -51,7 +51,7 @@ func temperature(ti []logit, t float32) []logit {
 // 1. Finds the smallest value between the node and its children
 // 2. If the node is not the smallest, swaps it with its smallest child
 // 3. Continues this process down the affected path until the min-heap property is restored
-func siftDown(data []logit, start, end int) {
+func siftDown(data []token, start, end int) {
 	root := start
 	for {
 		child := 2*root + 1
@@ -73,7 +73,7 @@ func siftDown(data []logit, start, end int) {
 }

 // topK limits the number of tokens considered to the k highest logits
-func topK(ts []logit, k int) []logit {
+func topK(ts []token, k int) []token {
 	if k >= len(ts) {
 		return ts
 	}
@@ -99,7 +99,7 @@ func topK(ts []logit, k int) []logit {
 }

 // topP limits tokens to those with cumulative probability p
-func topP(ts []logit, p float32) []logit {
+func topP(ts []token, p float32) []token {
 	if p == 1.0 {
 		return ts
 	}
@@ -118,7 +118,7 @@ func topP(ts []logit, p float32) []logit {
 }

 // minP limits tokens to those with cumulative probability p
-func minP(ts []logit, p float32) []logit {
+func minP(ts []token, p float32) []token {
 	if p == 1.0 {
 		return ts
 	}
@@ -146,7 +146,7 @@ func minP(ts []logit, p float32) []logit {

 // TODO(parthsareen): possibly replace with simpler implementation https://github.com/ollama/ollama/issues/9584
 // Conting sort implementation to sort tokens by logits
-func sortLogits(tokens []logit) {
+func sortLogits(tokens []token) {
 	if len(tokens) <= 1 {
 		return
 	}
@@ -187,7 +187,7 @@ func sortLogits(tokens []logit) {
 	}

 	// Second pass: place elements in correct position
-	output := make([]logit, len(tokens))
+	output := make([]token, len(tokens))
 	// Track current positions
 	countsCopy := counts

--- a/sample/transforms_test.go
+++ b/sample/transforms_test.go
@@ -7,10 +7,10 @@ import (
 )

 // Helper to convert float64 slice to logit slice
-func toLogits(values []float64) []logit {
-	tokens := make([]logit, len(values))
+func toTokens(values []float64) []token {
+	tokens := make([]token, len(values))
 	for i, v := range values {
-		tokens[i] = logit{
+		tokens[i] = token{
 			id:    int32(i),
 			value: float32(v),
 		}
@@ -19,7 +19,7 @@ func toLogits(values []float64) []logit {
 }

 // Helper to compare logit slices
-func compareLogits(t *testing.T, name string, want []float64, got []logit) {
+func compareLogits(t *testing.T, name string, want []float64, got []token) {
 	t.Helper()
 	if len(want) != len(got) {
 		t.Errorf("%s: length mismatch: want %d, got %d", name, len(want), len(got))
@@ -36,13 +36,13 @@ func TestTemperature(t *testing.T) {
 	input := []float64{2, -1, 4, -3, 1, -2, 0}
 	want := []float64{-4, -10, 0, -14, -6, -12, -8} // (logit - max logit) / temp

-	got := temperature(toLogits(input), 0.5)
+	got := temperature(toTokens(input), 0.5)
 	compareLogits(t, "Temperature", want, got)
 }

 func TestSoftmax(t *testing.T) {
 	input := []float64{-3, -2, -1, 0, 1, 2, 4}
-	got := softmax(toLogits(input))
+	got := softmax(toTokens(input))

 	// Check probabilities sum to 1
 	var sum float32
@@ -65,7 +65,7 @@ func TestTopK(t *testing.T) {
 	input := []float64{-3, -2, -1, 0, 1, 2, 4}

 	// Test k=3
-	got := topK(toLogits(input), 3)
+	got := topK(toTokens(input), 3)
 	if len(got) != 3 {
 		t.Errorf("topK(3): wrong length: want 3, got %d", len(got))
 	}
@@ -74,13 +74,13 @@ func TestTopK(t *testing.T) {
 	compareLogits(t, "topK(3)", want, got)

 	// Test k > len
-	got = topK(toLogits(input), 10)
+	got = topK(toTokens(input), 10)
 	compareLogits(t, "topK(10)", input, got)
 }

 func TestTopP(t *testing.T) {
 	input := []float64{-3, -2, -1, 0, 1, 2, 4}
-	tokens := toLogits(input)
+	tokens := toTokens(input)

 	// First apply temperature and softmax to get probabilities
 	tokens = temperature(tokens, 1)
@@ -99,7 +99,7 @@ func TestTopP(t *testing.T) {

 func TestMinP(t *testing.T) {
 	input := []float64{-3, -2, -1, 0, 1, 2, 4, 3}
-	tokens := toLogits(input)
+	tokens := toTokens(input)

 	// First apply temperature and softmax
 	tokens = temperature(tokens, 1)
@@ -116,7 +116,7 @@ func TestMinP(t *testing.T) {

 func TestSortLogits(t *testing.T) {
 	input := []float64{3, 1, 4, 2, -1, 0, -2}
-	tokens := toLogits(input)
+	tokens := toTokens(input)

 	sortLogits(tokens)

@@ -133,15 +133,15 @@ func TestSortLogits(t *testing.T) {

 func BenchmarkTransforms(b *testing.B) {
 	// Generate random logits
-	tokens := make([]logit, 1<<16)
+	tokens := make([]token, 1<<16)
 	for i := range tokens {
-		tokens[i] = logit{
+		tokens[i] = token{
 			id:    int32(i),
 			value: rand.Float32(),
 		}
 	}

-	tokensCopy := make([]logit, len(tokens))
+	tokensCopy := make([]token, len(tokens))

 	b.Run("Temperature", func(b *testing.B) {
 		b.ResetTimer()