sample: temporarily use grammars for constrained generation in new engine (#9586)

This commit is contained in:
Jeffrey Morgan
2025-03-10 16:17:39 +01:00
committed by GitHub
parent a1cda80bcb
commit e093db92c4
10 changed files with 301 additions and 213 deletions

View File

@@ -2,43 +2,88 @@ package sample
import (
"errors"
"math"
"math/rand/v2"
"slices"
"sync"
"github.com/ollama/ollama/llama"
)
// Sampler is not thread-safe. Each goroutine should have its own instance
type Sampler interface {
Sample([]float32) (int32, error)
}
// logit represents information about a single token during sampling
type logit struct {
// token represents information about a single token during sampling
type token struct {
id int32 // The token's unique identifier
value float32 // The raw logit or probability from the model
}
type weighted struct {
type Sampler struct {
rng *rand.Rand
tokens []logit
topK int
topP float32
minP float32
temperature float32
grammar *Grammar
}
func (s *weighted) Sample(logits []float32) (int32, error) {
if len(s.tokens) < len(logits) {
s.tokens = make([]logit, len(logits))
}
tokens := s.tokens[:len(logits)]
for i, v := range logits {
func (s *Sampler) Sample(logits []float32) (int32, error) {
tokens := make([]token, len(logits))
for i := range logits {
tokens[i].id = int32(i)
tokens[i].value = v
tokens[i].value = logits[i]
}
t, err := s.sample(tokens)
if err != nil {
return -1, err
}
if s.grammar != nil {
// optimization: first check if the max logit is accepted by the grammar
// if the max logit is rejected, apply the grammar to all logits (slower)
top := []token{t}
s.grammar.Apply(top)
if !math.IsInf(float64(top[0].value), -1) {
s.grammar.Accept(top[0].id)
return top[0].id, nil
}
// since .sample has side effects of modifying the tokens
// we need to reset them before applying the grammar and
// sampling again
for i := range logits {
tokens[i].id = int32(i)
tokens[i].value = logits[i]
}
s.grammar.Apply(tokens)
t, err = s.sample(tokens)
if err != nil {
return -1, err
}
s.grammar.Accept(t.id)
}
return t.id, nil
}
// greedy scans tokens once and returns the entry with the largest value.
// When several tokens share the largest value, the earliest one is returned,
// because a later token only replaces the current best on a strictly greater value.
// tokens must be non-empty: indexing the first element panics on an empty slice.
func greedy(tokens []token) token {
	best := tokens[0]
	for _, candidate := range tokens[1:] {
		if candidate.value > best.value {
			best = candidate
		}
	}
	return best
}
// sample selects a token from tokens according to the sampler parameters:
// the highest-valued token when temperature is 0, otherwise a weighted random draw.
// It also has side effects of modifying the tokens
func (s *Sampler) sample(tokens []token) (token, error) {
if s.temperature == 0 {
return greedy(tokens), nil
}
// Tokens are sorted by logits in TopK or SortTokens
if s.topK > 0 {
tokens = topK(tokens, s.topK)
} else {
@@ -47,12 +92,14 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
tokens = temperature(tokens, s.temperature)
tokens = softmax(tokens)
tokens = topP(tokens, s.topP)
tokens = minP(tokens, s.minP)
// TODO: this should fall back to greedy sampling
// or topP, topK values etc should be such that
// there are always tokens to sample from
if len(tokens) == 0 {
return -1, errors.New("no valid logits found for weighted sampling")
return token{}, errors.New("no tokens to sample from")
}
var r float32
@@ -70,48 +117,18 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
}
r *= tokens[len(tokens)-1].value
idx, _ := slices.BinarySearchFunc(tokens, r, func(token logit, target float32) int {
// Compare cumulative probabilities
idx, _ := slices.BinarySearchFunc(tokens, r, func(token token, target float32) int {
if token.value < target {
return -1
}
// First token that exceeds target
return 1
})
if idx >= len(tokens) {
idx = len(tokens) - 1
}
return tokens[idx].id, nil
}
type greedy struct{}
// Greedy sample returns the index of the maximum value in logits.
func (s greedy) Sample(logits []float32) (int32, error) {
if len(logits) == 0 {
return -1, errors.New("no logits provided for greedy sampling")
}
maxIdx := 0
maxVal := logits[0]
for i := 1; i < len(logits); i++ {
if logits[i] > maxVal {
maxVal = logits[i]
maxIdx = i
}
}
return int32(maxIdx), nil
return tokens[idx], nil
}
// TODO(parthsareen): update sampler interface to use json unmarshal https://github.com/ollama/ollama/issues/9278
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int) Sampler {
if temperature == 0 {
return &greedy{}
}
func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *Grammar) Sampler {
var rng *rand.Rand
if seed != -1 {
// PCG requires two parameters: sequence and stream
@@ -120,7 +137,9 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
// Use golden ratio hash to generate statistically independent seeds
rng = rand.New(rand.NewPCG(sequence, sequence^0x9E3779B9))
}
temperature = max(temperature, 1)
if temperature < 0.0 {
temperature = 0.0
}
if topP < 0.0 {
topP = 0.0
@@ -136,11 +155,73 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
minP = 1.0
}
return &weighted{
return Sampler{
rng: rng,
topK: topK,
topP: topP,
minP: minP,
temperature: temperature,
grammar: grammar,
}
}
// Grammar bundles a grammar definition with the llama sampler built from it.
// Instances are created by NewGrammar.
type Grammar struct {
// vocab is the lazily-loaded vocabulary handle passed to NewGrammar.
vocab *Vocab
// grammar is the grammar source string passed to NewGrammar.
grammar string
// sampler is the value returned by llama.NewGrammarSampler; Apply and Accept delegate to it.
sampler *llama.Sampler
}
// NewGrammar loads the vocabulary behind vocab and builds a Grammar whose
// sampler is created by llama.NewGrammarSampler from that vocabulary and the
// given grammar string. If loading the vocabulary fails, the error is returned
// unchanged and no Grammar is created.
func NewGrammar(vocab *Vocab, grammar string) (*Grammar, error) {
	loaded, err := vocab.Load()
	if err != nil {
		return nil, err
	}

	g := &Grammar{vocab: vocab, grammar: grammar}
	g.sampler = llama.NewGrammarSampler(loaded, grammar)
	return g, nil
}
// Apply runs the grammar sampler over tokens and writes the resulting logits
// back into tokens in place. Each token is copied into a llama.TokenData
// (id and value), the slice is handed to the underlying sampler's Apply, and
// the Logit of each entry is then copied back to the token at the same index.
// NOTE(review): this relies on the llama sampler keeping entries at their
// original positions — confirm against the llama package.
func (g *Grammar) Apply(tokens []token) {
	data := make([]llama.TokenData, len(tokens))
	for i := range tokens {
		data[i].Id = tokens[i].id
		data[i].Logit = tokens[i].value
	}

	g.sampler.Apply(data)

	for i := range data {
		tokens[i].value = data[i].Logit
	}
}
// Accept forwards the given token id to the underlying llama sampler's Accept
// method. Sampler.Sample calls it once a grammar-approved token has been chosen.
// NOTE(review): presumably this advances the grammar state — confirm in the llama package.
func (g *Grammar) Accept(token int32) {
g.sampler.Accept(token)
}
// Vocab is a lazily-loaded vocabulary. The file at path is read on the first
// call to Load, and the outcome (vocabulary or error) is cached for later calls.
// It contains a sync.Once, so it should be shared by pointer (as NewVocab
// returns it) rather than copied.
type Vocab struct {
// once guards the single load attempt performed by Load.
once sync.Once
// vocab holds the loaded vocabulary; it is only set when loading succeeds.
vocab *llama.Vocab
// err holds the error from the load attempt, if it failed.
err error
// path is the file passed to llama.LoadVocabFromFile.
path string
}
// NewVocab returns a Vocab that remembers path. Nothing is read from disk
// here; the file is loaded by the first call to Load.
func NewVocab(path string) *Vocab {
	v := new(Vocab)
	v.path = path
	return v
}
// Load returns the vocabulary, reading it from v.path on the first call only.
// The load is attempted at most once: every later call returns the same cached
// vocabulary and error, including the error from a failed first attempt.
func (v *Vocab) Load() (*llama.Vocab, error) {
	v.once.Do(func() {
		loaded, err := llama.LoadVocabFromFile(v.path)
		if err == nil {
			v.vocab = loaded
			return
		}
		// Keep v.vocab nil on failure and remember why.
		v.err = err
	})

	return v.vocab, v.err
}

View File

@@ -16,13 +16,10 @@ func BenchmarkWeightedSampler(b *testing.B) {
logits[i] = float32(rand.Float64()*10 - 5)
}
sampler := NewSampler(0.8, 0, 0, 0, 42)
sampler := NewSampler(0.8, 0, 0, 0, 42, nil)
b.ResetTimer()
for b.Loop() {
_, err := sampler.Sample(logits)
if err != nil {
b.Fatalf("Sampling failed: %v", err)
}
sampler.Sample(logits)
}
})
}
@@ -52,30 +49,24 @@ func BenchmarkWeightedSampler(b *testing.B) {
for _, tc := range configs {
b.Run("Config"+tc.name, func(b *testing.B) {
sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed)
sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed, nil)
sampler.Sample(logits)
b.ResetTimer()
for b.Loop() {
_, err := sampler.Sample(logits)
if err != nil {
b.Fatalf("Sampling failed: %v", err)
}
sampler.Sample(logits)
}
})
}
// Test with combined transforms separately - topK influences performance greatly
b.Run("TransformCombined", func(b *testing.B) {
sampler := NewSampler(0.8, 50, 0.9, 0.05, 42)
sampler := NewSampler(0.8, 50, 0.9, 0.05, 42, nil)
b.ResetTimer()
for b.Loop() {
_, err := sampler.Sample(logits)
if err != nil {
b.Fatalf("Sampling failed: %v", err)
}
sampler.Sample(logits)
}
})
}
@@ -90,14 +81,11 @@ func BenchmarkGreedySampler(b *testing.B) {
logits[i] = float32(rand.Float64()*10 - 5)
}
sampler := NewSampler(0, -1, 0, 0, -1)
sampler := NewSampler(0, -1, 0, 0, -1, nil)
b.ResetTimer()
for b.Loop() {
_, err := sampler.Sample(logits)
if err != nil {
b.Fatalf("Sampling failed: %v", err)
}
sampler.Sample(logits)
}
})
}

View File

@@ -7,7 +7,7 @@ import (
func TestWeighted(t *testing.T) {
logits := []float32{-10, 3, -10, -10}
sampler := NewSampler(0, 0, 0, 0, 0)
sampler := NewSampler(0, 0, 0, 0, 0, nil)
got, err := sampler.Sample(logits)
if err != nil {
t.Error(err)
@@ -19,7 +19,7 @@ func TestWeighted(t *testing.T) {
}
logits = []float32{-100, -10, 0, 10}
sampler = NewSampler(0, 0, 0, 0, 0)
sampler = NewSampler(0, 0, 0, 0, 0, nil)
got, err = sampler.Sample(logits)
if err != nil {
t.Error(err)
@@ -31,94 +31,10 @@ func TestWeighted(t *testing.T) {
}
}
func TestNewSampler(t *testing.T) {
tests := []struct {
name string
temperature float32
topK int
topP float32
minP float32
seed int
wantGreedy bool // Instead of wantErr, check if we get greedy sampler
}{
{
name: "temperature",
temperature: 0.5,
wantGreedy: false,
},
{
name: "zero temperature - greedy",
temperature: 0,
wantGreedy: true,
},
{
name: "top k",
temperature: 0.1,
topK: 10,
wantGreedy: false,
},
{
name: "top p",
temperature: 0.1,
topP: 0.9,
wantGreedy: false,
},
{
name: "min p",
temperature: 0.1,
minP: 0.2,
wantGreedy: false,
},
{
name: "seed - weighted",
temperature: 0.1,
seed: 42,
wantGreedy: false,
},
{
name: "default values",
temperature: 0.8,
topK: 40,
topP: 0.9,
minP: 0.0,
seed: 0,
wantGreedy: false,
},
{
name: "all zeroes - greedy",
temperature: 0.0,
topK: 0,
topP: 0.0,
minP: 0.0,
seed: 0,
wantGreedy: true,
},
{
name: "all transforms",
temperature: 0.8,
topK: 50,
topP: 0.95,
minP: 0.1,
seed: 42,
wantGreedy: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
sampler := NewSampler(tt.temperature, tt.topK, tt.topP, tt.minP, tt.seed)
_, isGreedy := sampler.(*greedy)
if isGreedy != tt.wantGreedy {
t.Errorf("NewSampler() got greedy = %v, want %v", isGreedy, tt.wantGreedy)
}
})
}
}
func BenchmarkSample(b *testing.B) {
weighted := NewSampler(0.5, 10, 0.9, 0.2, -1)
samplers := map[string]Sampler{
"Greedy": NewSampler(0, 0, 0, 0, 0), // Use NewSampler with temp=0 for greedy
"Weighted": weighted,
"Greedy": NewSampler(0, 0, 0, 0, 0, nil), // Use NewSampler with temp=0 for greedy
"Weighted": NewSampler(0.5, 10, 0.9, 0.2, -1, nil),
}
// Generate random logits for benchmarking
@@ -132,7 +48,7 @@ func BenchmarkSample(b *testing.B) {
b.ResetTimer()
for b.Loop() {
if _, err := s.Sample(logits); err != nil {
b.Error(err)
b.Fatalf("error sampling: %v", err)
}
}
})

View File

@@ -5,7 +5,7 @@ import (
"slices"
)
func softmax(ts []logit) []logit {
func softmax(ts []token) []token {
var sum float32
for i, v := range ts {
ts[i].value = float32(math.Exp(float64(v.value)))
@@ -19,7 +19,7 @@ func softmax(ts []logit) []logit {
return ts
}
func temperature(ti []logit, t float32) []logit {
func temperature(ti []token, t float32) []token {
if t == 1 {
return ti
}
@@ -51,7 +51,7 @@ func temperature(ti []logit, t float32) []logit {
// 1. Finds the smallest value between the node and its children
// 2. If the node is not the smallest, swaps it with its smallest child
// 3. Continues this process down the affected path until the min-heap property is restored
func siftDown(data []logit, start, end int) {
func siftDown(data []token, start, end int) {
root := start
for {
child := 2*root + 1
@@ -73,7 +73,7 @@ func siftDown(data []logit, start, end int) {
}
// topK limits the number of tokens considered to the k highest logits
func topK(ts []logit, k int) []logit {
func topK(ts []token, k int) []token {
if k >= len(ts) {
return ts
}
@@ -99,7 +99,7 @@ func topK(ts []logit, k int) []logit {
}
// topP limits tokens to those with cumulative probability p
func topP(ts []logit, p float32) []logit {
func topP(ts []token, p float32) []token {
if p == 1.0 {
return ts
}
@@ -118,7 +118,7 @@ func topP(ts []logit, p float32) []logit {
}
// minP limits tokens to those with cumulative probability p
func minP(ts []logit, p float32) []logit {
func minP(ts []token, p float32) []token {
if p == 1.0 {
return ts
}
@@ -146,7 +146,7 @@ func minP(ts []logit, p float32) []logit {
// TODO(parthsareen): possibly replace with simpler implementation https://github.com/ollama/ollama/issues/9584
// Counting sort implementation to sort tokens by logits
func sortLogits(tokens []logit) {
func sortLogits(tokens []token) {
if len(tokens) <= 1 {
return
}
@@ -187,7 +187,7 @@ func sortLogits(tokens []logit) {
}
// Second pass: place elements in correct position
output := make([]logit, len(tokens))
output := make([]token, len(tokens))
// Track current positions
countsCopy := counts

View File

@@ -7,10 +7,10 @@ import (
)
// Helper to convert float64 slice to logit slice
func toLogits(values []float64) []logit {
tokens := make([]logit, len(values))
func toTokens(values []float64) []token {
tokens := make([]token, len(values))
for i, v := range values {
tokens[i] = logit{
tokens[i] = token{
id: int32(i),
value: float32(v),
}
@@ -19,7 +19,7 @@ func toLogits(values []float64) []logit {
}
// Helper to compare logit slices
func compareLogits(t *testing.T, name string, want []float64, got []logit) {
func compareLogits(t *testing.T, name string, want []float64, got []token) {
t.Helper()
if len(want) != len(got) {
t.Errorf("%s: length mismatch: want %d, got %d", name, len(want), len(got))
@@ -36,13 +36,13 @@ func TestTemperature(t *testing.T) {
input := []float64{2, -1, 4, -3, 1, -2, 0}
want := []float64{-4, -10, 0, -14, -6, -12, -8} // (logit - max logit) / temp
got := temperature(toLogits(input), 0.5)
got := temperature(toTokens(input), 0.5)
compareLogits(t, "Temperature", want, got)
}
func TestSoftmax(t *testing.T) {
input := []float64{-3, -2, -1, 0, 1, 2, 4}
got := softmax(toLogits(input))
got := softmax(toTokens(input))
// Check probabilities sum to 1
var sum float32
@@ -65,7 +65,7 @@ func TestTopK(t *testing.T) {
input := []float64{-3, -2, -1, 0, 1, 2, 4}
// Test k=3
got := topK(toLogits(input), 3)
got := topK(toTokens(input), 3)
if len(got) != 3 {
t.Errorf("topK(3): wrong length: want 3, got %d", len(got))
}
@@ -74,13 +74,13 @@ func TestTopK(t *testing.T) {
compareLogits(t, "topK(3)", want, got)
// Test k > len
got = topK(toLogits(input), 10)
got = topK(toTokens(input), 10)
compareLogits(t, "topK(10)", input, got)
}
func TestTopP(t *testing.T) {
input := []float64{-3, -2, -1, 0, 1, 2, 4}
tokens := toLogits(input)
tokens := toTokens(input)
// First apply temperature and softmax to get probabilities
tokens = temperature(tokens, 1)
@@ -99,7 +99,7 @@ func TestTopP(t *testing.T) {
func TestMinP(t *testing.T) {
input := []float64{-3, -2, -1, 0, 1, 2, 4, 3}
tokens := toLogits(input)
tokens := toTokens(input)
// First apply temperature and softmax
tokens = temperature(tokens, 1)
@@ -116,7 +116,7 @@ func TestMinP(t *testing.T) {
func TestSortLogits(t *testing.T) {
input := []float64{3, 1, 4, 2, -1, 0, -2}
tokens := toLogits(input)
tokens := toTokens(input)
sortLogits(tokens)
@@ -133,15 +133,15 @@ func TestSortLogits(t *testing.T) {
func BenchmarkTransforms(b *testing.B) {
// Generate random logits
tokens := make([]logit, 1<<16)
tokens := make([]token, 1<<16)
for i := range tokens {
tokens[i] = logit{
tokens[i] = token{
id: int32(i),
value: rand.Float32(),
}
}
tokensCopy := make([]logit, len(tokens))
tokensCopy := make([]token, len(tokens))
b.Run("Temperature", func(b *testing.B) {
b.ResetTimer()