ollama/kvcache/encoder.go
jmorganca b42970063d kvcache: Add check for values that fall out of sliding window cache
The sliding window cache trims entries that are outside the window for
the latest token. This works when we are extending the cache, such as
when the conversation continues. However, if we have a partial overlap
in conversation (including the BOS tokens), then we resume from a past
point in the conversation and the needed tokens are no longer stored
in memory. This verifies that the new window overlaps with the old one
before reusing the cache.

Co-authored-by: Jesse Gross <jesse@ollama.com>
2025-04-02 11:55:48 -07:00

148 lines
3.5 KiB
Go

package kvcache
import (
"fmt"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
// Encoder cache stores K and V tensors that are position independent
//
// The tensors can be of any shape and will be returned as they were stored
// The mask is currently always nil
//
// Not currently safe for multiple sequences
type EncoderCache struct {
// config controls mostly backend-specific optimizations
config *ml.CacheConfig
// ** current forward pass **
// the active layer for Get and Put
curLayer int
// if something is stored during this pass, this
// will be the position (but there is no guarantee
// anything will be stored)
curPos int32
// ** cache metadata **
// was something stored in the cache?
encoderCached bool
// position of the cached data
encoderPos int32
// ** cache data storage **
backend ml.Backend
ctxs map[int]ml.Context
keys, values map[int]ml.Tensor
}
func NewEncoderCache() *EncoderCache {
return &EncoderCache{
ctxs: make(map[int]ml.Context),
keys: make(map[int]ml.Tensor),
values: make(map[int]ml.Tensor),
}
}
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
if c.config == nil {
var config ml.CacheConfig
if cc, ok := backend.(ml.BackendCacheConfig); ok {
config = cc.CacheConfig()
}
c.config = &config
}
if maxSequences > 1 {
panic(fmt.Errorf("encoder cache does not support multiple sequences; requested: %v", maxSequences))
}
if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
}
c.backend = backend
}
func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
if c.config != nil {
panic("config cannot be changed after being previously set, either by the model or backend")
}
c.config = &config
}
func (c *EncoderCache) Close() {
for _, ctx := range c.ctxs {
ctx.Close()
}
}
func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch) error {
// We work with the most recent image
if len(batch.Multimodal) > 0 {
c.curPos = batch.Positions[batch.Multimodal[len(batch.Multimodal)-1].Index]
}
return nil
}
func (c *EncoderCache) SetLayer(layer int) {
c.curLayer = layer
}
func (c *EncoderCache) EncoderCached() bool {
return c.encoderCached
}
func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
return c.keys[c.curLayer], c.values[c.curLayer], nil
}
func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
c.encoderPos = c.curPos
c.encoderCached = true
if c.config.PermutedV {
value = value.Permute(ctx, 1, 2, 0, 3)
}
if _, ok := c.ctxs[c.curLayer]; !ok {
c.ctxs[c.curLayer] = c.backend.NewContextSize(2).Layer(c.curLayer)
}
if _, ok := c.keys[c.curLayer]; !ok {
c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
}
if _, ok := c.values[c.curLayer]; !ok {
c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
}
ctx.Forward(
key.Copy(ctx, c.keys[c.curLayer]),
value.Copy(ctx, c.values[c.curLayer]),
)
}
func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
panic("encoder cache does not support multiple sequences")
}
func (c *EncoderCache) CanResume(seq int, pos int32) bool {
return true
}
func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
if c.encoderPos >= beginIndex && c.encoderPos < endIndex {
c.encoderCached = false
}
return nil
}