package kvcache

import (
	"fmt"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/input"
)

// EncoderCache stores K and V tensors that are position independent
//
// The tensors can be of any shape and will be returned as they were stored
// The mask is currently always nil
//
// Not currently safe for multiple sequences
type EncoderCache struct {
	// config controls mostly backend-specific optimizations
	config *ml.CacheConfig

	// ** current forward pass **

	// the active layer for Get and Put
	curLayer int

	// if something is stored during this pass, this
	// will be the position (but there is no guarantee
	// anything will be stored)
	curPos int32

	// ** cache metadata **

	// was something stored in the cache?
	encoderCached bool

	// position of the cached data
	encoderPos int32

	// ** cache data storage **

	backend      ml.Backend
	ctxs         map[int]ml.Context
	keys, values map[int]ml.Tensor
}

// NewEncoderCache returns an EncoderCache with its per-layer maps initialized
func NewEncoderCache() *EncoderCache {
	return &EncoderCache{
		ctxs:   make(map[int]ml.Context),
		keys:   make(map[int]ml.Tensor),
		values: make(map[int]ml.Tensor),
	}
}

// Init captures the backend and resolves the cache configuration.
// dtype and capacity are unused: tensors are stored with exactly
// the shape and type they are given
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
	if c.config == nil {
		var config ml.CacheConfig
		if cc, ok := backend.(ml.BackendCacheConfig); ok {
			config = cc.CacheConfig()
		}
		c.config = &config
	}

	if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
		panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
	}

	c.backend = backend
}

// SetConfig overrides the cache configuration. It must be called
// before Init; once a config has been set it cannot be changed
func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
	if c.config != nil {
		panic("config cannot be changed after being previously set, either by the model or backend")
	}

	c.config = &config
}

// Close releases the per-layer contexts that back the cached tensors
func (c *EncoderCache) Close() {
	for _, ctx := range c.ctxs {
		ctx.Close()
	}
}

// StartForward begins a new forward pass, recording the position that
// will be associated with anything cached during the pass
func (c *EncoderCache) StartForward(ctx ml.Context, opts input.Options) error {
	// We work with the most recent image
	if len(opts.Multimodal) > 0 {
		c.curPos = opts.Positions[opts.Multimodal[len(opts.Multimodal)-1].Index]
	}

	return nil
}

// SetLayer sets the active layer for subsequent Get and Put calls
func (c *EncoderCache) SetLayer(layer int) {
	c.curLayer = layer
}

// EncoderCached reports whether encoder output is currently stored in the cache
func (c *EncoderCache) EncoderCached() bool {
	return c.encoderCached
}

// Get returns the cached key and value tensors for the active layer.
// The third return value is the mask, which is always nil for this cache
func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
	return c.keys[c.curLayer], c.values[c.curLayer], nil
}

// Put stores the key and value tensors for the active layer,
// allocating backing storage on first use
func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
	c.encoderPos = c.curPos
	c.encoderCached = true

	// store V permuted if the backend prefers that layout
	if c.config.PermutedV {
		value = value.Permute(ctx, 1, 2, 0, 3)
	}

	// one context per layer, sized for the two tensors (K and V) it holds
	if _, ok := c.ctxs[c.curLayer]; !ok {
		c.ctxs[c.curLayer] = c.backend.NewContextSize(2).Layer(c.curLayer)
	}

	if _, ok := c.keys[c.curLayer]; !ok {
		c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
	}

	if _, ok := c.values[c.curLayer]; !ok {
		c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
	}

	ctx.Forward(
		key.Copy(ctx, c.keys[c.curLayer]),
		value.Copy(ctx, c.values[c.curLayer]),
	)
}

// CopyPrefix is unsupported: the encoder cache holds a single sequence
func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
	panic("encoder cache does not support multiple sequences")
}

// Remove invalidates the cached encoder output if its position falls
// within [beginIndex, endIndex)
func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
	if c.encoderPos >= beginIndex && c.encoderPos < endIndex {
		c.encoderCached = false
	}

	return nil
}
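
// Usage sketch (illustrative only, not part of the package API): the cache
// is driven by the model's forward pass. The backend b, graph context mlctx,
// batch options opts, and encoder outputs k and v below are hypothetical
// stand-ins for values supplied by the caller.
//
//	cache := NewEncoderCache()
//	cache.Init(b, ml.DTypeF16, 0) // dtype and capacity are ignored by this cache
//	defer cache.Close()
//
//	_ = cache.StartForward(mlctx, opts) // records the newest image's position
//	cache.SetLayer(0)
//	if !cache.EncoderCached() {
//		cache.Put(mlctx, k, v) // store this layer's encoder K/V once
//	}
//	key, value, _ := cache.Get(mlctx) // third result (mask) is always nil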