Currently the runner computes the required KV cache size and creates a cache of that size: the context size times the number of parallel sequences. Cache implementations can make better decisions about their memory usage, so instead pass in the required capacity, number of sequences, and maximum batch size. For now, the causal cache just uses these to compute the size in the same way as before.
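
For illustration, a minimal sketch of the new call shape from the runner side (setupCache, numCtx, and maxBatch are hypothetical names, and ml.DTypeF16 is an assumed dtype choice; only the Init signature comes from the file below):

	// Hypothetical runner-side setup. The encoder cache rejects more than
	// one sequence, so maxSequences is fixed at 1 here; capacity is still
	// context size times parallel sequences, as before.
	func setupCache(backend ml.Backend, numCtx, maxBatch int) *kvcache.EncoderCache {
		const maxSequences = 1
		cache := kvcache.NewEncoderCache()
		cache.Init(backend, ml.DTypeF16, maxSequences, numCtx*maxSequences, maxBatch)
		return cache
	}
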
package kvcache

import (
	"fmt"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/input"
)

// Encoder cache stores K and V tensors that are position independent
//
// The tensors can be of any shape and will be returned as they were stored
// The mask is currently always nil
//
// Not currently safe for multiple sequences
type EncoderCache struct {
	// config controls mostly backend-specific optimizations
	config *ml.CacheConfig

	// ** current forward pass **

	// the active layer for Get and Put
	curLayer int

	// if something is stored during this pass, this
	// will be the position (but there is no guarantee
	// anything will be stored)
	curPos int32

	// ** cache metadata **

	// was something stored in the cache?
	encoderCached bool

	// position of the cached data
	encoderPos int32

	// ** cache data storage **
	backend      ml.Backend
	ctxs         map[int]ml.Context
	keys, values map[int]ml.Tensor
}

// NewEncoderCache creates an empty cache; per-layer storage is
// allocated lazily on first Put.
func NewEncoderCache() *EncoderCache {
	return &EncoderCache{
		ctxs:   make(map[int]ml.Context),
		keys:   make(map[int]ml.Tensor),
		values: make(map[int]ml.Tensor),
	}
}

// Init prepares the cache for use with the given backend. The encoder
// cache sizes its tensors from the data stored in them, so capacity and
// maxBatch are unused here.
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
	if c.config == nil {
		var config ml.CacheConfig
		if cc, ok := backend.(ml.BackendCacheConfig); ok {
			config = cc.CacheConfig()
		}
		c.config = &config
	}

	if maxSequences > 1 {
		panic(fmt.Errorf("encoder cache does not support multiple sequences; requested: %v", maxSequences))
	}

	if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
		panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
	}

	c.backend = backend
}

// SetConfig sets the cache configuration. It panics if a config was
// already set, either by the model or by the backend during Init.
func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
	if c.config != nil {
		panic("config cannot be changed after being previously set, either by the model or backend")
	}

	c.config = &config
}

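For illustration, a minimal sketch of a model requesting a permuted V layout before Init (newPermutedCache and this usage are assumptions; only the SetConfig semantics and the CacheConfig fields this file reads are taken from the code):

	// Sketch: a model asking for permuted V storage. SetConfig panics if a
	// config was already set, so it must run before Init (which fills in a
	// backend-provided config when none is set).
	func newPermutedCache() *kvcache.EncoderCache {
		cache := kvcache.NewEncoderCache()
		cache.SetConfig(ml.CacheConfig{PermutedV: true})
		return cache
	}
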
// Close frees the contexts that back the cached tensors.
func (c *EncoderCache) Close() {
	for _, ctx := range c.ctxs {
		ctx.Close()
	}
}

// StartForward runs before each forward pass, recording the position of
// the image in this batch, if any.
func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch) error {
	// We work with the most recent image
	if len(batch.Multimodal) > 0 {
		c.curPos = batch.Positions[batch.Multimodal[len(batch.Multimodal)-1].Index]
	}

	return nil
}

// SetLayer selects the layer that subsequent Get and Put calls operate on.
func (c *EncoderCache) SetLayer(layer int) {
	c.curLayer = layer
}

// EncoderCached reports whether anything has been stored in the cache.
func (c *EncoderCache) EncoderCached() bool {
	return c.encoderCached
}

// Get returns the stored K and V tensors for the current layer. The
// mask is always nil.
func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
	return c.keys[c.curLayer], c.values[c.curLayer], nil
}

// Put stores K and V tensors for the current layer, allocating backing
// tensors of matching shape on first use.
func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
	c.encoderPos = c.curPos
	c.encoderCached = true

	if c.config.PermutedV {
		value = value.Permute(ctx, 1, 2, 0, 3)
	}

	if _, ok := c.ctxs[c.curLayer]; !ok {
		c.ctxs[c.curLayer] = c.backend.NewContextSize(2).Layer(c.curLayer)
	}

	if _, ok := c.keys[c.curLayer]; !ok {
		c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
	}

	if _, ok := c.values[c.curLayer]; !ok {
		c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
	}

	ctx.Forward(
		key.Copy(ctx, c.keys[c.curLayer]),
		value.Copy(ctx, c.values[c.curLayer]),
	)
}

// CopyPrefix is unsupported: the encoder cache holds a single sequence.
func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
	panic("encoder cache does not support multiple sequences")
}

// Remove invalidates the cached entry if its position falls within
// [beginIndex, endIndex).
func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
	if c.encoderPos >= beginIndex && c.encoderPos < endIndex {
		c.encoderCached = false
	}

	return nil
}
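
Putting the pieces together, a minimal sketch of how a forward pass might drive this cache (forward, computeKV, attend, and numLayers are hypothetical stand-ins, not part of this file):

	// Sketch of one forward pass: record the batch, then reuse or fill the
	// cache layer by layer. computeKV and attend are hypothetical helpers.
	func forward(ctx ml.Context, cache *kvcache.EncoderCache, batch input.Batch, numLayers int) error {
		if err := cache.StartForward(ctx, batch); err != nil {
			return err
		}

		for layer := 0; layer < numLayers; layer++ {
			cache.SetLayer(layer)
			if cache.EncoderCached() {
				// Reuse the stored tensors; the mask is always nil.
				key, value, _ := cache.Get(ctx)
				attend(ctx, key, value)
				continue
			}
			key, value := computeKV(ctx, layer)
			cache.Put(ctx, key, value)
			attend(ctx, key, value)
		}

		return nil
	}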