mirror of
https://github.com/ollama/ollama.git
synced 2025-03-19 06:11:51 +01:00
The encoder cache needs to know the position of images in the input stream so that it knows when to delete them. Previously images didn't have a position, so we implied one by breaking batches before an image and then assuming the image was in the first position. However, multimodal objects are now given explicit positions in the input stream, so we can use that instead. Breaking batches was also a way to simulate a cross attention mask for mllama. However, given that it only supports a single sequence and a single image, this mask doesn't serve any real purpose. Removing the batch break does not appear to affect the quality of the output. Most of this is simply moving the input data structures to a new package to avoid import cycles.
140 lines
3.2 KiB
Go
140 lines
3.2 KiB
Go
package kvcache
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/ollama/ollama/ml"
|
|
"github.com/ollama/ollama/model/input"
|
|
)
|
|
|
|
// Encoder cache stores K and V tensors that are position independent
|
|
//
|
|
// The tensors can be of any shape and will be returned as they were stored
|
|
// The mask is currently always nil
|
|
//
|
|
// Not currently safe for multiple sequences
|
|
type EncoderCache struct {
|
|
// config controls mostly backend-specific optimizations
|
|
config *ml.CacheConfig
|
|
|
|
// ** current forward pass **
|
|
|
|
// the active layer for Get and Put
|
|
curLayer int
|
|
|
|
// if something is stored during this pass, this
|
|
// will be the position (but there is no guarantee
|
|
// anything will be stored)
|
|
curPos int32
|
|
|
|
// ** cache metadata **
|
|
|
|
// was something stored in the cache?
|
|
encoderCached bool
|
|
|
|
// position of the cached data
|
|
encoderPos int32
|
|
|
|
// ** cache data storage **
|
|
backend ml.Backend
|
|
ctxs map[int]ml.Context
|
|
keys, values map[int]ml.Tensor
|
|
}
|
|
|
|
func NewEncoderCache() *EncoderCache {
|
|
return &EncoderCache{
|
|
ctxs: make(map[int]ml.Context),
|
|
keys: make(map[int]ml.Tensor),
|
|
values: make(map[int]ml.Tensor),
|
|
}
|
|
}
|
|
|
|
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
|
|
if c.config == nil {
|
|
var config ml.CacheConfig
|
|
if cc, ok := backend.(ml.BackendCacheConfig); ok {
|
|
config = cc.CacheConfig()
|
|
}
|
|
c.config = &config
|
|
}
|
|
|
|
if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
|
|
panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
|
|
}
|
|
|
|
c.backend = backend
|
|
}
|
|
|
|
func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
|
|
if c.config != nil {
|
|
panic("config cannot be changed after being previously set, either by the model or backend")
|
|
}
|
|
|
|
c.config = &config
|
|
}
|
|
|
|
func (c *EncoderCache) Close() {
|
|
for _, ctx := range c.ctxs {
|
|
ctx.Close()
|
|
}
|
|
}
|
|
|
|
func (c *EncoderCache) StartForward(ctx ml.Context, opts input.Options) error {
|
|
// We work with the most recent image
|
|
if len(opts.Multimodal) > 0 {
|
|
c.curPos = opts.Positions[opts.Multimodal[len(opts.Multimodal)-1].Index]
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *EncoderCache) SetLayer(layer int) {
|
|
c.curLayer = layer
|
|
}
|
|
|
|
func (c *EncoderCache) EncoderCached() bool {
|
|
return c.encoderCached
|
|
}
|
|
|
|
func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
|
|
return c.keys[c.curLayer], c.values[c.curLayer], nil
|
|
}
|
|
|
|
func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
|
|
c.encoderPos = c.curPos
|
|
c.encoderCached = true
|
|
|
|
if c.config.PermutedV {
|
|
value = value.Permute(ctx, 1, 2, 0, 3)
|
|
}
|
|
|
|
if _, ok := c.ctxs[c.curLayer]; !ok {
|
|
c.ctxs[c.curLayer] = c.backend.NewContextSize(2).Layer(c.curLayer)
|
|
}
|
|
|
|
if _, ok := c.keys[c.curLayer]; !ok {
|
|
c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
|
|
}
|
|
|
|
if _, ok := c.values[c.curLayer]; !ok {
|
|
c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
|
|
}
|
|
|
|
ctx.Forward(
|
|
key.Copy(ctx, c.keys[c.curLayer]),
|
|
value.Copy(ctx, c.values[c.curLayer]),
|
|
)
|
|
}
|
|
|
|
func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
|
|
panic("encoder cache does not support multiple sequences")
|
|
}
|
|
|
|
func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
|
|
if c.encoderPos >= beginIndex && c.encoderPos < endIndex {
|
|
c.encoderCached = false
|
|
}
|
|
|
|
return nil
|
|
}
|