package kvcache

import (
	"math"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/input"
)

// WrapperCache is a container for multiple types of caches,
// such as for the encoding and decoding portions of a model.
type WrapperCache struct {
	// caches we are wrapping
	caches []Cache

	// cache to be used for this layer
	curType int
}

func NewWrapperCache(caches ...Cache) *WrapperCache {
	return &WrapperCache{
		caches: caches,
	}
}
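
// Illustrative usage (a sketch only, not part of this package's API): a model with
// separate encoder and decoder sections might wrap one cache per section and pass
// the wrapper everywhere a Cache is expected. The constructors named below are
// assumed to be the other Cache implementations in this package; substitute
// whatever caches the model actually needs.
//
//	cache := NewWrapperCache(
//		NewCausalCache(shift), // e.g. decoder self-attention
//		NewEncoderCache(),     // e.g. cross-attention over encoder output
//	)
//	cache.Init(backend, dtype, capacity)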

func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
	for _, cache := range c.caches {
		cache.Init(backend, dtype, capacity)
	}
}

func (c *WrapperCache) SetConfig(config ml.CacheConfig) {
	for _, cache := range c.caches {
		cache.SetConfig(config)
	}
}

func (c *WrapperCache) Close() {
	for _, cache := range c.caches {
		cache.Close()
	}
}

func (c *WrapperCache) StartForward(ctx ml.Context, opts input.Options) error {
	for i, cache := range c.caches {
		err := cache.StartForward(ctx, opts)
		if err != nil {
			// unwind on error - Remove with endIndex set to math.MaxInt32 does not fail
			for j := i - 1; j >= 0; j-- {
				for k := range opts.Positions {
					_ = c.caches[j].Remove(opts.Sequences[k], opts.Positions[k], math.MaxInt32)
				}
			}
			return err
		}
	}

	c.curType = 0
	return nil
}

func (c *WrapperCache) SetLayer(layer int) {
	for _, cache := range c.caches {
		cache.SetLayer(layer)
	}
}

func (c *WrapperCache) SetLayerType(layerType int) {
	c.curType = layerType
}
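
// Sketch of how the per-layer selection might be driven (hypothetical model code,
// not part of this package): before building a layer's attention, the model
// selects which wrapped cache applies, and subsequent Get/Put calls are routed to
// that cache. layerIsCrossAttention is an assumed helper, for illustration only.
//
//	cache.SetLayer(i)
//	if layerIsCrossAttention(i) {
//		cache.SetLayerType(1) // index of the wrapped cache for this layer type
//	} else {
//		cache.SetLayerType(0)
//	}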

func (c *WrapperCache) UnderlyingCache() Cache {
	return c.caches[c.curType]
}

func (c *WrapperCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
	return c.caches[c.curType].Get(ctx)
}

func (c *WrapperCache) Put(ctx ml.Context, key, value ml.Tensor) {
	c.caches[c.curType].Put(ctx, key, value)
}

func (c *WrapperCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
	for _, cache := range c.caches {
		cache.CopyPrefix(srcSeq, dstSeq, len)
	}
}

func (c *WrapperCache) Remove(seq int, beginIndex, endIndex int32) error {
	// If one of these fails, the caller is supposed to retry with endIndex set to math.MaxInt32, which should not fail
	for _, cache := range c.caches {
		err := cache.Remove(seq, beginIndex, endIndex)
		if err != nil {
			return err
		}
	}

	return nil
}