mirror of
https://github.com/ollama/ollama.git
synced 2025-11-10 15:37:27 +01:00
kvcache: Pass granular cache size into implementations
Currently the runner computes the kv size needed and creates a cache of that size. This is the context size times number of parallel sequences. Cache implementations can make better decisions about their memory usage, so instead pass in the required capacity, number of sequences and maximum batch size. For now, the causal cache just uses this to compute the size in the same way as before.
This commit is contained in:
@@ -23,9 +23,9 @@ func NewWrapperCache(caches ...Cache) *WrapperCache {
|
||||
}
|
||||
}
|
||||
|
||||
func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
|
||||
func (c *WrapperCache) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity, maxBatch int) {
|
||||
for _, cache := range c.caches {
|
||||
cache.Init(backend, dtype, capacity)
|
||||
cache.Init(backend, dtype, maxSequences, capacity, maxBatch)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user