From 1feff619779115d76f033eb59a7a896aad6c2e18 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 24 Mar 2025 21:17:53 -0700 Subject: [PATCH] kvcache: Sliding window cache only needs a single batch total When computing the size of the cache for sliding window attention, we don't need to multiply the batch size by the number of parallel sequences - the batch size is constant. This also simplifies the check for whether to allocate the cache size based on capacity or window size as the batch size is already incorporated into the capacity when handled by the runner. --- kvcache/causal.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kvcache/causal.go b/kvcache/causal.go index aacaf540f..fb4f0f743 100644 --- a/kvcache/causal.go +++ b/kvcache/causal.go @@ -119,10 +119,10 @@ func (c *Causal) Init(backend ml.Backend, dtype ml.DType, maxSequences, capacity } var cacheSize int - if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize)+maxBatch { + if c.windowSize == math.MaxInt32 || capacity < int(c.windowSize) { cacheSize = maxSequences * capacity } else { - cacheSize = maxSequences * (int(c.windowSize) + maxBatch) + cacheSize = (maxSequences * int(c.windowSize)) + maxBatch } cacheSize = roundUp(cacheSize, c.config.CachePadding) c.cells = make([]cacheCell, cacheSize)