From 29ddfc2cab7f5a83a96c3133094f67b22e4f27d1 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 9 Sep 2025 10:48:34 -0700
Subject: [PATCH] ggml: Disable flash attention for gemma2

Our new engine implementation of gemma2 doesn't support flash
attention, which means that it also doesn't support KV cache
quantization. Currently, it is possible to turn these two on, which
will result in a crash.
---
 fs/ggml/ggml.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 57476a9a88..6b582b4994 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -883,6 +883,10 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}
 
+	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
+		return false
+	}
+
 	// Check head counts match and are non-zero
 	headCountK := f.KV().EmbeddingHeadCountK()
 	headCountV := f.KV().EmbeddingHeadCountV()
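
Note on the pattern: the new check short-circuits before the head-count
validation, so the architecture gate wins regardless of any other
capability checks. Below is a minimal, self-contained sketch of that
gating pattern, not the actual ollama code: supportsFlashAttention,
flashAttentionUnsupported, and the plain-argument signature are
hypothetical stand-ins for the real GGML.SupportsFlashAttention method
and its f.KV() accessors.

package main

import (
	"fmt"
	"slices"
)

// flashAttentionUnsupported mirrors the patch's inline []string{"gemma2"}:
// architectures whose new-engine implementations lack flash attention
// (and therefore KV cache quantization). Keeping it as a slice makes
// adding another unsupported architecture a one-element change.
var flashAttentionUnsupported = []string{"gemma2"}

// supportsFlashAttention is a hypothetical stand-in for
// GGML.SupportsFlashAttention. The architecture gate runs first, so an
// unsupported model can never reach a flash-attention code path that
// would crash.
func supportsFlashAttention(arch string, headCountK, headCountV uint64) bool {
	if slices.Contains(flashAttentionUnsupported, arch) {
		return false
	}
	// As in the surrounding original code: head counts must match and
	// be non-zero.
	return headCountK != 0 && headCountK == headCountV
}

func main() {
	fmt.Println(supportsFlashAttention("gemma2", 16, 16)) // false: gated by architecture
	fmt.Println(supportsFlashAttention("llama", 8, 8))    // true: passes both checks
}

Using slices.Contains rather than a direct string comparison makes the
intent explicit and leaves room for other architectures with the same
limitation.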