ggml: Disable flash attention for gemma2

Our new engine implementation of gemma2 doesn't support flash
attention, which means it also doesn't support KV cache
quantization. Currently it is possible to enable both of these,
which results in a crash.
Author: Jesse Gross
Date: 2025-09-09 10:48:34 -07:00
Committed by: Jesse Gross
Parent: 71cb86af3e
Commit: 29ddfc2cab

@@ -883,6 +883,10 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}
+	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
+		return false
+	}
 	// Check head counts match and are non-zero
 	headCountK := f.KV().EmbeddingHeadCountK()
 	headCountV := f.KV().EmbeddingHeadCountV()
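
For illustration, a minimal sketch of how a caller might use this check to fall back safely instead of crashing. The helper name resolveAttentionOptions and the option values (including the "f16" default KV cache type) are assumptions made for this sketch, not Ollama's actual configuration API:

	// resolveAttentionOptions is a hypothetical helper showing the intent of
	// this change: if the model (e.g. gemma2 on the new engine) does not
	// support flash attention, both flash attention and KV cache quantization
	// fall back to safe defaults rather than being allowed to crash at runtime.
	func resolveAttentionOptions(f GGML, wantFlashAttention bool, wantKVCacheType string) (flashAttention bool, kvCacheType string) {
		if !f.SupportsFlashAttention() {
			return false, "f16" // unquantized KV cache as the safe default
		}
		return wantFlashAttention, wantKVCacheType
	}

With a check like this in the request path, enabling either option for gemma2 degrades gracefully instead of reaching unsupported code in the new engine.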