From 29ddfc2cab7f5a83a96c3133094f67b22e4f27d1 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 9 Sep 2025 10:48:34 -0700
Subject: [PATCH] ggml: Disable flash attention for gemma2

Our new engine implementation of gemma2 doesn't support flash
attention, which means that it also doesn't support KV cache
quantization. Currently, it is possible to turn these two on, which
will result in a crash.
---
 fs/ggml/ggml.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 57476a9a88..6b582b4994 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -883,6 +883,10 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}
 
+	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
+		return false
+	}
+
 	// Check head counts match and are non-zero
 	headCountK := f.KV().EmbeddingHeadCountK()
 	headCountV := f.KV().EmbeddingHeadCountV()
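
Note on the pattern: the new check short-circuits before the head-count
validation, so the architecture gate wins regardless of any other
capability checks. Below is a minimal, self-contained sketch of that
gating pattern, not the actual ollama code: supportsFlashAttention,
flashAttentionUnsupported, and the plain-argument signature are
hypothetical stand-ins for the real GGML.SupportsFlashAttention method
and its f.KV() accessors.

package main

import (
	"fmt"
	"slices"
)

// flashAttentionUnsupported mirrors the patch's inline []string{"gemma2"}:
// architectures whose new-engine implementations lack flash attention
// (and therefore KV cache quantization). Keeping it as a slice makes
// adding another unsupported architecture a one-element change.
var flashAttentionUnsupported = []string{"gemma2"}

// supportsFlashAttention is a hypothetical stand-in for
// GGML.SupportsFlashAttention. The architecture gate runs first, so an
// unsupported model can never reach a flash-attention code path that
// would crash.
func supportsFlashAttention(arch string, headCountK, headCountV uint64) bool {
	if slices.Contains(flashAttentionUnsupported, arch) {
		return false
	}
	// As in the surrounding original code: head counts must match and
	// be non-zero.
	return headCountK != 0 && headCountK == headCountV
}

func main() {
	fmt.Println(supportsFlashAttention("gemma2", 16, 16)) // false: gated by architecture
	fmt.Println(supportsFlashAttention("llama", 8, 8))    // true: passes both checks
}

Using slices.Contains rather than a direct string comparison makes the
intent explicit and leaves room for other architectures with the same
limitation.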