ggml: Disable flash attention for gemma2
Our new engine implementation of gemma2 doesn't support flash attention, which means it also doesn't support KV cache quantization. Currently, both options can still be enabled, which results in a crash.
@@ -883,6 +883,10 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}
 
+	if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) {
+		return false
+	}
+
 	// Check head counts match and are non-zero
 	headCountK := f.KV().EmbeddingHeadCountK()
 	headCountV := f.KV().EmbeddingHeadCountV()
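
For context, here is a minimal standalone sketch of how a caller could gate both flash attention and KV cache quantization on a check like the one above. This is not the actual ollama server code; supportsFlashAttention and resolveKVCacheType are hypothetical stand-ins that only illustrate the fallback behavior described in the commit message.

package main

import (
	"fmt"
	"slices"
)

// supportsFlashAttention mirrors the shape of the check in the diff:
// architectures on the deny list (here just "gemma2") report no flash
// attention support, and head counts must match and be non-zero.
func supportsFlashAttention(arch string, headCountK, headCountV uint64) bool {
	if slices.Contains([]string{"gemma2"}, arch) {
		return false
	}
	return headCountK != 0 && headCountK == headCountV
}

// resolveKVCacheType falls back to an unquantized f16 cache when flash
// attention is unavailable, since a quantized KV cache depends on it.
func resolveKVCacheType(requested string, flashAttn bool) string {
	if !flashAttn && requested != "f16" {
		return "f16"
	}
	return requested
}

func main() {
	fa := supportsFlashAttention("gemma2", 8, 8)
	fmt.Println(fa)                             // false: gemma2 is excluded
	fmt.Println(resolveKVCacheType("q8_0", fa)) // f16: quantization disabled
}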