From 8253ad4d2b2e7ac58268192051b92b59986c874f Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Tue, 5 Aug 2025 12:42:07 -0700
Subject: [PATCH] ggml: Prevent KV cache quantization on gpt-oss

KV cache quantization has a dependency on the flash attention kernel.
We currently cannot use flash attention with gpt-oss as it requires
additional operations.

The model definition does not call flash attention, so it works
regardless of the setting, but the cache will still pick up the
quantization type. This updates the flash attention setting earlier in
the loading flow so that all downstream settings are also set correctly.

Fixes: #11671
---
 fs/ggml/ggml.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index afb90720fb..fb993a288c 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -761,6 +761,10 @@ func (f GGML) SupportsFlashAttention() bool {
 		return false
 	}
 
+	if f.KV().Architecture() == "gptoss" {
+		return false
+	}
+
 	// Check head counts match and are non-zero
 	headCountK := f.KV().EmbeddingHeadCountK()
 	headCountV := f.KV().EmbeddingHeadCountV()
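
Below is a minimal, hypothetical Go sketch of the downstream behavior the
commit message describes, not code from this patch or from the Ollama
sources. The type ggmlFile and the helper resolveKVCacheType are invented
for illustration; the point is that once SupportsFlashAttention reports
false for gptoss, any cache-type selection keyed off it falls back to an
unquantized f16 cache instead of the requested quantized type.

// sketch.go: hypothetical illustration only; not part of the patch above.
package main

import "fmt"

// ggmlFile stands in for fs/ggml.GGML in this sketch.
type ggmlFile struct {
	arch string
}

// SupportsFlashAttention mirrors the patched check: report false for
// architectures (such as gptoss) that cannot use the flash attention kernel.
func (f ggmlFile) SupportsFlashAttention() bool {
	return f.arch != "gptoss"
}

// resolveKVCacheType is an invented helper showing the rule the commit
// message relies on: a quantized KV cache type is only honored when flash
// attention is available; otherwise fall back to plain f16.
func resolveKVCacheType(f ggmlFile, requested string) string {
	if requested == "" || requested == "f16" {
		return "f16"
	}
	if !f.SupportsFlashAttention() {
		return "f16" // quantized cache depends on the flash attention kernel
	}
	return requested
}

func main() {
	// With this patch, gpt-oss reports no flash attention support, so a
	// requested q8_0 cache falls back to f16; other models keep their setting.
	fmt.Println(resolveKVCacheType(ggmlFile{arch: "gptoss"}, "q8_0")) // f16
	fmt.Println(resolveKVCacheType(ggmlFile{arch: "llama"}, "q8_0"))  // q8_0
}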