From df335aac09cd921beab12891cb40957e7c2ef9b7 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Fri, 15 Aug 2025 15:01:05 -0700
Subject: [PATCH] gpt-oss: disable quantized kv cache (#11929)

---
 fs/ggml/ggml.go | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go
index 1fef745249..a739e99ba9 100644
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -752,5 +752,17 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 
-// SupportsKVCacheType checks if the requested cache type is supported
+// SupportsKVCacheType reports whether the requested KV cache type is supported
+// by this model. The accepted types are "f16", "q8_0" and "q4_0", except for
+// gpt-oss architectures, which accept only "f16".
 func (f GGML) SupportsKVCacheType(cacheType string) bool {
+	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
+		// gpt-oss uses attention with sinks which does not support quantized cache types
+		if cacheType != "f16" {
+			// Warn only when the request is actually rejected, so that a
+			// valid f16 request does not produce a misleading warning.
+			slog.Warn("model only supports non-quantized cache types", "model", arch)
+			return false
+		}
+		return true
+	}
 	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }