mirror of
https://github.com/ollama/ollama.git
synced 2025-08-25 22:31:14 +02:00
gpt-oss: disable quantized kv cache (#11929)
This commit is contained in:
@@ -752,6 +752,11 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
|
|||||||
|
|
||||||
// SupportsKVCacheType checks if the requested cache type is supported
|
// SupportsKVCacheType checks if the requested cache type is supported
|
||||||
func (f GGML) SupportsKVCacheType(cacheType string) bool {
|
func (f GGML) SupportsKVCacheType(cacheType string) bool {
|
||||||
|
if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
|
||||||
|
// gpt-oss uses attention with sinks which does not support quantized cache types
|
||||||
|
slog.Warn("model only supports non-quantized cache types ", "mode", arch)
|
||||||
|
return cacheType == "f16"
|
||||||
|
}
|
||||||
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
|
return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user