mirror of
https://github.com/ollama/ollama.git
synced 2025-03-18 05:41:43 +01:00
ml: Add support for quantized KV cache
Similar to the llama engine, quantizing the KV cache requires flash attention to be enabled through the Ollama server.
This commit is contained in:
parent
f52b2615ef
commit
4100ed7bdd
@ -215,7 +215,7 @@ func Dump(ctx Context, t Tensor, opts ...DumpOptions) string {
|
||||
return dump[[]float32](ctx, t, opts[0].Items, func(f float32) string {
|
||||
return strconv.FormatFloat(float64(f), 'f', opts[0].Precision, 32)
|
||||
})
|
||||
case DTypeF16:
|
||||
case DTypeF16, DTypeQ80, DTypeQ40:
|
||||
f32 := ctx.Empty(DTypeF32, t.Shape()...)
|
||||
f32 = t.Copy(ctx, f32)
|
||||
return dump[[]float32](ctx, f32, opts[0].Items, func(f float32) string {
|
||||
@ -283,5 +283,7 @@ const (
|
||||
DTypeOther DType = iota
|
||||
DTypeF32
|
||||
DTypeF16
|
||||
DTypeQ80
|
||||
DTypeQ40
|
||||
DTypeI32
|
||||
)
|
||||
|
@ -535,6 +535,10 @@ func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
|
||||
cdtype = C.GGML_TYPE_F32
|
||||
case ml.DTypeF16:
|
||||
cdtype = C.GGML_TYPE_F16
|
||||
case ml.DTypeQ80:
|
||||
cdtype = C.GGML_TYPE_Q8_0
|
||||
case ml.DTypeQ40:
|
||||
cdtype = C.GGML_TYPE_Q4_0
|
||||
case ml.DTypeI32:
|
||||
cdtype = C.GGML_TYPE_I32
|
||||
default:
|
||||
@ -680,6 +684,10 @@ func (t *Tensor) DType() ml.DType {
|
||||
return ml.DTypeF32
|
||||
case C.GGML_TYPE_F16:
|
||||
return ml.DTypeF16
|
||||
case C.GGML_TYPE_Q8_0:
|
||||
return ml.DTypeQ80
|
||||
case C.GGML_TYPE_Q4_0:
|
||||
return ml.DTypeQ40
|
||||
case C.GGML_TYPE_I32:
|
||||
return ml.DTypeI32
|
||||
default:
|
||||
|
@ -58,9 +58,9 @@ func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots
|
||||
func kvCacheTypeFromStr(s string) ml.DType {
|
||||
switch s {
|
||||
case "q8_0":
|
||||
panic("kv cache quantization not yet implemented")
|
||||
return ml.DTypeQ80
|
||||
case "q4_0":
|
||||
panic("kv cache quantization not yet implemented")
|
||||
return ml.DTypeQ40
|
||||
default:
|
||||
return ml.DTypeF16
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user