diff --git a/cmd/cmd.go b/cmd/cmd.go index 01eb66f9b..b863264f5 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1445,6 +1445,7 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_SCHED_SPREAD"], envVars["OLLAMA_TMPDIR"], envVars["OLLAMA_FLASH_ATTENTION"], + envVars["OLLAMA_KV_CACHE_TYPE"], envVars["OLLAMA_LLM_LIBRARY"], envVars["OLLAMA_GPU_OVERHEAD"], envVars["OLLAMA_LOAD_TIMEOUT"], diff --git a/discover/types.go b/discover/types.go index 19f215247..3112d003e 100644 --- a/discover/types.go +++ b/discover/types.go @@ -183,3 +183,17 @@ func (si SystemInfo) GetOptimalThreadCount() int { return coreCount } + +// FlashAttentionSupported reports whether every GPU in the list supports flash attention +func (l GpuInfoList) FlashAttentionSupported() bool { + for _, gpu := range l { + supportsFA := gpu.Library == "metal" || + (gpu.Library == "cuda" && gpu.DriverMajor >= 7) || + gpu.Library == "rocm" + + if !supportsFA { + return false + } + } + return true +} diff --git a/docs/faq.md b/docs/faq.md index 0dbbb3ff4..387d752b2 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -151,7 +151,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx: -``` +```nginx server { listen 80; server_name example.com; # Replace with your domain or IP @@ -285,4 +285,28 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit ## How does Ollama load models on multiple GPUs? -Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. 
This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. +When loading a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. + +## How can I enable Flash Attention? + +Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows. To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server. + +## How can I set the quantization type for the K/V cache? + +The K/V context cache can be quantized to significantly reduce memory usage when Flash Attention is enabled. + +To use quantized K/V cache with Ollama you can set the following environment variable: + +- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`. + +> Note: Currently this is a global option - meaning all models will run with the specified quantization type. + +The currently available K/V cache quantization types are: + +- `f16` - high precision and memory usage (default). +- `q8_0` - 8-bit quantization, uses approximately 1/2 the memory of `f16` with a very small loss in precision, this usually has no noticeable impact on the model's quality (recommended if not using f16). +- `q4_0` - 4-bit quantization, uses approximately 1/4 the memory of `f16` with a small-medium loss in precision that may be more noticeable at higher context sizes. 
+ +How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count. + +You may need to experiment with different quantization types to find the best balance between memory usage and quality. diff --git a/envconfig/config.go b/envconfig/config.go index e80c67ba3..29c7fa4ff 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -153,6 +153,8 @@ var ( Debug = Bool("OLLAMA_DEBUG") // FlashAttention enables the experimental flash attention feature. FlashAttention = Bool("OLLAMA_FLASH_ATTENTION") + // KvCacheType is the quantization type for the K/V cache. + KvCacheType = String("OLLAMA_KV_CACHE_TYPE") // NoHistory disables readline history. NoHistory = Bool("OLLAMA_NOHISTORY") // NoPrune disables pruning of model blobs on startup. @@ -234,6 +236,7 @@ func AsMap() map[string]EnvVar { ret := map[string]EnvVar{ "OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. 
OLLAMA_DEBUG=1)"}, "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"}, + "OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"}, "OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"}, "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"}, diff --git a/llama/llama.go b/llama/llama.go index 468540f5b..24fa75274 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -140,7 +140,7 @@ type ContextParams struct { c C.struct_llama_context_params } -func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool) ContextParams { +func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams { params := C.llama_context_default_params() params.n_ctx = C.uint(numCtx) params.n_batch = C.uint(batchSize) @@ -149,9 +149,28 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla params.n_threads_batch = params.n_threads params.embeddings = C.bool(true) params.flash_attn = C.bool(flashAttention) + params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType)) + params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType)) + return ContextParams{c: params} } +// kvCacheTypeFromStr converts a string cache type to the corresponding GGML type value +func kvCacheTypeFromStr(s string) C.enum_ggml_type { + if s == "" { + return C.GGML_TYPE_F16 + } + + switch s { + case "q8_0": + return C.GGML_TYPE_Q8_0 + case "q4_0": + return C.GGML_TYPE_Q4_0 + default: + return C.GGML_TYPE_F16 + } +} + type Context struct { c *C.struct_llama_context numThreads int diff --git a/llama/runner/runner.go b/llama/runner/runner.go index 
9b1534e48..660e8cffa 100644 --- a/llama/runner/runner.go +++ b/llama/runner/runner.go @@ -850,6 +850,7 @@ func (s *Server) loadModel( lpath multiLPath, ppath string, kvSize int, + kvCacheType string, flashAttention bool, threads int, multiUserCache bool, @@ -862,7 +863,7 @@ func (s *Server) loadModel( panic(err) } - ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention) + ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, kvCacheType) s.lc, err = llama.NewContextWithModel(s.model, ctxParams) if err != nil { panic(err) @@ -903,6 +904,7 @@ func main() { mainGpu := flag.Int("main-gpu", 0, "Main GPU") flashAttention := flag.Bool("flash-attn", false, "Enable flash attention") kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size") + kvCacheType := flag.String("kv-cache-type", "", "quantization type for KV cache (default: f16)") port := flag.Int("port", 8080, "Port to expose the server on") threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation") verbose := flag.Bool("verbose", false, "verbose output (default: disabled)") @@ -970,7 +972,7 @@ func main() { } server.ready.Add(1) - go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache) + go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache) server.cond = sync.NewCond(&server.mu) diff --git a/llm/ggml.go b/llm/ggml.go index 47ec24a1c..2710f7b75 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -360,7 +360,7 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { }, offset, nil } -func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffload uint64) { +func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) { embedding := llm.KV().EmbeddingLength() heads := 
llm.KV().HeadCount() headsKV := llm.KV().HeadCountKV() @@ -372,7 +372,8 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa layers := llm.Tensors().Layers() - kv = 2 * context * llm.KV().BlockCount() * (embeddingHeadsK + embeddingHeadsV) * headsKV + bytesPerElement := kvCacheBytesPerElement(kvCacheType) + kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement) switch llm.KV().Architecture() { case "llama": @@ -527,3 +528,34 @@ func (llm GGML) GraphSize(context, batch uint64) (kv, partialOffload, fullOffloa return } + +// SupportsKVCacheType checks if the requested cache type is supported +func (ggml GGML) SupportsKVCacheType(cacheType string) bool { + validKVCacheTypes := []string{"f16", "q8_0", "q4_0"} + return slices.Contains(validKVCacheTypes, cacheType) +} + +// SupportsFlashAttention checks if the model supports flash attention +func (ggml GGML) SupportsFlashAttention() bool { + _, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())] + if isEmbedding { + return false + } + + // Check head counts match and are non-zero + headCountK := ggml.KV().EmbeddingHeadCountK() + headCountV := ggml.KV().EmbeddingHeadCountV() + return headCountK != 0 && headCountV != 0 && headCountK == headCountV +} + +// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type +func kvCacheBytesPerElement(cacheType string) float64 { + switch cacheType { + case "q8_0": + return 1 // 1/2 of fp16 + case "q4_0": + return 0.5 // 1/4 of fp16 + default: + return 2 // f16 (default) + } +} diff --git a/llm/memory.go b/llm/memory.go index 521ed16ff..c5d861b6a 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -123,7 +123,23 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, slog.Warn("model missing blk.0 layer size") } - kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), 
uint64(min(opts.NumCtx, opts.NumBatch))) + fa := envconfig.FlashAttention() && + discover.GetGPUInfo().FlashAttentionSupported() && + ggml.SupportsFlashAttention() + + var kvct string + if fa { + requested := envconfig.KvCacheType() + if requested != "" && ggml.SupportsKVCacheType(requested) { + kvct = requested + } + } + + kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct) + + // KV is proportional to the number of layers + layerSize += kv / ggml.KV().BlockCount() + if graphPartialOffload == 0 { graphPartialOffload = ggml.KV().GQA() * kv / 6 } @@ -131,9 +147,6 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, graphFullOffload = graphPartialOffload } - // KV is proportional to the number of layers - layerSize += kv / ggml.KV().BlockCount() - // on metal there's no partial offload overhead if gpus[0].Library == "metal" { graphPartialOffload = graphFullOffload diff --git a/llm/memory_test.go b/llm/memory_test.go index 73e77d908..c4209ded5 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -15,6 +15,7 @@ import ( func TestEstimateGPULayers(t *testing.T) { t.Setenv("OLLAMA_DEBUG", "1") + t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16 modelName := "dummy" f, err := os.CreateTemp(t.TempDir(), modelName) diff --git a/llm/server.go b/llm/server.go index 2afc55629..23caa9a0a 100644 --- a/llm/server.go +++ b/llm/server.go @@ -214,15 +214,36 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter params = append(params, "--threads", strconv.Itoa(defaultThreads)) } - flashAttnEnabled := envconfig.FlashAttention() + fa := envconfig.FlashAttention() + if fa && !gpus.FlashAttentionSupported() { + slog.Warn("flash attention enabled but not supported by gpu") + fa = false + } - for _, g := range gpus { - // only cuda (compute capability 7+) and metal support flash attention - if g.Library != "metal" && (g.Library != "cuda" 
|| g.DriverMajor < 7) { - flashAttnEnabled = false + if fa && !ggml.SupportsFlashAttention() { + slog.Warn("flash attention enabled but not supported by model") + fa = false + } + + kvct := envconfig.KvCacheType() + + if fa { + slog.Info("enabling flash attention") + params = append(params, "--flash-attn") + + // Flash Attention also supports kv cache quantization + // Enable it only if a kv cache type was requested and the model supports it + if kvct != "" && ggml.SupportsKVCacheType(kvct) { + params = append(params, "--kv-cache-type", kvct) + } else if kvct != "" { + slog.Warn("kv cache type not supported by model", "type", kvct) + } + } else if kvct != "" && kvct != "f16" { + slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct) + } - // mmap has issues with partial offloading on metal + // mmap has issues with partial offloading on metal + for _, g := range gpus { if g.Library == "metal" && uint64(opts.NumGPU) > 0 && uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 { @@ -231,10 +252,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter } } - if flashAttnEnabled { - params = append(params, "--flash-attn") - } - // Windows CUDA should not use mmap for best performance // Linux with a model larger than free space, mmap leads to thrashing // For CPU loads we want the memory to be allocated, not FS cache