From f2e9c9aff5f59b21a5d9a9668408732b3de01e20 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Thu, 7 Aug 2025 13:49:26 -0700
Subject: [PATCH] server: Reduce gpt-oss context length for small VRAM GPUs

gpt-oss works best with a context length of at least 8k. However, for
GPUs with limited amount of VRAM, there is a significant performance
hit to this increased context. In these cases, we switch to the Ollama
default of 4k
---
 server/routes.go | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/server/routes.go b/server/routes.go
index 991e92003e..acefea31f7 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -30,6 +30,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
@@ -50,11 +51,16 @@ func experimentEnabled(name string) bool {
 
 var useClient2 = experimentEnabled("client2")
 
+// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
+// reduced context length on some models
+var lowVRAMThreshold uint64 = 20 * format.GibiByte
+
 var mode string = gin.DebugMode
 
 type Server struct {
-	addr  net.Addr
-	sched *Scheduler
+	addr    net.Addr
+	sched   *Scheduler
+	lowVRAM bool
 }
 
 func init() {
@@ -112,8 +118,9 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 		return nil, nil, nil, err
 	}
 
-	// This model requires a minimum context to function effectively
-	if slices.Contains(model.Config.ModelFamilies, "gptoss") {
+	// This model is much more capable with a larger context, so set that
+	// unless it would penalize performance too much
+	if !s.lowVRAM && slices.Contains(model.Config.ModelFamilies, "gptoss") {
 		opts.NumCtx = max(opts.NumCtx, 8192)
 	}
 
@@ -1382,6 +1389,15 @@ func Serve(ln net.Listener) error {
 	gpus := discover.GetGPUInfo()
 	gpus.LogDetails()
 
+	var totalVRAM uint64
+	for _, gpu := range gpus {
+		totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
+	}
+	if totalVRAM < lowVRAMThreshold {
+		s.lowVRAM = true
+		slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
+	}
+
 	err = srvr.Serve(ln)
 	// If server is closed from the signal handler, wait for the ctx to be done
 	// otherwise error out quickly