From f2e9c9aff5f59b21a5d9a9668408732b3de01e20 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Thu, 7 Aug 2025 13:49:26 -0700
Subject: [PATCH] server: Reduce gpt-oss context length for small VRAM GPUs

gpt-oss works best with a context length of at least 8k. However, for
GPUs with limited amount of VRAM, there is a significant performance
hit to this increased context. In these cases, we switch to the Ollama
default of 4k
---
 server/routes.go | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/server/routes.go b/server/routes.go
index 991e92003e..acefea31f7 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -30,6 +30,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
@@ -50,11 +51,16 @@ func experimentEnabled(name string) bool {
 
 var useClient2 = experimentEnabled("client2")
 
+// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
+// reduced context length on some models
+var lowVRAMThreshold uint64 = 20 * format.GibiByte
+
 var mode string = gin.DebugMode
 
 type Server struct {
-	addr  net.Addr
-	sched *Scheduler
+	addr    net.Addr
+	sched   *Scheduler
+	lowVRAM bool
 }
 
 func init() {
@@ -112,8 +118,9 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
 		return nil, nil, nil, err
 	}
 
-	// This model requires a minimum context to function effectively
-	if slices.Contains(model.Config.ModelFamilies, "gptoss") {
+	// This model is much more capable with a larger context, so set that
+	// unless it would penalize performance too much
+	if !s.lowVRAM && slices.Contains(model.Config.ModelFamilies, "gptoss") {
 		opts.NumCtx = max(opts.NumCtx, 8192)
 	}
 
@@ -1382,6 +1389,15 @@ func Serve(ln net.Listener) error {
 	gpus := discover.GetGPUInfo()
 	gpus.LogDetails()
 
+	var totalVRAM uint64
+	for _, gpu := range gpus {
+		totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
+	}
+	if totalVRAM < lowVRAMThreshold {
+		s.lowVRAM = true
+		slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
+	}
+
 	err = srvr.Serve(ln)
 	// If server is closed from the signal handler, wait for the ctx to be done
 	// otherwise error out quickly