mirror of https://github.com/ollama/ollama.git (synced 2025-08-25 04:51:20 +02:00)

int: add performance integration tests (#11173)
Usage example:

    go test --tags=integration,perf -count 1 ./integration -v -timeout 1h -run TestModelsPerf 2>&1 | tee int.log
    cat int.log | grep MODEL_PERF_HEADER | cut -f2- -d: > perf.csv
    cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
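For reference, the grep/cut pipeline above produces a perf.csv whose header row is the MODEL_PERF_HEADER line printed by the new test (see integration/model_perf_test.go below), followed by one MODEL_PERF_DATA row per model/context combination:

    MODEL,CONTEXT,GPU PERCENT,PROMPT COUNT,LOAD TIME,PROMPT EVAL TPS,EVAL TPS

LOAD TIME is reported in seconds and the TPS columns are tokens per second (durations divided by 1e9 in the test).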
@@ -19,37 +19,6 @@ import (
 	"github.com/ollama/ollama/format"
 )
 
-var (
-	started    = time.Now()
-	chatModels = []string{
-		"granite3-moe:latest",
-		"granite-code:latest",
-		"nemotron-mini:latest",
-		"command-r:latest",
-		"gemma2:latest",
-		"gemma:latest",
-		"internlm2:latest",
-		"phi3.5:latest",
-		"phi3:latest",
-		// "phi:latest", // flaky, sometimes generates no response on first query
-		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
-		"falcon:latest",
-		"falcon2:latest",
-		"minicpm-v:latest",
-		"mistral:latest",
-		"orca-mini:latest",
-		"llama2:latest",
-		"llama3.1:latest",
-		"llama3.2:latest",
-		"llama3.2-vision:latest",
-		"qwen2.5-coder:latest",
-		"qwen:latest",
-		"solar-pro:latest",
-		"codellama:latest",
-		"nous-hermes:latest",
-	}
-)
-
 func TestModelsGenerate(t *testing.T) {
 	softTimeout, hardTimeout := getTimeouts(t)
 	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
@@ -70,6 +39,13 @@ func TestModelsGenerate(t *testing.T) {
 		slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
 	}
 
+	var chatModels []string
+	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
+		chatModels = ollamaEngineChatModels
+	} else {
+		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
+	}
+
 	for _, model := range chatModels {
 		t.Run(model, func(t *testing.T) {
 			if time.Now().Sub(started) > softTimeout {
integration/model_perf_test.go (new file, 266 lines)
@@ -0,0 +1,266 @@
//go:build integration && perf

package integration

import (
	"context"
	"fmt"
	"io/ioutil"
	"log/slog"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
)

var (
	// Models that don't work reliably with the large context prompt in this test case
	longContextFlakes = []string{
		"granite-code:latest",
		"nemotron-mini:latest",
		"falcon:latest",  // 2k model
		"falcon2:latest", // 2k model
		"minicpm-v:latest",
		"qwen:latest",
		"solar-pro:latest",
	}
)

// Note: this test case can take a long time to run, particularly on models with
// large contexts. Run with -timeout set to a large value to get reasonable coverage.
// Example usage:
//
//	go test --tags=integration,perf -count 1 ./integration -v -timeout 90m -run TestModelsPerf 2>&1 | tee int.log
//	cat int.log | grep MODEL_PERF_HEADER | head -1 | cut -f2- -d: > perf.csv
//	cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
func TestModelsPerf(t *testing.T) {
	softTimeout, hardTimeout := getTimeouts(t)
	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// TODO use info API eventually
	var maxVram uint64
	var err error
	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
		maxVram, err = strconv.ParseUint(s, 10, 64)
		if err != nil {
			t.Fatalf("invalid OLLAMA_MAX_VRAM %v", err)
		}
	} else {
		slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
	}

	data, err := ioutil.ReadFile(filepath.Join("testdata", "shakespeare.txt"))
	if err != nil {
		t.Fatalf("failed to open test data file: %s", err)
	}
	longPrompt := "summarize the following: " + string(data)

	var chatModels []string
	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
		chatModels = ollamaEngineChatModels
	} else {
		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
	}

	for _, model := range chatModels {
		t.Run(model, func(t *testing.T) {
			if time.Now().Sub(started) > softTimeout {
				t.Skip("skipping remaining tests to avoid excessive runtime")
			}
			if err := PullIfMissing(ctx, client, model); err != nil {
				t.Fatalf("pull failed %s", err)
			}
			var maxContext int

			resp, err := client.Show(ctx, &api.ShowRequest{Model: model})
			if err != nil {
				t.Fatalf("show failed: %s", err)
			}
			arch := resp.ModelInfo["general.architecture"].(string)
			maxContext = int(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))

			if maxVram > 0 {
				resp, err := client.List(ctx)
				if err != nil {
					t.Fatalf("list models failed %v", err)
				}
				for _, m := range resp.Models {
					// For these tests we want to exercise some amount of overflow on the CPU
					if m.Name == model && float32(m.Size)*0.75 > float32(maxVram) {
						t.Skipf("model %s is too large %s for available VRAM %s", model, format.HumanBytes(m.Size), format.HumanBytes(int64(maxVram)))
					}
				}
			}
			slog.Info("scenario", "model", model, "max_context", maxContext)
			loaded := false
			defer func() {
				// best effort unload once we're done with the model
				if loaded {
					client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
				}
			}()

			// Some models don't handle the long context data well so skip them to avoid flaky test results
			longContextFlake := false
			for _, flake := range longContextFlakes {
				if model == flake {
					longContextFlake = true
					break
				}
			}

			// iterate through a few context sizes for coverage without excessive runtime
			var contexts []int
			keepGoing := true
			if maxContext > 16384 {
				contexts = []int{4096, 8192, 16384, maxContext}
			} else if maxContext > 8192 {
				contexts = []int{4096, 8192, maxContext}
			} else if maxContext > 4096 {
				contexts = []int{4096, maxContext}
			} else if maxContext > 0 {
				contexts = []int{maxContext}
			} else {
				t.Fatal("unknown max context size")
			}
			for _, numCtx := range contexts {
				if !keepGoing && numCtx > 8192 { // Always try up to 8k before bailing out
					break
				}
				skipLongPrompt := false

				// Workaround bug 11172 temporarily...
				maxPrompt := longPrompt
				// If we fill the context too full with the prompt, many models
				// quickly hit context shifting and go bad.
				if len(maxPrompt) > numCtx*2 { // typically yields ~1/2 full context
					maxPrompt = maxPrompt[:numCtx*2]
				}

				testCases := []struct {
					prompt  string
					anyResp []string
				}{
					{"why is the sky blue?", []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}},
					{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy"}},
				}
				var gpuPercent int
				for _, tc := range testCases {
					if len(tc.prompt) > 100 && (longContextFlake || skipLongPrompt) {
						slog.Info("skipping long prompt", "model", model, "num_ctx", numCtx, "gpu_percent", gpuPercent)
						continue
					}
					req := api.GenerateRequest{
						Model:     model,
						Prompt:    tc.prompt,
						KeepAlive: &api.Duration{Duration: 20 * time.Second}, // long enough to ensure a ps returns
						Options: map[string]interface{}{
							"temperature": 0,
							"seed":        123,
							"num_ctx":     numCtx,
						},
					}
					atLeastOne := false
					var resp api.GenerateResponse

					stream := false
					req.Stream = &stream

					// Avoid potentially getting stuck indefinitely
					limit := 5 * time.Minute
					genCtx, cancel := context.WithDeadlineCause(
						ctx,
						time.Now().Add(limit),
						fmt.Errorf("generate on model %s with ctx %d took longer than %v", model, numCtx, limit),
					)
					defer cancel()

					err = client.Generate(genCtx, &req, func(rsp api.GenerateResponse) error {
						resp = rsp
						return nil
					})
					if err != nil {
						// Avoid excessive test runs, but don't consider a failure with massive context
						if numCtx > 16384 && strings.Contains(err.Error(), "took longer") {
							slog.Warn("max context was taking too long, skipping", "error", err)
							keepGoing = false
							skipLongPrompt = true
							continue
						}
						t.Fatalf("generate error: ctx:%d err:%s", numCtx, err)
					}
					loaded = true
					for _, expResp := range tc.anyResp {
						if strings.Contains(strings.ToLower(resp.Response), expResp) {
							atLeastOne = true
							break
						}
					}
					if !atLeastOne {
						t.Fatalf("response didn't contain expected values: ctx:%d expected:%v response:%s ", numCtx, tc.anyResp, resp.Response)
					}
					models, err := client.ListRunning(ctx)
					if err != nil {
						slog.Warn("failed to list running models", "error", err)
						continue
					}
					if len(models.Models) > 1 {
						slog.Warn("multiple models loaded, may impact performance results", "loaded", models.Models)
					}
					for _, m := range models.Models {
						if m.Name == model {
							if m.SizeVRAM == 0 {
								slog.Info("Model fully loaded into CPU")
								gpuPercent = 0
								keepGoing = false
								skipLongPrompt = true
							} else if m.SizeVRAM == m.Size {
								slog.Info("Model fully loaded into GPU")
								gpuPercent = 100
							} else {
								sizeCPU := m.Size - m.SizeVRAM
								cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
								gpuPercent = int(100 - cpuPercent)
								slog.Info("Model split between CPU/GPU", "CPU", cpuPercent, "GPU", gpuPercent)
								keepGoing = false

								// Heuristic to avoid excessive test run time
								if gpuPercent < 90 {
									skipLongPrompt = true
								}
							}
						}
					}
					fmt.Fprintf(os.Stderr, "MODEL_PERF_HEADER:%s,%s,%s,%s,%s,%s,%s\n",
						"MODEL",
						"CONTEXT",
						"GPU PERCENT",
						"PROMPT COUNT",
						"LOAD TIME",
						"PROMPT EVAL TPS",
						"EVAL TPS",
					)
					fmt.Fprintf(os.Stderr, "MODEL_PERF_DATA:%s,%d,%d,%d,%0.2f,%0.2f,%0.2f\n",
						model,
						numCtx,
						gpuPercent,
						resp.PromptEvalCount,
						float64(resp.LoadDuration)/1000000000.0,
						float64(resp.PromptEvalCount)/(float64(resp.PromptEvalDuration)/1000000000.0),
						float64(resp.EvalCount)/(float64(resp.EvalDuration)/1000000000.0),
					)
				}
			}
		})
	}
}
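As a reading aid, here is a minimal standalone sketch of the CPU/GPU split arithmetic used in TestModelsPerf above; the sizes are hypothetical, not taken from a real run.

    package main

    import (
    	"fmt"
    	"math"
    )

    func main() {
    	// Hypothetical sizes (not from a real run): an 8 GiB model with 6 GiB resident in VRAM.
    	var size, sizeVRAM int64 = 8 << 30, 6 << 30

    	// Same arithmetic as the partial-offload branch in TestModelsPerf.
    	sizeCPU := size - sizeVRAM
    	cpuPercent := math.Round(float64(sizeCPU) / float64(size) * 100) // 25
    	gpuPercent := int(100 - cpuPercent)                              // 75

    	// Below 90% GPU the test skips the long prompt to keep run time reasonable.
    	skipLongPrompt := gpuPercent < 90
    	fmt.Println(gpuPercent, skipLongPrompt) // prints: 75 true
    }

In the test, any CPU/GPU split (and a fully CPU-resident model) also sets keepGoing = false, so context sizes beyond 8192 are not attempted for that model.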
integration/testdata/shakespeare.txt (new vendored file, 124456 lines)
File diff suppressed because it is too large
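The vendored Shakespeare text feeds the long-prompt case in TestModelsPerf above: the prompt is "summarize the following: " plus the file contents, capped at numCtx*2 characters. On the rough assumption of about four characters per token, that cap keeps the prompt near half the context window (for example, at num_ctx 8192 the prompt is at most 16384 characters, roughly 4k tokens), which is what the "typically yields ~1/2 full context" comment refers to.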
@@ -32,6 +32,48 @@ const (
 	smol = "llama3.2:1b"
 )
+
+var (
+	started = time.Now()
+
+	// Note: add newer models at the top of the list to test them first
+	ollamaEngineChatModels = []string{
+		"gemma3n:e2b",
+		"mistral-small3.2:latest",
+		"deepseek-r1:1.5b",
+		"llama3.2-vision:latest",
+		"qwen2.5-coder:latest",
+		"qwen2.5vl:3b",
+		"qwen3:0.6b", // dense
+		"qwen3:30b",  // MOE
+		"gemma3:1b",
+		"llama3.1:latest",
+		"llama3.2:latest",
+		"gemma2:latest",
+		"minicpm-v:latest",    // arch=qwen2
+		"granite-code:latest", // arch=llama
+	}
+	llamaRunnerChatModels = []string{
+		"mistral:latest",
+		"falcon3:latest",
+		"granite3-moe:latest",
+		"command-r:latest",
+		"nemotron-mini:latest",
+		"phi3.5:latest",
+		"solar-pro:latest",
+		"internlm2:latest",
+		"codellama:latest", // arch=llama
+		"phi3:latest",
+		"falcon2:latest",
+		"gemma:latest",
+		"llama2:latest",
+		"nous-hermes:latest",
+		"orca-mini:latest",
+		"qwen:latest",
+		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
+		"falcon:latest",
+	}
+)
 
 func Init() {
 	lifecycle.InitLogging()
 }
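With the lists above, setting OLLAMA_NEW_ENGINE to any non-empty value restricts TestModelsGenerate and TestModelsPerf to ollamaEngineChatModels; leaving it unset appends llamaRunnerChatModels as well. A possible invocation limited to the new-engine models (the flags mirror the commit message; the env var handling is shown in the hunks above):

    OLLAMA_NEW_ENGINE=1 go test --tags=integration,perf -count 1 ./integration -v -timeout 90m -run TestModelsPerf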