mirror of https://github.com/ollama/ollama.git (synced 2025-08-25 04:51:20 +02:00)

int: add performance integration tests (#11173)
Usage example:

    go test --tags=integration,perf -count 1 ./integration -v -timeout 1h -run TestModelsPerf 2>&1 | tee int.log
    cat int.log | grep MODEL_PERF_HEADER | cut -f2- -d: > perf.csv
    cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
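For reference, the grep/cut pipeline above produces a perf.csv whose header row is the MODEL_PERF_HEADER line printed by the new test (see integration/model_perf_test.go below), followed by one MODEL_PERF_DATA row per model/context combination:

    MODEL,CONTEXT,GPU PERCENT,PROMPT COUNT,LOAD TIME,PROMPT EVAL TPS,EVAL TPS

LOAD TIME is reported in seconds and the TPS columns are tokens per second (durations divided by 1e9 in the test).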
@@ -19,37 +19,6 @@ import (
 	"github.com/ollama/ollama/format"
 )
 
-var (
-	started    = time.Now()
-	chatModels = []string{
-		"granite3-moe:latest",
-		"granite-code:latest",
-		"nemotron-mini:latest",
-		"command-r:latest",
-		"gemma2:latest",
-		"gemma:latest",
-		"internlm2:latest",
-		"phi3.5:latest",
-		"phi3:latest",
-		// "phi:latest", // flaky, sometimes generates no response on first query
-		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
-		"falcon:latest",
-		"falcon2:latest",
-		"minicpm-v:latest",
-		"mistral:latest",
-		"orca-mini:latest",
-		"llama2:latest",
-		"llama3.1:latest",
-		"llama3.2:latest",
-		"llama3.2-vision:latest",
-		"qwen2.5-coder:latest",
-		"qwen:latest",
-		"solar-pro:latest",
-		"codellama:latest",
-		"nous-hermes:latest",
-	}
-)
-
 func TestModelsGenerate(t *testing.T) {
 	softTimeout, hardTimeout := getTimeouts(t)
 	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
@@ -70,6 +39,13 @@ func TestModelsGenerate(t *testing.T) {
 		slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
 	}
 
+	var chatModels []string
+	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
+		chatModels = ollamaEngineChatModels
+	} else {
+		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
+	}
+
 	for _, model := range chatModels {
 		t.Run(model, func(t *testing.T) {
 			if time.Now().Sub(started) > softTimeout {
integration/model_perf_test.go (new file, 266 lines)
@@ -0,0 +1,266 @@
//go:build integration && perf

package integration

import (
	"context"
	"fmt"
	"io/ioutil"
	"log/slog"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
)

var (
	// Models that don't work reliably with the large context prompt in this test case
	longContextFlakes = []string{
		"granite-code:latest",
		"nemotron-mini:latest",
		"falcon:latest",  // 2k model
		"falcon2:latest", // 2k model
		"minicpm-v:latest",
		"qwen:latest",
		"solar-pro:latest",
	}
)

// Note: this test case can take a long time to run, particularly on models with
// large contexts. Run with -timeout set to a large value to get reasonable coverage.
// Example usage:
//
//	go test --tags=integration,perf -count 1 ./integration -v -timeout 90m -run TestModelsPerf 2>&1 | tee int.log
//	cat int.log | grep MODEL_PERF_HEADER | head -1 | cut -f2- -d: > perf.csv
//	cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
func TestModelsPerf(t *testing.T) {
	softTimeout, hardTimeout := getTimeouts(t)
	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// TODO use info API eventually
	var maxVram uint64
	var err error
	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
		maxVram, err = strconv.ParseUint(s, 10, 64)
		if err != nil {
			t.Fatalf("invalid OLLAMA_MAX_VRAM %v", err)
		}
	} else {
		slog.Warn("No VRAM info available, testing all models, so larger ones might timeout...")
	}

	data, err := ioutil.ReadFile(filepath.Join("testdata", "shakespeare.txt"))
	if err != nil {
		t.Fatalf("failed to open test data file: %s", err)
	}
	longPrompt := "summarize the following: " + string(data)

	var chatModels []string
	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
		chatModels = ollamaEngineChatModels
	} else {
		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
	}

	for _, model := range chatModels {
		t.Run(model, func(t *testing.T) {
			if time.Now().Sub(started) > softTimeout {
				t.Skip("skipping remaining tests to avoid excessive runtime")
			}
			if err := PullIfMissing(ctx, client, model); err != nil {
				t.Fatalf("pull failed %s", err)
			}
			var maxContext int

			resp, err := client.Show(ctx, &api.ShowRequest{Model: model})
			if err != nil {
				t.Fatalf("show failed: %s", err)
			}
			arch := resp.ModelInfo["general.architecture"].(string)
			maxContext = int(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))

			if maxVram > 0 {
				resp, err := client.List(ctx)
				if err != nil {
					t.Fatalf("list models failed %v", err)
				}
				for _, m := range resp.Models {
					// For these tests we want to exercise some amount of overflow on the CPU
					if m.Name == model && float32(m.Size)*0.75 > float32(maxVram) {
						t.Skipf("model %s is too large %s for available VRAM %s", model, format.HumanBytes(m.Size), format.HumanBytes(int64(maxVram)))
					}
				}
			}
			slog.Info("scenario", "model", model, "max_context", maxContext)
			loaded := false
			defer func() {
				// best effort unload once we're done with the model
				if loaded {
					client.Generate(ctx, &api.GenerateRequest{Model: model, KeepAlive: &api.Duration{Duration: 0}}, func(rsp api.GenerateResponse) error { return nil })
				}
			}()

			// Some models don't handle the long context data well so skip them to avoid flaky test results
			longContextFlake := false
			for _, flake := range longContextFlakes {
				if model == flake {
					longContextFlake = true
					break
				}
			}

			// iterate through a few context sizes for coverage without excessive runtime
			var contexts []int
			keepGoing := true
			if maxContext > 16384 {
				contexts = []int{4096, 8192, 16384, maxContext}
			} else if maxContext > 8192 {
				contexts = []int{4096, 8192, maxContext}
			} else if maxContext > 4096 {
				contexts = []int{4096, maxContext}
			} else if maxContext > 0 {
				contexts = []int{maxContext}
			} else {
				t.Fatal("unknown max context size")
			}
			for _, numCtx := range contexts {
				if !keepGoing && numCtx > 8192 { // Always try up to 8k before bailing out
					break
				}
				skipLongPrompt := false

				// Workaround bug 11172 temporarily...
				maxPrompt := longPrompt
				// If we fill the context too full with the prompt, many models
				// quickly hit context shifting and go bad.
				if len(maxPrompt) > numCtx*2 { // typically yields ~1/2 full context
					maxPrompt = maxPrompt[:numCtx*2]
				}

				testCases := []struct {
					prompt  string
					anyResp []string
				}{
					{"why is the sky blue?", []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}},
					{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy"}},
				}
				var gpuPercent int
				for _, tc := range testCases {
					if len(tc.prompt) > 100 && (longContextFlake || skipLongPrompt) {
						slog.Info("skipping long prompt", "model", model, "num_ctx", numCtx, "gpu_percent", gpuPercent)
						continue
					}
					req := api.GenerateRequest{
						Model:     model,
						Prompt:    tc.prompt,
						KeepAlive: &api.Duration{Duration: 20 * time.Second}, // long enough to ensure a ps returns
						Options: map[string]interface{}{
							"temperature": 0,
							"seed":        123,
							"num_ctx":     numCtx,
						},
					}
					atLeastOne := false
					var resp api.GenerateResponse

					stream := false
					req.Stream = &stream

					// Avoid potentially getting stuck indefinitely
					limit := 5 * time.Minute
					genCtx, cancel := context.WithDeadlineCause(
						ctx,
						time.Now().Add(limit),
						fmt.Errorf("generate on model %s with ctx %d took longer than %v", model, numCtx, limit),
					)
					defer cancel()

					err = client.Generate(genCtx, &req, func(rsp api.GenerateResponse) error {
						resp = rsp
						return nil
					})
					if err != nil {
						// Avoid excessive test runs, but don't consider a failure with massive context
						if numCtx > 16384 && strings.Contains(err.Error(), "took longer") {
							slog.Warn("max context was taking too long, skipping", "error", err)
							keepGoing = false
							skipLongPrompt = true
							continue
						}
						t.Fatalf("generate error: ctx:%d err:%s", numCtx, err)
					}
					loaded = true
					for _, expResp := range tc.anyResp {
						if strings.Contains(strings.ToLower(resp.Response), expResp) {
							atLeastOne = true
							break
						}
					}
					if !atLeastOne {
						t.Fatalf("response didn't contain expected values: ctx:%d expected:%v response:%s ", numCtx, tc.anyResp, resp.Response)
					}
					models, err := client.ListRunning(ctx)
					if err != nil {
						slog.Warn("failed to list running models", "error", err)
						continue
					}
					if len(models.Models) > 1 {
						slog.Warn("multiple models loaded, may impact performance results", "loaded", models.Models)
					}
					for _, m := range models.Models {
						if m.Name == model {
							if m.SizeVRAM == 0 {
								slog.Info("Model fully loaded into CPU")
								gpuPercent = 0
								keepGoing = false
								skipLongPrompt = true
							} else if m.SizeVRAM == m.Size {
								slog.Info("Model fully loaded into GPU")
								gpuPercent = 100
							} else {
								sizeCPU := m.Size - m.SizeVRAM
								cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
								gpuPercent = int(100 - cpuPercent)
								slog.Info("Model split between CPU/GPU", "CPU", cpuPercent, "GPU", gpuPercent)
								keepGoing = false

								// Heuristic to avoid excessive test run time
								if gpuPercent < 90 {
									skipLongPrompt = true
								}
							}
						}
					}
					fmt.Fprintf(os.Stderr, "MODEL_PERF_HEADER:%s,%s,%s,%s,%s,%s,%s\n",
						"MODEL",
						"CONTEXT",
						"GPU PERCENT",
						"PROMPT COUNT",
						"LOAD TIME",
						"PROMPT EVAL TPS",
						"EVAL TPS",
					)
					fmt.Fprintf(os.Stderr, "MODEL_PERF_DATA:%s,%d,%d,%d,%0.2f,%0.2f,%0.2f\n",
						model,
						numCtx,
						gpuPercent,
						resp.PromptEvalCount,
						float64(resp.LoadDuration)/1000000000.0,
						float64(resp.PromptEvalCount)/(float64(resp.PromptEvalDuration)/1000000000.0),
						float64(resp.EvalCount)/(float64(resp.EvalDuration)/1000000000.0),
					)
				}
			}
		})
	}
}
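As a reading aid, here is a minimal standalone sketch of the CPU/GPU split arithmetic used in TestModelsPerf above; the sizes are hypothetical, not taken from a real run.

    package main

    import (
    	"fmt"
    	"math"
    )

    func main() {
    	// Hypothetical sizes (not from a real run): an 8 GiB model with 6 GiB resident in VRAM.
    	var size, sizeVRAM int64 = 8 << 30, 6 << 30

    	// Same arithmetic as the partial-offload branch in TestModelsPerf.
    	sizeCPU := size - sizeVRAM
    	cpuPercent := math.Round(float64(sizeCPU) / float64(size) * 100) // 25
    	gpuPercent := int(100 - cpuPercent)                              // 75

    	// Below 90% GPU the test skips the long prompt to keep run time reasonable.
    	skipLongPrompt := gpuPercent < 90
    	fmt.Println(gpuPercent, skipLongPrompt) // prints: 75 true
    }

In the test, any CPU/GPU split (and a fully CPU-resident model) also sets keepGoing = false, so context sizes beyond 8192 are not attempted for that model.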
integration/testdata/shakespeare.txt (new vendored file, 124456 lines)
File diff suppressed because it is too large
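The vendored Shakespeare text feeds the long-prompt case in TestModelsPerf above: the prompt is "summarize the following: " plus the file contents, capped at numCtx*2 characters. On the rough assumption of about four characters per token, that cap keeps the prompt near half the context window (for example, at num_ctx 8192 the prompt is at most 16384 characters, roughly 4k tokens), which is what the "typically yields ~1/2 full context" comment refers to.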
@@ -32,6 +32,48 @@ const (
 	smol = "llama3.2:1b"
 )
+
+var (
+	started = time.Now()
+
+	// Note: add newer models at the top of the list to test them first
+	ollamaEngineChatModels = []string{
+		"gemma3n:e2b",
+		"mistral-small3.2:latest",
+		"deepseek-r1:1.5b",
+		"llama3.2-vision:latest",
+		"qwen2.5-coder:latest",
+		"qwen2.5vl:3b",
+		"qwen3:0.6b", // dense
+		"qwen3:30b",  // MOE
+		"gemma3:1b",
+		"llama3.1:latest",
+		"llama3.2:latest",
+		"gemma2:latest",
+		"minicpm-v:latest",    // arch=qwen2
+		"granite-code:latest", // arch=llama
+	}
+	llamaRunnerChatModels = []string{
+		"mistral:latest",
+		"falcon3:latest",
+		"granite3-moe:latest",
+		"command-r:latest",
+		"nemotron-mini:latest",
+		"phi3.5:latest",
+		"solar-pro:latest",
+		"internlm2:latest",
+		"codellama:latest", // arch=llama
+		"phi3:latest",
+		"falcon2:latest",
+		"gemma:latest",
+		"llama2:latest",
+		"nous-hermes:latest",
+		"orca-mini:latest",
+		"qwen:latest",
+		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
+		"falcon:latest",
+	}
+)
 
 func Init() {
 	lifecycle.InitLogging()
 }
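With the lists above, setting OLLAMA_NEW_ENGINE to any non-empty value restricts TestModelsGenerate and TestModelsPerf to ollamaEngineChatModels; leaving it unset appends llamaRunnerChatModels as well. A possible invocation limited to the new-engine models (the flags mirror the commit message; the env var handling is shown in the hunks above):

    OLLAMA_NEW_ENGINE=1 go test --tags=integration,perf -count 1 ./integration -v -timeout 90m -run TestModelsPerf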