diff --git a/integration/README.md b/integration/README.md
index e52ba71ee7..1dfd0e359b 100644
--- a/integration/README.md
+++ b/integration/README.md
@@ -12,3 +12,10 @@ The integration tests have 2 modes of operating.
 
 > [!IMPORTANT]
 > Before running the tests locally without the "test existing" setting, compile ollama from the top of the source tree `go build .` in addition to GPU support with cmake if applicable on your platform. The integration tests expect to find an ollama binary at the top of the tree.
+
+Many tests use a small default model suitable for running on most systems. You can override this default by setting `OLLAMA_TEST_DEFAULT_MODEL` to the name of the model you want the tests to use.
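+For example, assuming the `integration` build tag this suite is normally run with (the model name here is only an illustration):
+
+```shell
+OLLAMA_TEST_DEFAULT_MODEL=llama3.2:1b go test -tags=integration ./integration -run TestBlueSky -v
+```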
be brief", + Content: blueSkyPrompt, }, }, Options: map[string]interface{}{ @@ -189,7 +188,6 @@ func TestAPIChat(t *testing.T) { "seed": 123, }, } - anyResp := []string{"rayleigh", "scattering"} client, _, cleanup := InitServerConnection(ctx, t) defer cleanup() @@ -279,14 +277,14 @@ func TestAPIChat(t *testing.T) { // Verify the response contains the expected data response := buf.String() atLeastOne := false - for _, resp := range anyResp { + for _, resp := range blueSkyExpected { if strings.Contains(strings.ToLower(response), resp) { atLeastOne = true break } } if !atLeastOne { - t.Errorf("none of %v found in %s", anyResp, response) + t.Errorf("none of %v found in %s", blueSkyExpected, response) } case <-ctx.Done(): t.Error("outer test context done while waiting for chat") diff --git a/integration/basic_test.go b/integration/basic_test.go index 60cff172b5..0a6b9253d7 100644 --- a/integration/basic_test.go +++ b/integration/basic_test.go @@ -19,14 +19,14 @@ func TestBlueSky(t *testing.T) { // Set up the test data req := api.GenerateRequest{ Model: smol, - Prompt: "why is the sky blue?", + Prompt: blueSkyPrompt, Stream: &stream, Options: map[string]any{ "temperature": 0, "seed": 123, }, } - GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"}) + GenerateTestHelper(ctx, t, req, blueSkyExpected) } func TestUnicode(t *testing.T) { @@ -110,12 +110,12 @@ func TestUnicodeModelDir(t *testing.T) { req := api.GenerateRequest{ Model: smol, - Prompt: "why is the sky blue?", + Prompt: blueSkyPrompt, Stream: &stream, Options: map[string]any{ "temperature": 0, "seed": 123, }, } - GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"}) + GenerateTestHelper(ctx, t, req, blueSkyExpected) } diff --git a/integration/context_test.go b/integration/context_test.go index 15c1578589..9d13f7acb9 100644 --- a/integration/context_test.go +++ b/integration/context_test.go @@ -63,11 +63,11 @@ func TestContextExhaustion(t *testing.T) { if err := PullIfMissing(ctx, client, req.Model); err != nil { t.Fatalf("PullIfMissing failed: %v", err) } - DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water"}, 120*time.Second, 10*time.Second) + DoGenerate(ctx, t, client, req, []string{"once", "upon", "lived", "sunny", "cloudy", "clear", "water", "time", "travel", "world"}, 120*time.Second, 10*time.Second) } // Send multiple generate requests with prior context and ensure the response is coherant and expected -func TestGenerateWithHistory(t *testing.T) { +func TestParallelGenerateWithHistory(t *testing.T) { modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model req, resp := GenerateRequests() numParallel := 2 @@ -113,8 +113,48 @@ func TestGenerateWithHistory(t *testing.T) { wg.Wait() } +// Send generate requests with prior context and ensure the response is coherant and expected +func TestGenerateWithHistory(t *testing.T) { + req := api.GenerateRequest{ + Model: smol, + Prompt: rainbowPrompt, + Stream: &stream, + KeepAlive: &api.Duration{Duration: 10 * time.Second}, + Options: map[string]any{ + "num_ctx": 16384, + }, + } + + softTimeout, hardTimeout := getTimeouts(t) + ctx, cancel := context.WithTimeout(context.Background(), hardTimeout) + defer cancel() + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + // Get the server running (if applicable) warm the model up with a single initial request + slog.Info("loading", "model", req.Model) + err := client.Generate(ctx, + &api.GenerateRequest{Model: req.Model, KeepAlive: 
&api.Duration{Duration: 10 * time.Second}, Options: req.Options}, + func(response api.GenerateResponse) error { return nil }, + ) + if err != nil { + t.Fatalf("failed to load model %s: %s", req.Model, err) + } + + req.Context = DoGenerate(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second) + + for i := 0; i < len(rainbowFollowups); i++ { + req.Prompt = rainbowFollowups[i] + if time.Now().Sub(started) > softTimeout { + slog.Info("exceeded soft timeout, winding down test") + return + } + req.Context = DoGenerate(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second) + } +} + // Send multiple chat requests with prior context and ensure the response is coherant and expected -func TestChatWithHistory(t *testing.T) { +func TestParallelChatWithHistory(t *testing.T) { modelOverride := ollamaEngineChatModels[0] // Most recent ollama engine model req, resp := ChatRequests() numParallel := 2 @@ -164,3 +204,55 @@ func TestChatWithHistory(t *testing.T) { } wg.Wait() } + +// Send generate requests with prior context and ensure the response is coherant and expected +func TestChatWithHistory(t *testing.T) { + req := api.ChatRequest{ + Model: smol, + Stream: &stream, + KeepAlive: &api.Duration{Duration: 10 * time.Second}, + Options: map[string]any{ + "num_ctx": 16384, + }, + Messages: []api.Message{ + { + Role: "user", + Content: rainbowPrompt, + }, + }, + } + + softTimeout, hardTimeout := getTimeouts(t) + ctx, cancel := context.WithTimeout(context.Background(), hardTimeout) + defer cancel() + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + // Get the server running (if applicable) warm the model up with a single initial request + slog.Info("loading", "model", req.Model) + err := client.Generate(ctx, + &api.GenerateRequest{Model: req.Model, KeepAlive: &api.Duration{Duration: 10 * time.Second}, Options: req.Options}, + func(response api.GenerateResponse) error { return nil }, + ) + if err != nil { + t.Fatalf("failed to load model %s: %s", req.Model, err) + } + + assistant := DoChat(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second) + + for i := 0; i < len(rainbowFollowups); i++ { + if time.Now().Sub(started) > softTimeout { + slog.Info("exceeded soft timeout, winding down test") + return + } + req.Messages = append(req.Messages, + *assistant, + api.Message{Role: "user", Content: rainbowFollowups[i]}, + ) + + assistant = DoChat(ctx, t, client, req, rainbowExpected, 30*time.Second, 20*time.Second) + if assistant == nil { + t.Fatalf("didn't get an assistant response for context") + } + } +} diff --git a/integration/library_models_test.go b/integration/library_models_test.go index cdf65efc85..49e1097b86 100644 --- a/integration/library_models_test.go +++ b/integration/library_models_test.go @@ -4,7 +4,9 @@ package integration import ( "context" + "fmt" "log/slog" + "os" "testing" "time" @@ -20,6 +22,7 @@ func TestLibraryModelsGenerate(t *testing.T) { defer cancel() client, _, cleanup := InitServerConnection(ctx, t) defer cleanup() + targetArch := os.Getenv("OLLAMA_TEST_ARCHITECTURE") chatModels := libraryChatModels for _, model := range chatModels { @@ -30,16 +33,26 @@ func TestLibraryModelsGenerate(t *testing.T) { if err := PullIfMissing(ctx, client, model); err != nil { t.Fatalf("pull failed %s", err) } + if targetArch != "" { + resp, err := client.Show(ctx, &api.ShowRequest{Name: model}) + if err != nil { + t.Fatalf("unable to show model: %s", err) + } + arch := resp.ModelInfo["general.architecture"].(string) + if arch != targetArch 
+	targetArch := os.Getenv("OLLAMA_TEST_ARCHITECTURE")
 
 	chatModels := libraryChatModels
 	for _, model := range chatModels {
@@ -30,16 +34,26 @@
 			if err := PullIfMissing(ctx, client, model); err != nil {
 				t.Fatalf("pull failed %s", err)
 			}
+			if targetArch != "" {
+				resp, err := client.Show(ctx, &api.ShowRequest{Name: model})
+				if err != nil {
+					t.Fatalf("unable to show model: %s", err)
+				}
+				arch := resp.ModelInfo["general.architecture"].(string)
+				if arch != targetArch {
+					t.Skipf("skipping %s architecture %s != %s", model, arch, targetArch)
+				}
+			}
 			req := api.GenerateRequest{
 				Model:     model,
-				Prompt:    "why is the sky blue?",
+				Prompt:    blueSkyPrompt,
 				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 				Options: map[string]interface{}{
 					"temperature": 0.1,
 					"seed":        123,
 				},
 			}
-			anyResp := []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength"}
+			anyResp := blueSkyExpected
 			// Special cases
 			if model == "duckdb-nsql" {
 				anyResp = []string{"select", "from"}
diff --git a/integration/model_arch_test.go b/integration/model_arch_test.go
index 9fc2e01dd0..721d95c540 100644
--- a/integration/model_arch_test.go
+++ b/integration/model_arch_test.go
@@ -68,14 +68,13 @@ func TestModelsGenerate(t *testing.T) {
 			// TODO - fiddle with context size
 			req := api.GenerateRequest{
 				Model:  model,
-				Prompt: "why is the sky blue?",
+				Prompt: blueSkyPrompt,
 				Options: map[string]interface{}{
 					"temperature": 0,
 					"seed":        123,
 				},
 			}
-			anyResp := []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}
-			DoGenerate(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
+			DoGenerate(ctx, t, client, req, blueSkyExpected, 120*time.Second, 30*time.Second)
 		})
 	}
 }
diff --git a/integration/model_perf_test.go b/integration/model_perf_test.go
index 759e8b9a26..3d6ba92396 100644
--- a/integration/model_perf_test.go
+++ b/integration/model_perf_test.go
@@ -40,6 +40,18 @@ var (
 // cat int.log | grep MODEL_PERF_HEADER | head -1| cut -f2- -d: > perf.csv
 // cat int.log | grep MODEL_PERF_DATA | cut -f2- -d: >> perf.csv
 func TestModelsPerf(t *testing.T) {
+	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
+		doModelPerfTest(t, ollamaEngineChatModels)
+	} else {
+		doModelPerfTest(t, append(ollamaEngineChatModels, llamaRunnerChatModels...))
+	}
+}
+
+func TestLibraryModelsPerf(t *testing.T) {
+	doModelPerfTest(t, libraryChatModels)
+}
+
+func doModelPerfTest(t *testing.T, chatModels []string) {
 	softTimeout, hardTimeout := getTimeouts(t)
 	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
 	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
@@ -65,14 +77,14 @@ func TestModelsPerf(t *testing.T) {
 	}
 	longPrompt := "summarize the following: " + string(data)
 
-	var chatModels []string
-	if s := os.Getenv("OLLAMA_NEW_ENGINE"); s != "" {
-		chatModels = ollamaEngineChatModels
-	} else {
-		chatModels = append(ollamaEngineChatModels, llamaRunnerChatModels...)
-	}
+	targetArch := os.Getenv("OLLAMA_TEST_ARCHITECTURE")
 	for _, model := range chatModels {
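+		// Give bare model names an explicit :latest tag, presumably so they
+		// match the fully qualified names the server reports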
+		if !strings.Contains(model, ":") {
+			model = model + ":latest"
+		}
 		t.Run(model, func(t *testing.T) {
 			if time.Now().Sub(started) > softTimeout {
 				t.Skip("skipping remaining tests to avoid excessive runtime")
 			}
@@ -88,6 +100,9 @@
 			}
 			arch := resp.ModelInfo["general.architecture"].(string)
 			maxContext = int(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))
+			if targetArch != "" && arch != targetArch {
+				t.Skipf("skipping %s architecture %s != %s", model, arch, targetArch)
+			}
 
 			if maxVram > 0 {
 				resp, err := client.List(ctx)
@@ -151,8 +166,8 @@
 				prompt  string
 				anyResp []string
 			}{
-				{"why is the sky blue?", []string{"rayleigh", "scattering", "atmosphere", "nitrogen", "oxygen"}},
-				{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy"}},
+				{blueSkyPrompt, blueSkyExpected},
+				{maxPrompt, []string{"shakespeare", "oppression", "sorrows", "gutenberg", "child", "license", "sonnet", "melancholy", "love", "sorrow", "beauty"}},
 			}
 			var gpuPercent int
 			for _, tc := range testCases {
@@ -241,11 +256,12 @@
 					}
 				}
 			}
+			// Round the logged prompt count so results can be compared across versions and configurations, which vary slightly
 			fmt.Fprintf(os.Stderr, "MODEL_PERF_HEADER:%s,%s,%s,%s,%s,%s,%s\n",
 				"MODEL",
 				"CONTEXT",
 				"GPU PERCENT",
-				"PROMPT COUNT",
+				"APPROX PROMPT COUNT",
 				"LOAD TIME",
 				"PROMPT EVAL TPS",
 				"EVAL TPS",
@@ -254,7 +270,7 @@
 				model,
 				numCtx,
 				gpuPercent,
-				resp.PromptEvalCount,
+				(resp.PromptEvalCount/10)*10,
 				float64(resp.LoadDuration)/1000000000.0,
 				float64(resp.PromptEvalCount)/(float64(resp.PromptEvalDuration)/1000000000.0),
 				float64(resp.EvalCount)/(float64(resp.EvalDuration)/1000000000.0),
diff --git a/integration/quantization_test.go b/integration/quantization_test.go
index af9da0b625..3056474963 100644
--- a/integration/quantization_test.go
+++ b/integration/quantization_test.go
@@ -76,7 +76,7 @@ func TestQuantization(t *testing.T) {
 			stream := true
 			genReq := api.GenerateRequest{
 				Model:     newName,
-				Prompt:    "why is the sky blue?",
+				Prompt:    blueSkyPrompt,
 				KeepAlive: &api.Duration{Duration: 3 * time.Second},
 				Options: map[string]any{
 					"seed":        42,
@@ -88,14 +88,13 @@
 
 			// Some smaller quantizations can cause models to have poor quality
 			// or get stuck in repetition loops, so we stop as soon as we have any matches
-			anyResp := []string{"rayleigh", "scattering", "day", "sun", "moon", "color", "nitrogen", "oxygen"}
 			reqCtx, reqCancel := context.WithCancel(ctx)
 			atLeastOne := false
 			var buf bytes.Buffer
 			genfn := func(response api.GenerateResponse) error {
 				buf.Write([]byte(response.Response))
 				fullResp := strings.ToLower(buf.String())
-				for _, resp := range anyResp {
+				for _, resp := range blueSkyExpected {
 					if strings.Contains(fullResp, resp) {
 						atLeastOne = true
 						t.Log(fullResp)
diff --git a/integration/utils_test.go b/integration/utils_test.go
index 7901fed3f4..f8ec13f394 100644
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -256,13 +256,32 @@ var (
 		"snowflake-arctic-embed",
 		"snowflake-arctic-embed2",
 	}
+
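+	// Shared prompts and expected-keyword sets used across the generate and chat
+	// tests; matching is case-insensitive substring search, so stems such as
+	// "scatter" also cover "scattering", and any single hit counts as success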
+	blueSkyPrompt   = "why is the sky blue? Be brief but factual in your reply"
+	blueSkyExpected = []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength", "interact"}
+
+	rainbowPrompt    = "how do rainbows form? Be brief but factual in your reply"
+	rainbowFollowups = []string{
+		"Explain the physics involved in them. Be brief in your reply",
+		"Explain the chemistry involved in them. Be brief in your reply",
+		"Explain the quantum mechanics involved in them. Be brief in your reply",
+		"What are common myths related to them? Be brief in your reply",
+		"What are common fairytales related to them? Be brief in your reply",
+		"Can they form if there is no rain? Be brief in your reply",
+		"Can they form if there are no clouds? Be brief in your reply",
+		"Do they happen on other planets? Be brief in your reply",
+	}
+	rainbowExpected = []string{"water", "droplet", "mist", "glow", "refracted", "reflect", "color", "spectrum", "frequency", "end", "gold", "fortune", "blessing", "prosperity"}
 )
 
 func init() {
 	lifecycle.InitLogging()
-	custom := os.Getenv("OLLAMA_TEST_SMOL_MODEL")
+	custom := os.Getenv("OLLAMA_TEST_DEFAULT_MODEL")
 	if custom != "" {
-		slog.Info("setting smol test model to " + custom)
+		slog.Info("setting default test model to " + custom)
 		smol = custom
 	}
 }
@@ -577,11 +596,11 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		},
 	},
 		[][]string{
-			{"sunlight", "scattering", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorbs", "wavelength"},
-			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigments", "particles", "iron oxide", "rust", "air", "water", "mixture", "mixing"},
-			{"water", "droplet", "refracted", "reflect", "color", "spectrum"},
+			{"sunlight", "scatter", "interact", "color", "surface", "depth", "red", "orange", "yellow", "absorb", "wavelength", "water", "molecule"},
+			{"soil", "organic", "earth", "black", "tan", "chemical", "processes", "pigment", "particle", "iron oxide", "rust", "air", "water", "wet", "mixture", "mixing", "mineral", "element", "decomposed", "matter", "wavelength"},
+			{"water", "droplet", "refract", "reflect", "color", "spectrum", "raindrop"},
 			{"fourth", "july", "declaration", "independence"},
-			{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor"},
+			{"nitrogen", "oxygen", "carbon", "dioxide", "water", "vapor", "fluid", "particles", "gas"},
 		}
 }