From f8a6e8881975b2964aa2179e74c4426b4a455d0f Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Fri, 11 Jul 2025 12:21:54 -0700
Subject: [PATCH] Only load supported models on new engine (#11362)

* Only load supported models on new engine

Verify the model is supported before trying to load

* integration: testcase for all library models
---
 integration/library_models_test.go |  57 +++++++++
 integration/utils_test.go          | 185 +++++++++++++++++++++++++++++
 model/models/llama/model.go        |   9 ++
 model/models/qwen2/model.go        |  10 ++
 4 files changed, 261 insertions(+)
 create mode 100644 integration/library_models_test.go

diff --git a/integration/library_models_test.go b/integration/library_models_test.go
new file mode 100644
index 0000000000..cdf65efc85
--- /dev/null
+++ b/integration/library_models_test.go
@@ -0,0 +1,57 @@
+//go:build integration && library
+
+package integration
+
+import (
+	"context"
+	"log/slog"
+	"testing"
+	"time"
+
+	"github.com/ollama/ollama/api"
+)
+
+// The first run of this scenario on a target system will take a long time to download
+// ~1.5TB of models. Set a sufficiently large -timeout for your network speed.
+func TestLibraryModelsGenerate(t *testing.T) {
+	softTimeout, hardTimeout := getTimeouts(t)
+	slog.Info("Setting timeouts", "soft", softTimeout, "hard", hardTimeout)
+	ctx, cancel := context.WithTimeout(context.Background(), hardTimeout)
+	defer cancel()
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+
+	chatModels := libraryChatModels
+	for _, model := range chatModels {
+		t.Run(model, func(t *testing.T) {
+			if time.Since(started) > softTimeout {
+				t.Skip("skipping remaining tests to avoid excessive runtime")
+			}
+			if err := PullIfMissing(ctx, client, model); err != nil {
+				t.Fatalf("pull failed: %s", err)
+			}
+			req := api.GenerateRequest{
+				Model:     model,
+				Prompt:    "why is the sky blue?",
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
+				Options: map[string]interface{}{
+					"temperature": 0.1,
+					"seed":        123,
+				},
+			}
+			anyResp := []string{"rayleigh", "scatter", "atmosphere", "nitrogen", "oxygen", "wavelength"}
+			// Special cases
+			if model == "duckdb-nsql" {
+				anyResp = []string{"select", "from"}
+			} else if model == "granite3-guardian" || model == "shieldgemma" || model == "llama-guard3" || model == "bespoke-minicheck" {
+				anyResp = []string{"yes", "no", "safe", "unsafe"}
+			} else if model == "openthinker" || model == "nexusraven" {
+				anyResp = []string{"plugin", "im_sep", "components", "function call"}
+			} else if model == "starcoder" || model == "starcoder2" || model == "magicoder" || model == "deepseek-coder" {
+				req.Prompt = "def fibonacci():"
+				anyResp = []string{"f(n)", "sequence", "n-1", "main()", "__main__", "while"}
+			}
+			DoGenerate(ctx, t, client, req, anyResp, 120*time.Second, 30*time.Second)
+		})
+	}
+}
diff --git a/integration/utils_test.go b/integration/utils_test.go
index c76af59ccc..3d726123b8 100644
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -72,6 +72,187 @@ var (
 		"stablelm2:latest", // Predictions are off, crashes on small VRAM GPUs
 		"falcon:latest",
 	}
+
+	// Some library models are quite large; ensure large VRAM and sufficient disk space
+	// before running scenarios based on this set.
+	libraryChatModels = []string{
+		"alfred",
+		"athene-v2",
+		"aya-expanse",
+		"aya",
+		"bakllava",
+		"bespoke-minicheck",
+		"codebooga",
+		"codegeex4",
+		"codegemma",
+		"codellama",
+		"codeqwen",
+		"codestral",
+		"codeup",
+		"cogito",
+		"command-a",
+		"command-r-plus",
+		"command-r",
+		"command-r7b-arabic",
"command-r7b", + "dbrx", + "deepcoder", + "deepscaler", + "deepseek-coder-v2", + "deepseek-coder", + "deepseek-llm", + "deepseek-r1", + // "deepseek-v2.5", // requires 155 GB VRAM + "deepseek-v2", + // "deepseek-v3", // requires 482 GB VRAM + "devstral", + "dolphin-llama3", + "dolphin-mistral", + "dolphin-mixtral", + "dolphin-phi", + "dolphin3", + "dolphincoder", + "duckdb-nsql", + "everythinglm", + "exaone-deep", + "exaone3.5", + "falcon", + "falcon2", + "falcon3", + "firefunction-v2", + "gemma", + "gemma2", + "gemma3", + "gemma3n", + "glm4", + "goliath", + "granite-code", + "granite3-dense", + "granite3-guardian", + "granite3-moe", + "granite3.1-dense", + "granite3.1-moe", + "granite3.2-vision", + "granite3.2", + "granite3.3", + "hermes3", + "internlm2", + "llama-guard3", + "llama-pro", + "llama2-chinese", + "llama2-uncensored", + "llama2", + "llama3-chatqa", + "llama3-gradient", + "llama3-groq-tool-use", + "llama3.1", + "llama3.2-vision", + "llama3.2", + "llama3.3", + "llama3", + "llama4", + "llava-llama3", + "llava-phi3", + "llava", + "magicoder", + "magistral", + "marco-o1", + "mathstral", + "meditron", + "medllama2", + "megadolphin", + "minicpm-v", + "mistral-large", + "mistral-nemo", + "mistral-openorca", + "mistral-small", + "mistral-small3.1", + "mistral-small3.2", + "mistral", + "mistrallite", + "mixtral", + "moondream", + "nemotron-mini", + "nemotron", + "neural-chat", + "nexusraven", + "notus", + "nous-hermes", + "nous-hermes2-mixtral", + "nous-hermes2", + "nuextract", + "olmo2", + "open-orca-platypus2", + "openchat", + "opencoder", + "openhermes", + "openthinker", + "orca-mini", + "orca2", + // "phi", // unreliable + "phi3.5", + "phi3", + "phi4-mini-reasoning", + "phi4-mini", + "phi4-reasoning", + "phi4", + "phind-codellama", + "qwen", + "qwen2-math", + "qwen2.5-coder", + "qwen2.5", + "qwen2.5vl", + "qwen2", + "qwen3:0.6b", // dense + "qwen3:30b", // MOE + "qwq", + "r1-1776", + "reader-lm", + "reflection", + "sailor2", + "samantha-mistral", + "shieldgemma", + "smallthinker", + "smollm", + "smollm2", + "solar-pro", + "solar", + "sqlcoder", + "stable-beluga", + "stable-code", + "stablelm-zephyr", + "stablelm2", + "starcoder", + "starcoder2", + "starling-lm", + "tinydolphin", + "tinyllama", + "tulu3", + "vicuna", + "wizard-math", + "wizard-vicuna-uncensored", + "wizard-vicuna", + "wizardcoder", + "wizardlm-uncensored", + "wizardlm2", + "xwinlm", + "yarn-llama2", + "yarn-mistral", + "yi-coder", + "yi", + "zephyr", + } + libraryEmbedModels = []string{ + "all-minilm", + "bge-large", + "bge-m3", + "granite-embedding", + "mxbai-embed-large", + "nomic-embed-text", + "paraphrase-multilingual", + "snowflake-arctic-embed", + "snowflake-arctic-embed2", + } ) func Init() { @@ -313,6 +494,10 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap t.Errorf("generate stalled. 
Response so far:%s", buf.String()) } case <-done: + if genErr != nil && strings.Contains(genErr.Error(), "model requires more system memory") { + slog.Warn("model is too large for the target test system", "model", genReq.Model, "error", genErr) + return + } require.NoError(t, genErr, "failed with %s request prompt %s ", genReq.Model, genReq.Prompt) // Verify the response contains the expected data response := buf.String() diff --git a/model/models/llama/model.go b/model/models/llama/model.go index 3cf782d00f..77d8f36d3c 100644 --- a/model/models/llama/model.go +++ b/model/models/llama/model.go @@ -2,6 +2,7 @@ package llama import ( "cmp" + "fmt" "math" "github.com/ollama/ollama/fs" @@ -33,6 +34,14 @@ type Model struct { } func New(c fs.Config) (model.Model, error) { + // This model currently only supports the gpt2 tokenizer + if c.String("tokenizer.ggml.model") == "llama" { + return nil, fmt.Errorf("unsupported tokenizer: llama") + } + // Best effort detection of library/deepseek-coder model(s) which are incompatible + if c.String("general.name") == "deepseek-ai" { + return nil, fmt.Errorf("unsupported model: %s", c.String("general.name")) + } m := Model{ BytePairEncoding: model.NewBytePairEncoding( c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`), diff --git a/model/models/qwen2/model.go b/model/models/qwen2/model.go index 42338d0d69..3c662f0682 100644 --- a/model/models/qwen2/model.go +++ b/model/models/qwen2/model.go @@ -2,7 +2,9 @@ package qwen2 import ( "cmp" + "fmt" "math" + "strings" "github.com/ollama/ollama/fs" "github.com/ollama/ollama/kvcache" @@ -126,6 +128,14 @@ func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor } func New(c fs.Config) (model.Model, error) { + // This model currently only supports the gpt2 tokenizer + if c.String("tokenizer.ggml.model") == "llama" { + return nil, fmt.Errorf("unsupported tokenizer: llama") + } + // detect library/qwen model(s) which are incompatible + if strings.HasPrefix(c.String("general.name"), "Qwen2-beta") { + return nil, fmt.Errorf("unsupported model: %s", c.String("general.name")) + } m := Model{ Layers: make([]DecoderLayer, c.Uint("block_count")), BytePairEncoding: model.NewBytePairEncoding(