do not reload the running llm when runtime params change (#840)

- only reload the running llm if the model has changed, or the options for loading the running model have changed - rename loaded llm to runner to differentiate from loaded model image - remove logic which keeps the first system prompt in the generation context
2025-07-12 22:32:52 +02:00 · 2023-10-19 10:39:58 -04:00
parent 235e43d7f6
commit fe6f3b48f7
3 changed files with 66 additions and 86 deletions
--- a/api/types.go
+++ b/api/types.go
@ -161,15 +161,10 @@ func (r *GenerateResponse) Summary() {
 	}
 }

-type Options struct {
-	Seed int `json:"seed,omitempty"`
-
-	// Backend options
-	UseNUMA bool `json:"numa,omitempty"`
-
-	// Model options
+// Runner options which must be set when the model is loaded into memory
+type Runner struct {
+	UseNUMA            bool    `json:"numa,omitempty"`
 	NumCtx             int     `json:"num_ctx,omitempty"`
-	NumKeep            int     `json:"num_keep,omitempty"`
 	NumBatch           int     `json:"num_batch,omitempty"`
 	NumGQA             int     `json:"num_gqa,omitempty"`
 	NumGPU             int     `json:"num_gpu,omitempty"`
@ -183,8 +178,15 @@ type Options struct {
 	EmbeddingOnly      bool    `json:"embedding_only,omitempty"`
 	RopeFrequencyBase  float32 `json:"rope_frequency_base,omitempty"`
 	RopeFrequencyScale float32 `json:"rope_frequency_scale,omitempty"`
+	NumThread          int     `json:"num_thread,omitempty"`
+}

-	// Predict options
+type Options struct {
+	Runner
+
+	// Predict options used at runtime
+	NumKeep          int      `json:"num_keep,omitempty"`
+	Seed             int      `json:"seed,omitempty"`
 	NumPredict       int      `json:"num_predict,omitempty"`
 	TopK             int      `json:"top_k,omitempty"`
 	TopP             float32  `json:"top_p,omitempty"`
@ -200,8 +202,6 @@ type Options struct {
 	MirostatEta      float32  `json:"mirostat_eta,omitempty"`
 	PenalizeNewline  bool     `json:"penalize_newline,omitempty"`
 	Stop             []string `json:"stop,omitempty"`
-
-	NumThread int `json:"num_thread,omitempty"`
 }

 var ErrInvalidOpts = fmt.Errorf("invalid options")
@ -309,20 +309,22 @@ func DefaultOptions() Options {
 		PenalizeNewline:  true,
 		Seed:             -1,

-		// options set when the model is loaded
-		NumCtx:             2048,
-		RopeFrequencyBase:  10000.0,
-		RopeFrequencyScale: 1.0,
-		NumBatch:           512,
-		NumGPU:             -1, // -1 here indicates that NumGPU should be set dynamically
-		NumGQA:             1,
-		NumThread:          0, // let the runtime decide
-		LowVRAM:            false,
-		F16KV:              true,
-		UseMLock:           false,
-		UseMMap:            true,
-		UseNUMA:            false,
-		EmbeddingOnly:      true,
+		Runner: Runner{
+			// options set when the model is loaded
+			NumCtx:             2048,
+			RopeFrequencyBase:  10000.0,
+			RopeFrequencyScale: 1.0,
+			NumBatch:           512,
+			NumGPU:             -1, // -1 here indicates that NumGPU should be set dynamically
+			NumGQA:             1,
+			NumThread:          0, // let the runtime decide
+			LowVRAM:            false,
+			F16KV:              true,
+			UseMLock:           false,
+			UseMMap:            true,
+			UseNUMA:            false,
+			EmbeddingOnly:      true,
+		},
 	}
 }