mirror of https://github.com/ollama/ollama.git
Switch back to subprocessing for llama.cpp
This should resolve a number of memory leak and stability defects by isolating llama.cpp in a separate process, shutting it down when idle, and gracefully restarting it if it runs into problems. It also serves as a first step toward running multiple copies concurrently to support multiple models.
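To make the lifecycle described above concrete, here is a minimal, hypothetical Go sketch of supervising an inference backend as a child process: start it on demand, restart it lazily if it exits unexpectedly, and stop it after an idle period. None of the names below (runner, ensureStarted, reapIdle, the idle timeout, the placeholder binary) come from the ollama code base; they are assumptions for illustration only.

// Hypothetical sketch only; not code from this commit.
package main

import (
	"log"
	"os/exec"
	"sync"
	"time"
)

// runner supervises a single child process (standing in for the llama.cpp
// server) so the parent can survive crashes and reclaim memory when idle.
type runner struct {
	mu       sync.Mutex
	cmd      *exec.Cmd
	lastUsed time.Time
	binary   string
	args     []string
}

// ensureStarted launches the child if it is not running and records use.
func (r *runner) ensureStarted() error {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.lastUsed = time.Now()
	if r.cmd != nil {
		return nil // already running
	}
	cmd := exec.Command(r.binary, r.args...)
	if err := cmd.Start(); err != nil {
		return err
	}
	r.cmd = cmd
	// Reap the child when it exits; clearing the handle means the next
	// request simply starts a fresh process (the "graceful restart").
	go func() {
		err := cmd.Wait()
		r.mu.Lock()
		defer r.mu.Unlock()
		if r.cmd == cmd {
			log.Printf("backend exited unexpectedly: %v", err)
			r.cmd = nil
		}
	}()
	return nil
}

// reapIdle stops the child if it has been unused for longer than idle.
func (r *runner) reapIdle(idle time.Duration) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.cmd == nil || time.Since(r.lastUsed) < idle {
		return
	}
	log.Println("backend idle, shutting it down")
	_ = r.cmd.Process.Kill()
	r.cmd = nil // the Wait goroutine sees the handle is gone and stays quiet
}

func main() {
	// "sleep 600" stands in for the real backend binary.
	r := &runner{binary: "sleep", args: []string{"600"}, lastUsed: time.Now()}
	if err := r.ensureStarted(); err != nil {
		log.Fatal(err)
	}
	time.Sleep(2 * time.Second)
	r.reapIdle(time.Second) // unused for 2s > 1s, so the child is stopped
}

The payoff of this layout is that a leak or crash inside the child cannot take down the parent server: the parent merely observes the exit and spawns a fresh process on the next request, and stopping the child when idle returns its memory to the operating system.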
@@ -17,7 +17,6 @@ import (
 	"github.com/stretchr/testify/assert"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
 )
@@ -211,7 +210,7 @@ func Test_Routes(t *testing.T) {
 		},
 	}
 
-	s := Server{}
+	s := &Server{}
 	router := s.GenerateRoutes()
 
 	httpSrv := httptest.NewServer(router)
@@ -242,27 +241,3 @@ func Test_Routes(t *testing.T) {
 
 	}
 }
-
-type MockLLM struct {
-	encoding []int
-}
-
-func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
-	return nil
-}
-
-func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
-	return llm.encoding, nil
-}
-
-func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
-	return "", nil
-}
-
-func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
-	return []float64{}, nil
-}
-
-func (llm *MockLLM) Close() {
-	// do nothing
-}