llm: Separate llamaServer and ollamaServer code paths

Originally, llamaServer represented old memory estimates, which could be used with either the old or new engine. ollamaServer was used only for the new estimates and new engine. Since these implementations did not map directly to engine, there was engine- specific code in common code paths. Now that new estimates are always used for the new engine, there is a direct mapping between server type and engine. This separates out most of the engine-specific code into the correct implementation to make things easier to understand.
2025-12-08 14:31:54 +01:00 · 2025-11-06 10:31:08 -08:00
parent f560bd077f
commit b13fbad0fe
1 changed files with 50 additions and 64 deletions
--- a/llm/server.go
+++ b/llm/server.go
@@ -89,7 +89,6 @@ type llmServer struct {
 	done      chan error // Channel to signal when the process exits
 	status    *StatusWriter
 	options   api.Options
-	numParallel int
 	modelPath string

 	loadRequest LoadRequest       // Parameters used to initialize the runner
@@ -100,10 +99,6 @@ type llmServer struct {
 	llamaModel     *llama.Model
 	llamaModelLock *sync.Mutex

-	// textProcessor handles text encoding/decoding for the model in the Ollama engine
-	// nil if this server is running the llama.cpp based engine
-	textProcessor model.TextProcessor
-
 	totalLayers  uint64
 	loadStart    time.Time // Record how long it took the model to load
 	loadProgress float32
@@ -119,6 +114,8 @@ type llamaServer struct {

 type ollamaServer struct {
 	llmServer
+
+	textProcessor model.TextProcessor // textProcessor handles text encoding/decoding
 }

 // LoadModel will load a model from disk. The model must be in the GGML format.
@@ -242,8 +239,6 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 		loadRequest:    loadRequest,
 		llamaModel:     llamaModel,
 		llamaModelLock: &sync.Mutex{},
-		textProcessor:  textProcessor,
-		numParallel:    numParallel,
 		sem:            semaphore.NewWeighted(int64(numParallel)),
 		totalLayers:    f.KV().BlockCount() + 1,
 		loadStart:      time.Now(),
@@ -278,7 +273,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 	}()

 	if textProcessor != nil {
-		return &ollamaServer{llmServer: s}, nil
+		return &ollamaServer{llmServer: s, textProcessor: textProcessor}, nil
 	} else {
 		return &llamaServer{llmServer: s, ggml: f}, nil
 	}
@@ -1681,69 +1676,60 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
 	return e.Embedding, nil
 }

-type TokenizeRequest struct {
-	Content string `json:"content"`
-}
-
-type TokenizeResponse struct {
-	Tokens []int `json:"tokens"`
-}
-
-func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
+func (s *llamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
 	s.llamaModelLock.Lock()
 	defer s.llamaModelLock.Unlock()

-	if s.llamaModel != nil {
+	if s.llamaModel == nil {
+		return nil, fmt.Errorf("no tokenizer configured")
+	}
+
 	return s.llamaModel.Tokenize(content, false, true)
 }
-	if s.textProcessor != nil {
+
+func (s *ollamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
 	tokens, err := s.textProcessor.Encode(content, false)
 	if err != nil {
 		return nil, err
 	}
+
 	toks := make([]int, len(tokens))
 	for i, t := range tokens {
 		toks[i] = int(t)
 	}
+
 	return toks, nil
 }
-	// not reached
-	return nil, fmt.Errorf("no tokenizer configured")
-}

-type DetokenizeRequest struct {
-	Tokens []int `json:"tokens"`
-}
-
-type DetokenizeResponse struct {
-	Content string `json:"content"`
-}
-
-func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
+func (s *llamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
 	s.llamaModelLock.Lock()
 	defer s.llamaModelLock.Unlock()

-	if s.llamaModel != nil {
+	if s.llamaModel == nil {
+		return "", fmt.Errorf("no tokenizer configured")
+	}
+
 	var resp string
 	for _, token := range tokens {
 		resp += s.llamaModel.TokenToPiece(token)
 	}
+
 	return resp, nil
 }
-	if s.textProcessor != nil {
+
+func (s *ollamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
 	toks := make([]int32, len(tokens))
 	for i, t := range tokens {
 		toks[i] = int32(t)
 	}
+
 	content, err := s.textProcessor.Decode(toks)
 	if err != nil {
 		return "", err
 	}
+
 	return content, nil
 }
-	// not reached
-	return "", fmt.Errorf("no tokenizer configured")
-}

 func (s *llmServer) Close() error {
 	s.llamaModelLock.Lock()