From b13fbad0fe663e1493e8ed4b04ed553f861abba9 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Thu, 6 Nov 2025 10:31:08 -0800
Subject: [PATCH] llm: Separate llamaServer and ollamaServer code paths

Originally, llamaServer represented old memory estimates, which could be
used with either the old or new engine. ollamaServer was used only for
the new estimates and new engine. Since these implementations did not
map directly to engine, there was engine-specific code in common code
paths.

Now that new estimates are always used for the new engine, there is a
direct mapping between server type and engine. This separates out most
of the engine-specific code into the correct implementation to make
things easier to understand.
---
 llm/server.go | 114 ++++++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 64 deletions(-)

diff --git a/llm/server.go b/llm/server.go
index 83690fcdc1..c4b84950e2 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -84,13 +84,12 @@ type LlamaServer interface {
 
 // llmServer is an instance of a runner hosting a single model
 type llmServer struct {
-	port        int
-	cmd         *exec.Cmd
-	done        chan error // Channel to signal when the process exits
-	status      *StatusWriter
-	options     api.Options
-	numParallel int
-	modelPath   string
+	port      int
+	cmd       *exec.Cmd
+	done      chan error // Channel to signal when the process exits
+	status    *StatusWriter
+	options   api.Options
+	modelPath string
 
 	loadRequest LoadRequest       // Parameters used to initialize the runner
 	mem         *ml.BackendMemory // Memory allocations for this model
@@ -100,10 +99,6 @@ type llmServer struct {
 	llamaModel     *llama.Model
 	llamaModelLock *sync.Mutex
 
-	// textProcessor handles text encoding/decoding for the model in the Ollama engine
-	// nil if this server is running the llama.cpp based engine
-	textProcessor model.TextProcessor
-
 	totalLayers  uint64
 	loadStart    time.Time // Record how long it took the model to load
 	loadProgress float32
@@ -119,6 +114,8 @@ type llamaServer struct {
 
 type ollamaServer struct {
 	llmServer
+
+	textProcessor model.TextProcessor // textProcessor handles text encoding/decoding
 }
 
 // LoadModel will load a model from disk. The model must be in the GGML format.
@@ -242,8 +239,6 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 		loadRequest:    loadRequest,
 		llamaModel:     llamaModel,
 		llamaModelLock: &sync.Mutex{},
-		textProcessor:  textProcessor,
-		numParallel:    numParallel,
 		sem:            semaphore.NewWeighted(int64(numParallel)),
 		totalLayers:    f.KV().BlockCount() + 1,
 		loadStart:      time.Now(),
@@ -278,7 +273,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 	}()
 
 	if textProcessor != nil {
-		return &ollamaServer{llmServer: s}, nil
+		return &ollamaServer{llmServer: s, textProcessor: textProcessor}, nil
 	} else {
 		return &llamaServer{llmServer: s, ggml: f}, nil
 	}
@@ -1681,68 +1676,59 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
 	return e.Embedding, nil
 }
 
-type TokenizeRequest struct {
-	Content string `json:"content"`
-}
-
-type TokenizeResponse struct {
-	Tokens []int `json:"tokens"`
-}
-
-func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
+func (s *llamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
 	s.llamaModelLock.Lock()
 	defer s.llamaModelLock.Unlock()
 
-	if s.llamaModel != nil {
-		return s.llamaModel.Tokenize(content, false, true)
+	if s.llamaModel == nil {
+		return nil, fmt.Errorf("no tokenizer configured")
 	}
-	if s.textProcessor != nil {
-		tokens, err := s.textProcessor.Encode(content, false)
-		if err != nil {
-			return nil, err
-		}
-		toks := make([]int, len(tokens))
-		for i, t := range tokens {
-			toks[i] = int(t)
-		}
-		return toks, nil
+
+	return s.llamaModel.Tokenize(content, false, true)
+}
+
+func (s *ollamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
+	tokens, err := s.textProcessor.Encode(content, false)
+	if err != nil {
+		return nil, err
 	}
-	// not reached
-	return nil, fmt.Errorf("no tokenizer configured")
+
+	toks := make([]int, len(tokens))
+	for i, t := range tokens {
+		toks[i] = int(t)
+	}
+
+	return toks, nil
 }
 
-type DetokenizeRequest struct {
-	Tokens []int `json:"tokens"`
-}
-
-type DetokenizeResponse struct {
-	Content string `json:"content"`
-}
-
-func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
+func (s *llamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
 	s.llamaModelLock.Lock()
 	defer s.llamaModelLock.Unlock()
 
-	if s.llamaModel != nil {
-		var resp string
-		for _, token := range tokens {
-			resp += s.llamaModel.TokenToPiece(token)
-		}
-		return resp, nil
+	if s.llamaModel == nil {
+		return "", fmt.Errorf("no tokenizer configured")
 	}
-	if s.textProcessor != nil {
-		toks := make([]int32, len(tokens))
-		for i, t := range tokens {
-			toks[i] = int32(t)
-		}
-		content, err := s.textProcessor.Decode(toks)
-		if err != nil {
-			return "", err
-		}
-		return content, nil
+
+	var resp string
+	for _, token := range tokens {
+		resp += s.llamaModel.TokenToPiece(token)
 	}
-	// not reached
-	return "", fmt.Errorf("no tokenizer configured")
+
+	return resp, nil
+}
+
+func (s *ollamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
+	toks := make([]int32, len(tokens))
+	for i, t := range tokens {
+		toks[i] = int32(t)
+	}
+
+	content, err := s.textProcessor.Decode(toks)
+	if err != nil {
+		return "", err
+	}
+
+	return content, nil
 }
 
 func (s *llmServer) Close() error {
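
For readers skimming the patch, the pattern it moves to is easier to see outside diff context. Below is a minimal, standalone Go sketch, not code from this repository: baseServer, cppServer, and newEngineServer are hypothetical stand-ins for llmServer, llamaServer, and ollamaServer, and the Tokenize bodies are placeholders rather than the real llama.cpp or text-processor calls. It shows the idea the commit message describes: shared state lives in an embedded base struct, each concrete server type carries its own engine-specific tokenizer, and shared code dispatches through an interface instead of branching on nil fields (if s.llamaModel != nil, if s.textProcessor != nil).

package main

import (
	"context"
	"fmt"
)

// tokenizer mirrors the Tokenize portion of the LlamaServer interface.
type tokenizer interface {
	Tokenize(ctx context.Context, content string) ([]int, error)
}

// baseServer stands in for llmServer: engine-agnostic state only.
type baseServer struct {
	modelPath string
}

// cppServer stands in for llamaServer (llama.cpp engine).
type cppServer struct {
	baseServer
}

func (s *cppServer) Tokenize(ctx context.Context, content string) ([]int, error) {
	// Placeholder for the llama.cpp tokenizer call.
	return []int{1, 2, 3}, nil
}

// newEngineServer stands in for ollamaServer (Ollama engine).
type newEngineServer struct {
	baseServer
}

func (s *newEngineServer) Tokenize(ctx context.Context, content string) ([]int, error) {
	// Placeholder for the text-processor Encode call and int32-to-int copy.
	return []int{4, 5, 6}, nil
}

func main() {
	// Shared code holds the interface; each concrete type supplies its own
	// engine-specific implementation, so no engine checks are needed here.
	servers := []tokenizer{
		&cppServer{baseServer: baseServer{modelPath: "model-a.gguf"}},
		&newEngineServer{baseServer: baseServer{modelPath: "model-b.gguf"}},
	}
	for _, s := range servers {
		toks, err := s.Tokenize(context.Background(), "hello")
		if err != nil {
			fmt.Println("tokenize failed:", err)
			continue
		}
		fmt.Println(toks)
	}
}

With this layout, supporting another engine means adding another concrete type that satisfies the interface; the shared code path does not change.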