Mirror of https://github.com/ollama/ollama.git
llm: Separate llamaServer and ollamaServer code paths
Originally, llamaServer represented the old memory estimates, which could be used with either the old or the new engine, while ollamaServer was used only for the new estimates and the new engine. Because these implementations did not map directly to an engine, engine-specific code ended up in common code paths. Now that the new estimates are always used with the new engine, there is a direct mapping between server type and engine. This change moves most of the engine-specific code into the corresponding implementation to make things easier to understand.
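For orientation, here is a rough sketch of the resulting type layout pieced together from the diff below; field sets are abbreviated, and the type of the ggml field is an assumption inferred from the `ggml: f` initializer in the diff, not confirmed by it.

// llmServer carries the state shared by both engines.
type llmServer struct {
    done   chan error // Channel to signal when the process exits
    status *StatusWriter
    // ... other shared fields elided ...
}

// llamaServer runs the llama.cpp based engine.
type llamaServer struct {
    llmServer

    ggml *ggml.GGML // type assumed; the diff only shows the field being set from f
}

// ollamaServer runs the Ollama engine.
type ollamaServer struct {
    llmServer

    textProcessor model.TextProcessor // textProcessor handles text encoding/decoding
}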
@@ -89,7 +89,6 @@ type llmServer struct {
     done chan error // Channel to signal when the process exits
     status *StatusWriter
     options api.Options
-    numParallel int
     modelPath string

     loadRequest LoadRequest // Parameters used to initialize the runner
@@ -100,10 +99,6 @@ type llmServer struct {
     llamaModel *llama.Model
     llamaModelLock *sync.Mutex

-    // textProcessor handles text encoding/decoding for the model in the Ollama engine
-    // nil if this server is running the llama.cpp based engine
-    textProcessor model.TextProcessor
-
     totalLayers uint64
     loadStart time.Time // Record how long it took the model to load
     loadProgress float32
@@ -119,6 +114,8 @@ type llamaServer struct {

 type ollamaServer struct {
     llmServer
+
+    textProcessor model.TextProcessor // textProcessor handles text encoding/decoding
 }

 // LoadModel will load a model from disk. The model must be in the GGML format.
@@ -242,8 +239,6 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
         loadRequest: loadRequest,
         llamaModel: llamaModel,
         llamaModelLock: &sync.Mutex{},
-        textProcessor: textProcessor,
-        numParallel: numParallel,
         sem: semaphore.NewWeighted(int64(numParallel)),
         totalLayers: f.KV().BlockCount() + 1,
         loadStart: time.Now(),
@@ -278,7 +273,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
     }()

     if textProcessor != nil {
-        return &ollamaServer{llmServer: s}, nil
+        return &ollamaServer{llmServer: s, textProcessor: textProcessor}, nil
     } else {
         return &llamaServer{llmServer: s, ggml: f}, nil
     }
@@ -1681,69 +1676,60 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
     return e.Embedding, nil
 }

-type TokenizeRequest struct {
-    Content string `json:"content"`
-}
-
-type TokenizeResponse struct {
-    Tokens []int `json:"tokens"`
-}
-
-func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
+func (s *llamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
     s.llamaModelLock.Lock()
     defer s.llamaModelLock.Unlock()

-    if s.llamaModel != nil {
+    if s.llamaModel == nil {
+        return nil, fmt.Errorf("no tokenizer configured")
+    }
+
     return s.llamaModel.Tokenize(content, false, true)
 }
-    if s.textProcessor != nil {
+
+func (s *ollamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
     tokens, err := s.textProcessor.Encode(content, false)
     if err != nil {
         return nil, err
     }
+
     toks := make([]int, len(tokens))
     for i, t := range tokens {
         toks[i] = int(t)
     }
+
     return toks, nil
 }
-    // not reached
-    return nil, fmt.Errorf("no tokenizer configured")
-}

-type DetokenizeRequest struct {
-    Tokens []int `json:"tokens"`
-}
-
-type DetokenizeResponse struct {
-    Content string `json:"content"`
-}
-
-func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
+func (s *llamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
     s.llamaModelLock.Lock()
     defer s.llamaModelLock.Unlock()

-    if s.llamaModel != nil {
+    if s.llamaModel == nil {
+        return "", fmt.Errorf("no tokenizer configured")
+    }
+
     var resp string
     for _, token := range tokens {
         resp += s.llamaModel.TokenToPiece(token)
     }
+
     return resp, nil
 }
-    if s.textProcessor != nil {
+
+func (s *ollamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
     toks := make([]int32, len(tokens))
     for i, t := range tokens {
         toks[i] = int32(t)
     }
+
     content, err := s.textProcessor.Decode(toks)
     if err != nil {
         return "", err
     }
+
     return content, nil
 }
-    // not reached
-    return "", fmt.Errorf("no tokenizer configured")
-}

 func (s *llmServer) Close() error {
     s.llamaModelLock.Lock()
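With the split in place, common code no longer branches on which engine is loaded; Go method dispatch on the server type selects the engine-specific Tokenize/Detokenize. The following is a minimal, self-contained sketch of that pattern only; the type and field names here are illustrative stand-ins, not the identifiers from the codebase.

package main

import "fmt"

// tokenizer stands in for the interface callers hold (hypothetical name).
type tokenizer interface {
    Tokenize(content string) ([]int, error)
}

// shared plays the role of llmServer: state common to both engines.
type shared struct {
    modelPath string
}

// llamaStyle plays the role of llamaServer (llama.cpp engine).
type llamaStyle struct {
    shared
}

// ollamaStyle plays the role of ollamaServer (Ollama engine).
type ollamaStyle struct {
    shared
}

// Each server type carries its own engine-specific tokenizer,
// so no nil checks are needed in common code paths.
func (s *llamaStyle) Tokenize(content string) ([]int, error) {
    return []int{1, 2, 3}, nil // a real implementation would call the llama.cpp tokenizer
}

func (s *ollamaStyle) Tokenize(content string) ([]int, error) {
    return []int{4, 5, 6}, nil // a real implementation would encode via the text processor
}

func main() {
    var t tokenizer = &ollamaStyle{shared{modelPath: "model.gguf"}}
    toks, err := t.Tokenize("hello")
    fmt.Println(toks, err) // dispatches to the ollamaStyle implementation
}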