Mirror of https://github.com/ollama/ollama.git, synced 2025-03-26 09:42:10 +01:00
feat: add new Ollama engine using ggml through cgo

This change introduces a new way to run pretrained models. It introduces 3 high-level interfaces and a number of smaller helper interfaces to facilitate this.

- `model.Model` defines the interface for a model architecture. Models such as `llama` and `mllama`, which are provided as examples, can implement the model's forward propagation in the `Forward` method. This method will be called to generate completions. This interface can be found in `model/model.go`.
- `ml.Backend` defines the interface for a backend tensor library, in this case `ggml`. Among other things, a Backend is responsible for loading a pretrained model into hardware (GPU, CPU, etc.) and providing an interface for Models to access loaded tensors. This interface can be found in `ml/backend.go`.
- `ml.Tensor` defines the interface for a tensor and tensor operations.

This is the first implementation of the new engine. Follow-up PRs will implement more features:

- non-greedy sampling (#8410)
- integration with Ollama and KV caching (#8301)
- more model support (#9080), with more coming soon

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
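As a rough orientation, here is a minimal, self-contained sketch of how the three interfaces fit together. Aside from the names `model.Model`, `ml.Backend`, `ml.Tensor`, and the `Forward` method, every signature and helper type below is an assumption for illustration, not the real API:

```go
package main

import "fmt"

// Tensor stands in for ml.Tensor: a handle to backend-owned data plus
// tensor operations; this sketch keeps only Shape.
type Tensor interface {
	Shape() []int
}

// Backend stands in for ml.Backend: it loads pretrained weights onto
// hardware and exposes them to models; Get is an assumed accessor.
type Backend interface {
	Get(name string) Tensor
}

// Model stands in for model.Model: an architecture implements its
// forward propagation in Forward (signature simplified here).
type Model interface {
	Forward(b Backend, input Tensor) (Tensor, error)
}

// Toy concrete types so the sketch runs end to end.

type dense struct{ shape []int }

func (d dense) Shape() []int { return d.shape }

type memBackend map[string]Tensor

func (m memBackend) Get(name string) Tensor { return m[name] }

type echoModel struct{}

func (echoModel) Forward(b Backend, _ Tensor) (Tensor, error) {
	// a real Forward would run the architecture's compute graph;
	// this toy just looks up a weight by name
	return b.Get("blk.0.ffn_gate_exps.weight"), nil
}

func main() {
	b := memBackend{"blk.0.ffn_gate_exps.weight": dense{shape: []int{8, 14336, 4096}}}
	out, _ := echoModel{}.Forward(b, dense{shape: []int{1, 4096}})
	fmt.Println(out.Shape()) // [8 14336 4096]
}
```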
95 lines · 2.2 KiB · Go
package convert

import (
	"fmt"
	"io"
	"slices"
	"strings"

	"github.com/ollama/ollama/fs/ggml"
)

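// mixtralModel extends llamaModel with the MoE fields Mixtral reports in
// config.json; for reference, Mixtral-8x7B ships "num_local_experts": 8
// and "num_experts_per_tok": 2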
type mixtralModel struct {
	llamaModel
	NumLocalExperts    uint32 `json:"num_local_experts"`
	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
}

func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
	kv := p.llamaModel.KV(t)

	if p.NumLocalExperts > 0 {
		kv["llama.expert_count"] = p.NumLocalExperts
	}

	if p.NumExpertsPerToken > 0 {
		kv["llama.expert_used_count"] = p.NumExpertsPerToken
	}

	return kv
}

func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
	oldnew := []string{
		"model.layers", "blk",
		"w1", "ffn_gate_exps",
		"w2", "ffn_down_exps",
		"w3", "ffn_up_exps",
	}

	for i := range p.NumLocalExperts {
		oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
	}

	// group experts of the same layer (model.layers.%d) and type (w[123]) into a single tensor
	namer := strings.NewReplacer(oldnew...)
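	// e.g. "model.layers.0.block_sparse_moe.experts.7.w1.weight" and its
	// sibling experts all collapse to "blk.0.ffn_gate_exps.weight"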
	experts := make(map[string]experts)

	// merge experts into a single tensor while removing them from ts
	ts = slices.DeleteFunc(ts, func(t Tensor) bool {
		if !strings.Contains(t.Name(), ".block_sparse_moe.experts.") {
			return false
		}

		name := namer.Replace(t.Name())
		experts[name] = append(experts[name], t)
		return true
	})

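	// each merged tensor gains a leading dimension equal to the expert count,
	// e.g. 8 stacked [m, n] expert weights become a single [8, m, n] tensor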
	var out []ggml.Tensor
	for n, e := range experts {
		// TODO(mxyng): sanity check experts
		out = append(out, ggml.Tensor{
			Name:     n,
			Kind:     e[0].Kind(),
			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
			WriterTo: e,
		})
	}

	return append(out, p.llamaModel.Tensors(ts)...)
}

func (p *mixtralModel) Replacements() []string {
	return append(
		p.llamaModel.Replacements(),
		"block_sparse_moe.gate", "ffn_gate_inp",
	)
}

type experts []Tensor

func (e experts) WriteTo(w io.Writer) (int64, error) {
	// TODO(mxyng): experts _should_ be numerically sorted by expert but this should check
	for _, t := range e {
		// the canonical merged experts tensor stacks all experts along a new, 0 axis,
		// e.g. `tensor.Stack(0, e[0], e[1:]...)`, which requires allocating temporary buffers
		// this accomplishes the same thing by writing each expert tensor in sequence
		if _, err := t.WriteTo(w); err != nil {
			return 0, err
		}
	}

	return 0, nil
}
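For a quick sanity check of the renaming scheme above, a self-contained sketch (the `strings.NewReplacer` semantics are real; the tensor names and the 8-expert count are illustrative, matching Mixtral-8x7B):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// the same replacement table Tensors builds, hardcoded for 8 experts
	oldnew := []string{
		"model.layers", "blk",
		"w1", "ffn_gate_exps",
		"w2", "ffn_down_exps",
		"w3", "ffn_up_exps",
	}
	for i := 0; i < 8; i++ {
		oldnew = append(oldnew, fmt.Sprintf(".block_sparse_moe.experts.%d.", i), ".")
	}

	namer := strings.NewReplacer(oldnew...)

	// every expert of a given layer and projection maps to one merged name
	fmt.Println(namer.Replace("model.layers.0.block_sparse_moe.experts.0.w1.weight"))
	fmt.Println(namer.Replace("model.layers.0.block_sparse_moe.experts.7.w1.weight"))
	// both print: blk.0.ffn_gate_exps.weight
}
```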