diff --git a/convert/convert.go b/convert/convert.go index 44783b6e8..ed95bb11c 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -9,7 +9,7 @@ import ( "log/slog" "strings" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) type ModelParameters struct { @@ -27,8 +27,8 @@ type AdapterParameters struct { } `json:"lora_parameters"` } -func (ModelParameters) KV(t *Tokenizer) llm.KV { - kv := llm.KV{ +func (ModelParameters) KV(t *Tokenizer) ggml.KV { + kv := ggml.KV{ "general.file_type": uint32(1), "general.quantization_version": uint32(2), "tokenizer.ggml.pre": t.Pre, @@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV { return kv } -func (p AdapterParameters) KV() llm.KV { +func (p AdapterParameters) KV() ggml.KV { var alpha float32 if p.LoraParameters.Alpha == 0 { alpha = float32(p.Alpha) @@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV { alpha = p.LoraParameters.Alpha } - kv := llm.KV{ + kv := ggml.KV{ "adapter.lora.alpha": alpha, "adapter.type": "lora", "general.file_type": uint32(1), @@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string { } } -func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { - return llm.WriteGGUF(ws, kv, ts) +func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error { + return ggml.WriteGGUF(ws, kv, ts) } -func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { - return llm.WriteGGUF(ws, kv, ts) +func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error { + return ggml.WriteGGUF(ws, kv, ts) } type ModelConverter interface { // KV maps parameters to LLM key-values - KV(*Tokenizer) llm.KV + KV(*Tokenizer) ggml.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. - Tensors([]Tensor) []llm.Tensor + Tensors([]Tensor) []ggml.Tensor // Replacements returns a list of string pairs to replace in tensor names. // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details Replacements() []string @@ -99,7 +99,7 @@ type ModelConverter interface { // specialTokenTypes returns any special token types the model uses specialTokenTypes() []string // writeFile writes the model to the provided io.WriteSeeker - writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error + writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error } type moreParser interface { @@ -108,17 +108,17 @@ type moreParser interface { type AdapterConverter interface { // KV maps parameters to LLM key-values - KV(llm.KV) llm.KV + KV(ggml.KV) ggml.KV // Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here. - Tensors([]Tensor) []llm.Tensor + Tensors([]Tensor) []ggml.Tensor // Replacements returns a list of string pairs to replace in tensor names. 
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details Replacements() []string - writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error + writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error } -func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error { +func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error { bts, err := fs.ReadFile(fsys, "adapter_config.json") if err != nil { return err diff --git a/convert/convert_bert.go b/convert/convert_bert.go index ea5facaa5..8575652aa 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -8,7 +8,7 @@ import ( "slices" "strings" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) type bertModel struct { @@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error { return nil } -func (p *bertModel) KV(t *Tokenizer) llm.KV { +func (p *bertModel) KV(t *Tokenizer) ggml.KV { kv := p.ModelParameters.KV(t) kv["general.architecture"] = "bert" kv["bert.attention.causal"] = false @@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV { return kv } -func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor +func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor { + var out []ggml.Tensor for _, t := range ts { if slices.Contains([]string{ "embeddings.position_ids", @@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor { continue } - out = append(out, llm.Tensor{ + out = append(out, ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index b88652947..6c04145ff 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -6,7 +6,7 @@ import ( "github.com/pdevine/tensor" "github.com/pdevine/tensor/native" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) type gemmaModel struct { @@ -23,7 +23,7 @@ type gemmaModel struct { var _ ModelConverter = (*gemmaModel)(nil) -func (p *gemmaModel) KV(t *Tokenizer) llm.KV { +func (p *gemmaModel) KV(t *Tokenizer) ggml.KV { kv := p.ModelParameters.KV(t) kv["general.architecture"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings @@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV { return kv } -func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor +func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor { + var out []ggml.Tensor for _, t := range ts { if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) } - out = append(out, llm.Tensor{ + out = append(out, ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go index 0f98c1e38..4917e42cd 100644 --- a/convert/convert_gemma2.go +++ b/convert/convert_gemma2.go @@ -1,8 +1,6 @@ package convert -import ( - "github.com/ollama/ollama/llm" -) +import "github.com/ollama/ollama/fs/ggml" type gemma2Model struct { gemmaModel @@ -11,7 +9,7 @@ type gemma2Model struct { FinalLogitSoftcap float32 `json:"final_logit_softcapping"` } -func (p *gemma2Model) KV(t *Tokenizer) llm.KV { +func (p *gemma2Model) KV(t *Tokenizer) ggml.KV { kv := p.ModelParameters.KV(t) kv["general.architecture"] = "gemma2" kv["gemma2.context_length"] = p.MaxPositionEmbeddings diff --git a/convert/convert_gemma2_adapter.go b/convert/convert_gemma2_adapter.go index a89a25f4c..3494aa3f9 100644 --- a/convert/convert_gemma2_adapter.go +++ b/convert/convert_gemma2_adapter.go @@ -6,7 +6,7 @@ import ( "github.com/pdevine/tensor" 
"github.com/pdevine/tensor/native" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) type gemma2Adapter struct { @@ -15,14 +15,14 @@ type gemma2Adapter struct { var _ AdapterConverter = (*gemma2Adapter)(nil) -func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV { +func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV { kv := p.AdapterParameters.KV() kv["general.architecture"] = "gemma2" return kv } -func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor +func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor { + var out []ggml.Tensor for _, t := range ts { shape := t.Shape() if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || @@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor { t.SetRepacker(p.repack) } - out = append(out, llm.Tensor{ + out = append(out, ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 5dedb829d..e4422f41a 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -9,7 +9,7 @@ import ( "github.com/pdevine/tensor" "github.com/pdevine/tensor/native" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) type llamaModel struct { @@ -46,7 +46,7 @@ type llamaModel struct { var _ ModelConverter = (*llamaModel)(nil) -func (p *llamaModel) KV(t *Tokenizer) llm.KV { +func (p *llamaModel) KV(t *Tokenizer) ggml.KV { kv := p.ModelParameters.KV(t) kv["general.architecture"] = "llama" kv["llama.vocab_size"] = p.VocabSize @@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV { return kv } -func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor +func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor { + var out []ggml.Tensor if p.RopeScaling.factors != nil { - out = append(out, llm.Tensor{ + out = append(out, ggml.Tensor{ Name: "rope_freqs.weight", Kind: 0, Shape: []uint64{uint64(len(p.RopeScaling.factors))}, @@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor { t.SetRepacker(p.repack) } - out = append(out, llm.Tensor{ + out = append(out, ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_llama_adapter.go b/convert/convert_llama_adapter.go index 08ddee10a..718ef047e 100644 --- a/convert/convert_llama_adapter.go +++ b/convert/convert_llama_adapter.go @@ -7,7 +7,7 @@ import ( "github.com/pdevine/tensor" "github.com/pdevine/tensor/native" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) type llamaAdapter struct { @@ -18,7 +18,7 @@ type llamaAdapter struct { var _ AdapterConverter = (*llamaAdapter)(nil) -func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV { +func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV { kv := p.AdapterParameters.KV() kv["general.architecture"] = "llama" kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"] @@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV { return kv } -func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor +func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor { + var out []ggml.Tensor for _, t := range ts { shape := t.Shape() if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || @@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor { t.SetRepacker(p.repack) } - out = append(out, llm.Tensor{ + out = append(out, ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: shape, diff --git a/convert/convert_mixtral.go 
b/convert/convert_mixtral.go index 43b7c8b10..95a289f76 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -6,7 +6,7 @@ import ( "slices" "strings" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) type mixtralModel struct { @@ -15,7 +15,7 @@ type mixtralModel struct { NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -func (p *mixtralModel) KV(t *Tokenizer) llm.KV { +func (p *mixtralModel) KV(t *Tokenizer) ggml.KV { kv := p.llamaModel.KV(t) if p.NumLocalExperts > 0 { @@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV { return kv } -func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor { +func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor { oldnew := []string{ "model.layers", "blk", "w1", "ffn_gate_exps", @@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor { return true }) - var out []llm.Tensor + var out []ggml.Tensor for n, e := range experts { // TODO(mxyng): sanity check experts - out = append(out, llm.Tensor{ + out = append(out, ggml.Tensor{ Name: n, Kind: e[0].Kind(), Shape: append([]uint64{uint64(len(e))}, e[0].Shape()...), diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 3de0d4049..4f25737b1 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -8,7 +8,7 @@ import ( "strings" "sync" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) type phi3Model struct { @@ -37,7 +37,7 @@ type phi3Model struct { var _ ModelConverter = (*phi3Model)(nil) -func (p *phi3Model) KV(t *Tokenizer) llm.KV { +func (p *phi3Model) KV(t *Tokenizer) ggml.KV { kv := p.ModelParameters.KV(t) kv["general.architecture"] = "phi3" kv["phi3.context_length"] = p.MaxPositionEmbeddings @@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV { return kv } -func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor { +func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor { var addRopeFactors sync.Once - out := make([]llm.Tensor, 0, len(ts)+2) + out := make([]ggml.Tensor, 0, len(ts)+2) for _, t := range ts { if strings.HasPrefix(t.Name(), "blk.0.") { addRopeFactors.Do(func() { - out = append(out, llm.Tensor{ + out = append(out, ggml.Tensor{ Name: "rope_factors_long.weight", Kind: 0, Shape: []uint64{uint64(len(p.RopeScaling.LongFactor))}, WriterTo: p.RopeScaling.LongFactor, - }, llm.Tensor{ + }, ggml.Tensor{ Name: "rope_factors_short.weight", Kind: 0, Shape: []uint64{uint64(len(p.RopeScaling.ShortFactor))}, @@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor { }) } - out = append(out, llm.Tensor{ + out = append(out, ggml.Tensor{ Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), diff --git a/convert/convert_test.go b/convert/convert_test.go index 48a2b1d45..81ba9c9ea 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -20,7 +20,7 @@ import ( "golang.org/x/exp/maps" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) type tensorData struct { @@ -29,7 +29,7 @@ type tensorData struct { Shape []int `json:"shape"` } -func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) { +func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) { t.Helper() f, err := os.CreateTemp(t.TempDir(), "f16") @@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) { } t.Cleanup(func() { r.Close() }) - m, _, err := llm.DecodeGGML(r, math.MaxInt) + m, _, err := ggml.Decode(r, math.MaxInt) if err != nil { t.Fatal(err) } @@ -60,7 +60,7 @@ func convertFull(t 
*testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) { return r, m.KV(), m.Tensors() } -func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string { +func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string { actual := make(map[string]string) for k, v := range kv { if s, ok := v.(json.Marshaler); !ok { @@ -330,7 +330,7 @@ func TestConvertAdapter(t *testing.T) { } defer r.Close() - m, _, err := llm.DecodeGGML(r, math.MaxInt) + m, _, err := ggml.Decode(r, math.MaxInt) if err != nil { t.Fatal(err) } diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 5bb799206..781fb118e 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -1,12 +1,12 @@ package ggml import ( - "cmp" "encoding/binary" "errors" "fmt" "io" "log/slog" + "slices" "strings" "github.com/ollama/ollama/fs/util/bufioutil" @@ -25,7 +25,15 @@ type model interface { type KV map[string]any func (kv KV) Architecture() string { - return cmp.Or(kv.String("general.architecture"), "unknown") + return kv.String("general.architecture", "unknown") +} + +func (kv KV) Kind() string { + return kv.String("general.kind", "unknown") +} + +func (kv KV) ParameterCount() uint64 { + return keyValue[uint64](kv, "general.parameter_count") } func (kv KV) FileType() fileType { @@ -36,6 +44,50 @@ func (kv KV) FileType() fileType { return fileTypeUnknown } +func (kv KV) BlockCount() uint64 { + return uint64(kv.Uint("block_count")) +} + +func (kv KV) EmbeddingLength() uint64 { + return uint64(kv.Uint("embedding_length")) +} + +func (kv KV) HeadCount() uint64 { + return uint64(kv.Uint("attention.head_count")) +} + +func (kv KV) HeadCountKV() uint64 { + return uint64(kv.Uint("attention.head_count_kv", 1)) +} + +func (kv KV) EmbeddingHeadCount() uint64 { + if heads := kv.HeadCount(); heads > 0 { + return kv.EmbeddingLength() / heads + } + + return 0 +} + +func (kv KV) EmbeddingHeadCountK() uint64 { + return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount()))) +} + +func (kv KV) EmbeddingHeadCountV() uint64 { + return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount()))) +} + +func (kv KV) GQA() uint64 { + return kv.HeadCount() / kv.HeadCountKV() +} + +func (kv KV) ContextLength() uint64 { + return uint64(kv.Uint("context_length")) +} + +func (kv KV) ChatTemplate() string { + return kv.String("tokenizer.chat_template") +} + func (kv KV) String(key string, defaultValue ...string) string { return keyValue(kv, key, append(defaultValue, "")...) } @@ -68,7 +120,7 @@ func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 { return s } -func keyValue[T string | uint32 | float32 | *array](kv KV, key string, defaultValue ...T) T { +func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T { if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") { key = kv.Architecture() + "." 
+ key } @@ -107,7 +159,7 @@ func (ts Tensors) Layers() map[string]Layer { type Layer map[string]*Tensor -func (l Layer) size() (size uint64) { +func (l Layer) Size() (size uint64) { for _, t := range l { size += t.Size() } @@ -243,7 +295,7 @@ const ( var ErrUnsupportedFormat = errors.New("unsupported model format") -func DetectGGMLType(b []byte) string { +func DetectContentType(b []byte) string { switch binary.LittleEndian.Uint32(b[:4]) { case FILE_MAGIC_GGML: return "ggml" @@ -260,12 +312,12 @@ func DetectGGMLType(b []byte) string { } } -// DecodeGGML decodes a GGML model from the given reader. +// Decode decodes a GGML model from the given reader. // // It collects array values for arrays with a size less than or equal to // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If // the maxArraySize is negative, all arrays are collected. -func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { +func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { if maxArraySize == 0 { maxArraySize = 1024 } @@ -303,3 +355,202 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { model: model, }, offset, nil } + +func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) { + embedding := llm.KV().EmbeddingLength() + heads := llm.KV().HeadCount() + headsKV := llm.KV().HeadCountKV() + vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size) + + embeddingHeads := llm.KV().EmbeddingHeadCount() + embeddingHeadsK := llm.KV().EmbeddingHeadCountK() + embeddingHeadsV := llm.KV().EmbeddingHeadCountV() + + layers := llm.Tensors().Layers() + + bytesPerElement := kvCacheBytesPerElement(kvCacheType) + kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement) + + switch llm.KV().Architecture() { + case "llama": + fullOffload = max( + 4*batch*(1+4*embedding+context*(1+heads)), + 4*batch*(embedding+vocab), + ) + + partialOffload = 4 * batch * embedding + partialOffload += max( + 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV), + 4*batch*(embedding+vocab)+embedding*vocab*105/128, + ) + + if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok { + // mixtral 8x22b + ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32)) + partialOffload = max( + 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV), + 4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch), + ) + } else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok { + // mixtral 8x7b + ffnGateWeight1 := ffnGateWeight.Shape[1] + fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1) + partialOffload = max( + 4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16, + 4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16), + ) + } + case "mllama": + var visionTokens, tiles uint64 = 1601, 4 + + if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok { + kv = headsKV * + (embeddingHeadsK + embeddingHeadsV) * // one for K, one for V + (2* // sizeof(float16) + (llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers + context + + 4* // sizeof(float32) + 
uint64(crossAttentionLayers.size)* // num cross attention layers + visionTokens* + tiles) + } + + fullOffload = max( + 4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)), + // vocab graph + 4*batch*(embedding+vocab), + ) + + var ropeFreqsCount uint64 + if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok { + if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok { + ropeFreqsCount = ropeFreqsWeights.parameters() + } + } + + partialOffload = max( + 4*(batch* + (2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+ + ropeFreqsCount+ + embeddingHeadsK*context*headsKV), + // vocab graph + 4*batch*(embedding+vocab)+embedding*vocab*105/128, + ) + case "gemma", "gemma2": + fullOffload = max( + 4*batch*(embedding+vocab), + 4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads), + ) + + partialOffload = max( + 4*embedding*batch+embedding*vocab*105/128+4*vocab*batch, + 4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+ + 4*embeddingHeadsK*context*8+ + embedding*embeddingHeadsK*heads*9/16, + ) + case "command-r": + fullOffload = max( + 4*batch*(embedding+vocab), + 4*batch*(2+4*embedding+context*(1+heads)), + ) + + partialOffload = max( + 4*batch*(embedding+vocab)+embedding*vocab*105/128, + 4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16, + ) + case "qwen2": + fullOffload = max( + 4*batch*(embedding+vocab), + 4*batch*(1+2*embedding+context+context*heads), + ) + + partialOffload = max( + 4*batch*(embedding+vocab)+embedding*vocab*105/128, + 4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)), + ) + case "phi2": + fullOffload = max( + 4*batch*(embedding+vocab), + 4*batch*(1+4*embedding+context+context*heads), + ) + + partialOffload = max( + 4*batch*(2*embedding+vocab)+embedding*vocab*105/128, + 4*batch*(2+3*embedding+context+context*heads), + ) + case "stablelm": + fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2) + partialOffload = max( + 4*batch*(vocab+2*embedding), + fullOffload, + ) + case "deepseek2": + fullOffload = max( + 4*batch*(3*embedding+vocab), + 4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV), + ) + + partialOffload = max( + 4*batch*(3*embedding+vocab)+embedding*vocab*105/128, + 4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16, + ) + case "chatglm": + fullOffload = 4 * batch * (embedding + vocab) + partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128 + if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok { + fullOffload = max( + fullOffload, + 4*batch*(2+ + 2*embedding+ + context+ + context*heads+ + embeddingHeadsK*heads+ + qkvBias.Shape[0]), + ) + + partialOffload = max( + partialOffload, + 4*batch*(1+ + 2*embedding+ + embeddingHeadsK*heads+ + context+ + context*heads)+ + 4*embeddingHeadsK*context+ + 4*context*embeddingHeadsK+ + 4*qkvBias.Shape[0], + ) + } + } + + return +} + +// SupportsKVCacheType checks if the requested cache type is supported +func (llm GGML) SupportsKVCacheType(cacheType string) bool { + return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType) +} + +// SupportsFlashAttention checks if the model supports flash attention +func (llm GGML) SupportsFlashAttention() bool { + _, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())] + if isEmbedding { + return false + } + + // Check head counts match and are non-zero + headCountK := llm.KV().EmbeddingHeadCountK() + headCountV := 
llm.KV().EmbeddingHeadCountV() + return headCountK != 0 && headCountV != 0 && headCountK == headCountV +} + +// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type +func kvCacheBytesPerElement(cacheType string) float64 { + switch cacheType { + case "q8_0": + return 1 // 1/2 of fp16 + case "q4_0": + return 0.5 // 1/4 of fp16 + default: + return 2 // f16 (default) + } +} diff --git a/llm/filetype.go b/llm/filetype.go deleted file mode 100644 index 10f3d670f..000000000 --- a/llm/filetype.go +++ /dev/null @@ -1,185 +0,0 @@ -package llm - -import "fmt" - -type fileType uint32 - -const ( - fileTypeF32 fileType = iota - fileTypeF16 - fileTypeQ4_0 - fileTypeQ4_1 - fileTypeQ4_1_F16 - fileTypeQ4_2 // unused - fileTypeQ4_3 // unused - fileTypeQ8_0 - fileTypeQ5_0 - fileTypeQ5_1 - fileTypeQ2_K - fileTypeQ3_K_S - fileTypeQ3_K_M - fileTypeQ3_K_L - fileTypeQ4_K_S - fileTypeQ4_K_M - fileTypeQ5_K_S - fileTypeQ5_K_M - fileTypeQ6_K - fileTypeIQ2_XXS - fileTypeIQ2_XS - fileTypeQ2_K_S - fileTypeIQ3_XS - fileTypeIQ3_XXS - fileTypeIQ1_S - fileTypeIQ4_NL - fileTypeIQ3_S - fileTypeIQ3_M - fileTypeIQ2_S - fileTypeIQ2_M - fileTypeIQ4_XS - fileTypeIQ1_M - fileTypeBF16 - - fileTypeUnknown -) - -func ParseFileType(s string) (fileType, error) { - switch s { - case "F32": - return fileTypeF32, nil - case "F16": - return fileTypeF16, nil - case "Q4_0": - return fileTypeQ4_0, nil - case "Q4_1": - return fileTypeQ4_1, nil - case "Q4_1_F16": - return fileTypeQ4_1_F16, nil - case "Q8_0": - return fileTypeQ8_0, nil - case "Q5_0": - return fileTypeQ5_0, nil - case "Q5_1": - return fileTypeQ5_1, nil - case "Q2_K": - return fileTypeQ2_K, nil - case "Q3_K_S": - return fileTypeQ3_K_S, nil - case "Q3_K_M": - return fileTypeQ3_K_M, nil - case "Q3_K_L": - return fileTypeQ3_K_L, nil - case "Q4_K_S": - return fileTypeQ4_K_S, nil - case "Q4_K_M": - return fileTypeQ4_K_M, nil - case "Q5_K_S": - return fileTypeQ5_K_S, nil - case "Q5_K_M": - return fileTypeQ5_K_M, nil - case "Q6_K": - return fileTypeQ6_K, nil - case "IQ2_XXS": - return fileTypeIQ2_XXS, nil - case "IQ2_XS": - return fileTypeIQ2_XS, nil - case "Q2_K_S": - return fileTypeQ2_K_S, nil - case "IQ3_XS": - return fileTypeIQ3_XS, nil - case "IQ3_XXS": - return fileTypeIQ3_XXS, nil - case "IQ1_S": - return fileTypeIQ1_S, nil - case "IQ4_NL": - return fileTypeIQ4_NL, nil - case "IQ3_S": - return fileTypeIQ3_S, nil - case "IQ3_M": - return fileTypeIQ3_M, nil - case "IQ2_S": - return fileTypeIQ2_S, nil - case "IQ4_XS": - return fileTypeIQ4_XS, nil - case "IQ2_M": - return fileTypeIQ2_M, nil - case "IQ1_M": - return fileTypeIQ1_M, nil - case "BF16": - return fileTypeBF16, nil - default: - return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s) - } -} - -func (t fileType) String() string { - switch t { - case fileTypeF32: - return "F32" - case fileTypeF16: - return "F16" - case fileTypeQ4_0: - return "Q4_0" - case fileTypeQ4_1: - return "Q4_1" - case fileTypeQ4_1_F16: - return "Q4_1_F16" - case fileTypeQ8_0: - return "Q8_0" - case fileTypeQ5_0: - return "Q5_0" - case fileTypeQ5_1: - return "Q5_1" - case fileTypeQ2_K: - return "Q2_K" - case fileTypeQ3_K_S: - return "Q3_K_S" - case fileTypeQ3_K_M: - return "Q3_K_M" - case fileTypeQ3_K_L: - return "Q3_K_L" - case fileTypeQ4_K_S: - return "Q4_K_S" - case fileTypeQ4_K_M: - return "Q4_K_M" - case fileTypeQ5_K_S: - return "Q5_K_S" - case fileTypeQ5_K_M: - return "Q5_K_M" - case fileTypeQ6_K: - return "Q6_K" - case fileTypeIQ2_XXS: - return "IQ2_XXS" - case fileTypeIQ2_XS: - return "IQ2_XS" - case 
fileTypeQ2_K_S: - return "Q2_K_S" - case fileTypeIQ3_XS: - return "IQ3_XS" - case fileTypeIQ3_XXS: - return "IQ3_XXS" - case fileTypeIQ1_S: - return "IQ1_S" - case fileTypeIQ4_NL: - return "IQ4_NL" - case fileTypeIQ3_S: - return "IQ3_S" - case fileTypeIQ3_M: - return "IQ3_M" - case fileTypeIQ2_S: - return "IQ2_S" - case fileTypeIQ4_XS: - return "IQ4_XS" - case fileTypeIQ2_M: - return "IQ2_M" - case fileTypeIQ1_M: - return "IQ1_M" - case fileTypeBF16: - return "BF16" - default: - return "unknown" - } -} - -func (t fileType) Value() uint32 { - return uint32(t) -} diff --git a/llm/ggla.go b/llm/ggla.go deleted file mode 100644 index ec0a5941c..000000000 --- a/llm/ggla.go +++ /dev/null @@ -1,149 +0,0 @@ -package llm - -import ( - "encoding/binary" - "errors" - "io" - "slices" -) - -type containerGGLA struct { - version uint32 -} - -func (c *containerGGLA) Name() string { - return "ggla" -} - -func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) { - if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil { - return nil, err - } - - switch c.version { - case 1: - default: - return nil, errors.New("invalid version") - } - - model := newGGLA(c) - err := model.decode(rs) - return model, err -} - -type ggla struct { - *containerGGLA - - kv KV - tensors []*Tensor - - tensorOffset uint64 -} - -func newGGLA(container *containerGGLA) *ggla { - return &ggla{ - containerGGLA: container, - kv: make(KV), - } -} - -func (llm *ggla) KV() KV { - return llm.kv -} - -func (llm *ggla) Tensors() *Tensors { - return &Tensors{ - Items: llm.tensors, - Offset: llm.tensorOffset, - } -} - -func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) { - var r uint32 - if err := binary.Read(rs, binary.LittleEndian, &r); err != nil { - return err - } - llm.kv["r"] = r - - var alpha uint32 - if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil { - return err - } - llm.kv["alpha"] = alpha - - offset, err := rs.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - llm.tensorOffset = uint64(offset) - - for { - var dims uint32 - if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil { - if errors.Is(err, io.EOF) { - return nil - } - return err - } - - defer func() { - if errors.Is(retErr, io.EOF) { - retErr = io.ErrUnexpectedEOF - } - }() - - var namesize uint32 - if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil { - return err - } - - var t Tensor - if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil { - return err - } - - t.Shape = make([]uint64, dims) - for i := 0; uint32(i) < dims; i++ { - var shape32 uint32 - if err := binary.Read(rs, binary.LittleEndian, &shape32); err != nil { - return err - } - - t.Shape[i] = uint64(shape32) - } - - // ggla tensor shape is reversed - // ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44 - slices.Reverse(t.Shape) - - name := make([]byte, namesize) - if err := binary.Read(rs, binary.LittleEndian, &name); err != nil { - return err - } - - t.Name = string(name) - - offset, err := rs.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil { - return err - } - - offset, err = rs.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - t.Offset = uint64(offset) - - if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil { - return err - } - - llm.tensors = append(llm.tensors, &t) - } -} diff --git a/llm/ggml.go b/llm/ggml.go deleted file mode 
100644 index a4cf6f501..000000000 --- a/llm/ggml.go +++ /dev/null @@ -1,561 +0,0 @@ -package llm - -import ( - "encoding/binary" - "errors" - "fmt" - "io" - "slices" - "strings" - "sync" - - "github.com/ollama/ollama/fs/util/bufioutil" -) - -type GGML struct { - container - model -} - -type model interface { - KV() KV - Tensors() *Tensors -} - -type KV map[string]any - -func (kv KV) u64(key string) uint64 { - switch v := kv[key].(type) { - case uint64: - return v - case uint32: - return uint64(v) - case float64: - return uint64(v) - default: - return 0 - } -} - -func (kv KV) Architecture() string { - if s, ok := kv["general.architecture"].(string); ok { - return s - } - - return "unknown" -} - -func (kv KV) Kind() string { - if s, ok := kv["general.type"].(string); ok { - return s - } - - return "unknown" -} - -func (kv KV) ParameterCount() uint64 { - return kv.u64("general.parameter_count") -} - -func (kv KV) FileType() fileType { - if u64 := kv.u64("general.file_type"); u64 > 0 { - return fileType(uint32(u64)) - } - - return fileTypeUnknown -} - -func (kv KV) BlockCount() uint64 { - return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture())) -} - -func (kv KV) HeadCount() uint64 { - return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture())) -} - -func (kv KV) HeadCountKV() uint64 { - if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 { - return headCountKV - } - - return 1 -} - -func (kv KV) EmbeddingHeadCount() uint64 { - if heads := kv.HeadCount(); heads > 0 { - return kv.EmbeddingLength() / kv.HeadCount() - } - - return 0 -} - -func (kv KV) EmbeddingHeadCountK() uint64 { - if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 { - return k - } - - return kv.EmbeddingHeadCount() -} - -func (kv KV) EmbeddingHeadCountV() uint64 { - if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 { - return v - } - - return kv.EmbeddingHeadCount() -} - -func (kv KV) GQA() uint64 { - return kv.HeadCount() / kv.HeadCountKV() -} - -func (kv KV) EmbeddingLength() uint64 { - return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture())) -} - -func (kv KV) ContextLength() uint64 { - return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture())) -} - -func (kv KV) ChatTemplate() string { - s, _ := kv["tokenizer.chat_template"].(string) - return s -} - -type Tensors struct { - Items []*Tensor - Offset uint64 - - layers map[string]Layer - layersOnce sync.Once -} - -func (ts *Tensors) Layers() map[string]Layer { - ts.layersOnce.Do(func() { - ts.layers = make(map[string]Layer) - for _, t := range ts.Items { - parts := strings.Split(t.Name, ".") - if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 { - if len(parts) > index+2 { - // blk and mm should have a number after them, join it - parts = append( - []string{strings.Join(parts[:index+2], ".")}, - parts[index+2:]...) 
- } - } - - if _, ok := ts.layers[parts[0]]; !ok { - ts.layers[parts[0]] = make(Layer) - } - - ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t - } - }) - - return ts.layers -} - -type Layer map[string]*Tensor - -func (l Layer) size() (size uint64) { - for _, t := range l { - size += t.Size() - } - - return size -} - -type Tensor struct { - Name string `json:"name"` - Kind uint32 `json:"kind"` - Offset uint64 `json:"-"` - - // Shape is the number of elements in each dimension - Shape []uint64 `json:"shape"` - - io.WriterTo `json:"-"` -} - -func (t Tensor) block() (n int) { - if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil { - return -1 - } - - return -} - -func (t Tensor) blockSize() uint64 { - switch t.Kind { - case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16 - return 1 - case 2, 3, 4, 5, 6, 7, 8, 9, 20: // Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, IQ4_NL - return 32 - default: // All others - return 256 - } -} - -func (t Tensor) typeSize() uint64 { - blockSize := t.blockSize() - - switch t.Kind { - case 0: // FP32 - return 4 - case 1: // FP16 - return 2 - case 2: // Q4_0 - return 2 + blockSize/2 - case 3: // Q4_1 - return 2 + 2 + blockSize/2 - case 6: // Q5_0 - return 2 + 4 + blockSize/2 - case 7: // Q5_1 - return 2 + 2 + 4 + blockSize/2 - case 8: // Q8_0 - return 2 + blockSize - case 9: // Q8_1 - return 4 + 4 + blockSize - case 10: // Q2_K - return blockSize/16 + blockSize/4 + 2 + 2 - case 11: // Q3_K - return blockSize/8 + blockSize/4 + 12 + 2 - case 12: // Q4_K - return 2 + 2 + 12 + blockSize/2 - case 13: // Q5_K - return 2 + 2 + 12 + blockSize/8 + blockSize/2 - case 14: // Q6_K - return blockSize/2 + blockSize/4 + blockSize/16 + 2 - case 15: // Q8_K - return 2 + blockSize + 2*blockSize/16 - case 16: // IQ2_XXS - return 2 + 2*blockSize/8 - case 17: // IQ2_XS - return 2 + 2*blockSize/8 + blockSize/32 - case 18: // IQ3_XXS - return 2 + blockSize/4 + blockSize/8 - case 19: // IQ1_S - return 2 + blockSize/8 + blockSize/16 - case 20: // IQ4_NL - return 2 + blockSize/2 - case 21: // IQ3_S - return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4 - case 22: // IQ2_S - return 2 + blockSize/4 + blockSize/16 - case 23: // IQ4_XS - return 2 + 2 + blockSize/2 + blockSize/64 - case 24: // I8 - return 1 - case 25: // I16 - return 2 - case 26: // I32 - return 4 - case 27: // I64 - return 8 - case 28: // F64 - return 8 - case 29: // IQ1_M - return blockSize/8 + blockSize/16 + blockSize/32 - case 30: // BF16 - return 2 - default: - return 0 - } -} - -func (t Tensor) parameters() uint64 { - var count uint64 = 1 - for _, n := range t.Shape { - count *= n - } - return count -} - -func (t Tensor) Size() uint64 { - return t.parameters() * t.typeSize() / t.blockSize() -} - -type container interface { - Name() string - Decode(io.ReadSeeker) (model, error) -} - -const ( - // Magic constant for `ggml` files (unversioned). - FILE_MAGIC_GGML = 0x67676d6c - // Magic constant for `ggml` files (versioned, ggmf). - FILE_MAGIC_GGMF = 0x67676d66 - // Magic constant for `ggml` files (versioned, ggjt). - FILE_MAGIC_GGJT = 0x67676a74 - // Magic constant for `ggla` files (LoRA adapter). 
- FILE_MAGIC_GGLA = 0x67676C61 - // Magic constant for `gguf` files (versioned, gguf) - FILE_MAGIC_GGUF_LE = 0x46554747 - FILE_MAGIC_GGUF_BE = 0x47475546 -) - -var ErrUnsupportedFormat = errors.New("unsupported model format") - -func DetectGGMLType(b []byte) string { - switch binary.LittleEndian.Uint32(b[:4]) { - case FILE_MAGIC_GGML: - return "ggml" - case FILE_MAGIC_GGMF: - return "ggmf" - case FILE_MAGIC_GGJT: - return "ggjt" - case FILE_MAGIC_GGLA: - return "ggla" - case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE: - return "gguf" - default: - return "" - } -} - -// DecodeGGML decodes a GGML model from the given reader. -// -// It collects array values for arrays with a size less than or equal to -// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If -// the maxArraySize is negative, all arrays are collected. -func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) { - if maxArraySize == 0 { - maxArraySize = 1024 - } - - rs = bufioutil.NewBufferedSeeker(rs, 32<<10) - - var magic uint32 - if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { - return nil, 0, err - } - - var c container - switch magic { - case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT: - return nil, 0, ErrUnsupportedFormat - case FILE_MAGIC_GGLA: - c = &containerGGLA{} - case FILE_MAGIC_GGUF_LE: - c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize} - case FILE_MAGIC_GGUF_BE: - c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize} - default: - return nil, 0, errors.New("invalid file magic") - } - - model, err := c.Decode(rs) - if err != nil { - return nil, 0, err - } - - offset, err := rs.Seek(0, io.SeekCurrent) - if err != nil { - return nil, 0, err - } - - // final model type - return &GGML{ - container: c, - model: model, - }, offset, nil -} - -func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) { - embedding := llm.KV().EmbeddingLength() - heads := llm.KV().HeadCount() - headsKV := llm.KV().HeadCountKV() - vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size) - - embeddingHeads := llm.KV().EmbeddingHeadCount() - embeddingHeadsK := llm.KV().EmbeddingHeadCountK() - embeddingHeadsV := llm.KV().EmbeddingHeadCountV() - - layers := llm.Tensors().Layers() - - bytesPerElement := kvCacheBytesPerElement(kvCacheType) - kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement) - - switch llm.KV().Architecture() { - case "llama": - fullOffload = max( - 4*batch*(1+4*embedding+context*(1+heads)), - 4*batch*(embedding+vocab), - ) - - partialOffload = 4 * batch * embedding - partialOffload += max( - 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV), - 4*batch*(embedding+vocab)+embedding*vocab*105/128, - ) - - if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok { - // mixtral 8x22b - ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32)) - partialOffload = max( - 3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV), - 4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch), - ) - } else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok { - // mixtral 8x7b - ffnGateWeight1 := ffnGateWeight.Shape[1] - fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1) - partialOffload = max( - 
4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16, - 4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16), - ) - } - case "mllama": - var visionTokens, tiles uint64 = 1601, 4 - - if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok { - kv = headsKV * - (embeddingHeadsK + embeddingHeadsV) * // one for K, one for V - (2* // sizeof(float16) - (llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers - context + - 4* // sizeof(float32) - uint64(crossAttentionLayers.size)* // num cross attention layers - visionTokens* - tiles) - } - - fullOffload = max( - 4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)), - // vocab graph - 4*batch*(embedding+vocab), - ) - - var ropeFreqsCount uint64 - if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok { - if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok { - ropeFreqsCount = ropeFreqsWeights.parameters() - } - } - - partialOffload = max( - 4*(batch* - (2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+ - ropeFreqsCount+ - embeddingHeadsK*context*headsKV), - // vocab graph - 4*batch*(embedding+vocab)+embedding*vocab*105/128, - ) - case "gemma", "gemma2": - fullOffload = max( - 4*batch*(embedding+vocab), - 4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads), - ) - - partialOffload = max( - 4*embedding*batch+embedding*vocab*105/128+4*vocab*batch, - 4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+ - 4*embeddingHeadsK*context*8+ - embedding*embeddingHeadsK*heads*9/16, - ) - case "command-r": - fullOffload = max( - 4*batch*(embedding+vocab), - 4*batch*(2+4*embedding+context*(1+heads)), - ) - - partialOffload = max( - 4*batch*(embedding+vocab)+embedding*vocab*105/128, - 4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16, - ) - case "qwen2": - fullOffload = max( - 4*batch*(embedding+vocab), - 4*batch*(1+2*embedding+context+context*heads), - ) - - partialOffload = max( - 4*batch*(embedding+vocab)+embedding*vocab*105/128, - 4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)), - ) - case "phi2": - fullOffload = max( - 4*batch*(embedding+vocab), - 4*batch*(1+4*embedding+context+context*heads), - ) - - partialOffload = max( - 4*batch*(2*embedding+vocab)+embedding*vocab*105/128, - 4*batch*(2+3*embedding+context+context*heads), - ) - case "stablelm": - fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2) - partialOffload = max( - 4*batch*(vocab+2*embedding), - fullOffload, - ) - case "deepseek2": - fullOffload = max( - 4*batch*(3*embedding+vocab), - 4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV), - ) - - partialOffload = max( - 4*batch*(3*embedding+vocab)+embedding*vocab*105/128, - 4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16, - ) - case "chatglm": - fullOffload = 4 * batch * (embedding + vocab) - partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128 - if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok { - fullOffload = max( - fullOffload, - 4*batch*(2+ - 2*embedding+ - context+ - context*heads+ - embeddingHeadsK*heads+ - qkvBias.Shape[0]), - ) - - partialOffload = max( - partialOffload, - 4*batch*(1+ - 2*embedding+ - embeddingHeadsK*heads+ - context+ - context*heads)+ - 4*embeddingHeadsK*context+ - 
4*context*embeddingHeadsK+ - 4*qkvBias.Shape[0], - ) - } - } - - return -} - -// SupportsKVCacheType checks if the requested cache type is supported -func (ggml GGML) SupportsKVCacheType(cacheType string) bool { - validKVCacheTypes := []string{"f16", "q8_0", "q4_0"} - return slices.Contains(validKVCacheTypes, cacheType) -} - -// SupportsFlashAttention checks if the model supports flash attention -func (ggml GGML) SupportsFlashAttention() bool { - _, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())] - if isEmbedding { - return false - } - - // Check head counts match and are non-zero - headCountK := ggml.KV().EmbeddingHeadCountK() - headCountV := ggml.KV().EmbeddingHeadCountV() - return headCountK != 0 && headCountV != 0 && headCountK == headCountV -} - -// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type -func kvCacheBytesPerElement(cacheType string) float64 { - switch cacheType { - case "q8_0": - return 1 // 1/2 of fp16 - case "q4_0": - return 0.5 // 1/4 of fp16 - default: - return 2 // f16 (default) - } -} diff --git a/llm/ggml_test.go b/llm/ggml_test.go deleted file mode 100644 index 006c3ded8..000000000 --- a/llm/ggml_test.go +++ /dev/null @@ -1 +0,0 @@ -package llm diff --git a/llm/gguf.go b/llm/gguf.go deleted file mode 100644 index c7a95490f..000000000 --- a/llm/gguf.go +++ /dev/null @@ -1,662 +0,0 @@ -package llm - -import ( - "bytes" - "cmp" - "encoding/binary" - "encoding/json" - "fmt" - "io" - "log/slog" - "slices" - "strings" - - "golang.org/x/exp/maps" -) - -type containerGGUF struct { - ByteOrder binary.ByteOrder - - Version uint32 - - V1 struct { - NumTensor uint32 - NumKV uint32 - } - - V2 struct { - NumTensor uint64 - NumKV uint64 - } - - V3 struct { - NumTensor uint64 - NumKV uint64 - } - - maxArraySize int -} - -func (c *containerGGUF) canCollectArray(size int) bool { - return c.maxArraySize < 0 || size <= c.maxArraySize -} - -func (c *containerGGUF) Name() string { - return "gguf" -} - -func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) { - if err := binary.Read(rs, c.ByteOrder, &c.Version); err != nil { - return nil, err - } - - var err error - switch c.Version { - case 1: - err = binary.Read(rs, c.ByteOrder, &c.V1) - case 2: - err = binary.Read(rs, c.ByteOrder, &c.V2) - default: - err = binary.Read(rs, c.ByteOrder, &c.V3) - } - if err != nil { - return nil, err - } - - model := newGGUF(c) - if err := model.Decode(rs); err != nil { - return nil, err - } - - return model, nil -} - -const ( - ggufTypeUint8 uint32 = iota - ggufTypeInt8 - ggufTypeUint16 - ggufTypeInt16 - ggufTypeUint32 - ggufTypeInt32 - ggufTypeFloat32 - ggufTypeBool - ggufTypeString - ggufTypeArray - ggufTypeUint64 - ggufTypeInt64 - ggufTypeFloat64 -) - -type gguf struct { - *containerGGUF - - kv KV - tensors []*Tensor - - parameters uint64 - tensorOffset uint64 - - scratch [16 << 10]byte -} - -func newGGUF(container *containerGGUF) *gguf { - return &gguf{ - containerGGUF: container, - kv: make(KV), - } -} - -func (llm *gguf) KV() KV { - return llm.kv -} - -func (llm *gguf) Tensors() *Tensors { - return &Tensors{ - Items: llm.tensors, - Offset: llm.tensorOffset, - } -} - -func (llm *gguf) numTensor() uint64 { - switch llm.Version { - case 1: - return uint64(llm.V1.NumTensor) - case 2: - return llm.V2.NumTensor - default: - return llm.V3.NumTensor - } -} - -func (llm *gguf) numKV() uint64 { - switch llm.Version { - case 1: - return uint64(llm.V1.NumKV) - case 2: - return llm.V2.NumKV - default: - return llm.V3.NumKV - } 
-} - -func (llm *gguf) Decode(rs io.ReadSeeker) error { - // decode key-values - for i := 0; uint64(i) < llm.numKV(); i++ { - k, err := readGGUFString(llm, rs) - if err != nil { - return err - } - - t, err := readGGUF[uint32](llm, rs) - if err != nil { - return err - } - - var v any - switch t { - case ggufTypeUint8: - v, err = readGGUF[uint8](llm, rs) - case ggufTypeInt8: - v, err = readGGUF[int8](llm, rs) - case ggufTypeUint16: - v, err = readGGUF[uint16](llm, rs) - case ggufTypeInt16: - v, err = readGGUF[int16](llm, rs) - case ggufTypeUint32: - v, err = readGGUF[uint32](llm, rs) - case ggufTypeInt32: - v, err = readGGUF[int32](llm, rs) - case ggufTypeUint64: - v, err = readGGUF[uint64](llm, rs) - case ggufTypeInt64: - v, err = readGGUF[int64](llm, rs) - case ggufTypeFloat32: - v, err = readGGUF[float32](llm, rs) - case ggufTypeFloat64: - v, err = readGGUF[float64](llm, rs) - case ggufTypeBool: - v, err = readGGUF[bool](llm, rs) - case ggufTypeString: - v, err = readGGUFString(llm, rs) - case ggufTypeArray: - v, err = readGGUFArray(llm, rs) - default: - return fmt.Errorf("invalid type: %d", t) - } - - if err != nil { - return err - } - - llm.kv[k] = v - } - - // decode tensors - for range llm.numTensor() { - name, err := readGGUFString(llm, rs) - if err != nil { - return fmt.Errorf("failed to read tensor name: %w", err) - } - - // dims is the number of dimensions in the tensor - dims, err := readGGUF[uint32](llm, rs) - if err != nil { - return fmt.Errorf("failed to read tensor dimensions: %w", err) - } - - shape := make([]uint64, dims) - for i := 0; uint32(i) < dims; i++ { - shape[i], err = readGGUF[uint64](llm, rs) - if err != nil { - return fmt.Errorf("failed to read tensor shape: %w", err) - } - } - - kind, err := readGGUF[uint32](llm, rs) - if err != nil { - return fmt.Errorf("failed to read tensor kind: %w", err) - } - - offset, err := readGGUF[uint64](llm, rs) - if err != nil { - return fmt.Errorf("failed to read tensor offset: %w", err) - } - - tensor := Tensor{ - Name: name, - Kind: kind, - Offset: offset, - Shape: shape[:], - } - - llm.tensors = append(llm.tensors, &tensor) - llm.parameters += tensor.parameters() - } - - // patch KV with parameter count - llm.kv["general.parameter_count"] = llm.parameters - - alignment, ok := llm.kv["general.alignment"].(uint32) - if !ok { - alignment = 32 - } - - offset, err := rs.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - padding := ggufPadding(offset, int64(alignment)) - llm.tensorOffset = uint64(offset + padding) - - for _, tensor := range llm.tensors { - offset, err := rs.Seek(0, io.SeekCurrent) - if err != nil { - return fmt.Errorf("failed to get current offset: %w", err) - } - - padding := ggufPadding(offset, int64(alignment)) - if _, err := rs.Seek(padding, io.SeekCurrent); err != nil { - return fmt.Errorf("failed to seek to init padding: %w", err) - } - - if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil { - return fmt.Errorf("failed to seek to tensor: %w", err) - } - } - - return nil -} - -func readGGUF[T any](llm *gguf, r io.Reader) (T, error) { - var t T - err := binary.Read(r, llm.ByteOrder, &t) - return t, err -} - -func writeGGUF[V any](w io.Writer, t uint32, v V) error { - if err := binary.Write(w, binary.LittleEndian, t); err != nil { - return err - } - - return binary.Write(w, binary.LittleEndian, v) -} - -func readGGUFV1String(llm *gguf, r io.Reader) (string, error) { - var length uint64 - if err := binary.Read(r, llm.ByteOrder, &length); err != nil { - return "", err - } - - var b 
bytes.Buffer - if _, err := io.CopyN(&b, r, int64(length)); err != nil { - return "", err - } - - // gguf v1 strings are null-terminated - b.Truncate(b.Len() - 1) - - return b.String(), nil -} - -func discardGGUFString(llm *gguf, r io.Reader) error { - buf := llm.scratch[:8] - _, err := io.ReadFull(r, buf) - if err != nil { - return err - } - - size := int(llm.ByteOrder.Uint64(buf)) - for size > 0 { - n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))]) - if err != nil { - return err - } - size -= n - } - return nil -} - -func readGGUFString(llm *gguf, r io.Reader) (string, error) { - if llm.Version == 1 { - return readGGUFV1String(llm, r) - } - - buf := llm.scratch[:8] - _, err := io.ReadFull(r, buf) - if err != nil { - return "", err - } - - length := int(llm.ByteOrder.Uint64(buf)) - if length > len(llm.scratch) { - buf = make([]byte, length) - } else { - buf = llm.scratch[:length] - } - clear(buf) - - _, err = io.ReadFull(r, buf) - if err != nil { - return "", err - } - return string(buf), nil -} - -func writeGGUFString(w io.Writer, s string) error { - if err := binary.Write(w, binary.LittleEndian, ggufTypeString); err != nil { - return err - } - - if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil { - return err - } - - _, err := io.Copy(w, strings.NewReader(s)) - return err -} - -type array struct { - size int - values []any -} - -func (a *array) MarshalJSON() ([]byte, error) { - return json.Marshal(a.values) -} - -func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) { - t, err := readGGUF[uint32](llm, r) - if err != nil { - return nil, err - } - - n, err := readGGUF[uint32](llm, r) - if err != nil { - return nil, err - } - - a := &array{size: int(n)} - if llm.canCollectArray(int(n)) { - a.values = make([]any, 0, int(n)) - } - - for i := range n { - var e any - switch t { - case ggufTypeUint8: - e, err = readGGUF[uint8](llm, r) - case ggufTypeInt8: - e, err = readGGUF[int8](llm, r) - case ggufTypeUint16: - e, err = readGGUF[uint16](llm, r) - case ggufTypeInt16: - e, err = readGGUF[int16](llm, r) - case ggufTypeUint32: - e, err = readGGUF[uint32](llm, r) - case ggufTypeInt32: - e, err = readGGUF[int32](llm, r) - case ggufTypeUint64: - e, err = readGGUF[uint64](llm, r) - case ggufTypeInt64: - e, err = readGGUF[int64](llm, r) - case ggufTypeFloat32: - e, err = readGGUF[float32](llm, r) - case ggufTypeFloat64: - e, err = readGGUF[float64](llm, r) - case ggufTypeBool: - e, err = readGGUF[bool](llm, r) - case ggufTypeString: - e, err = readGGUFV1String(llm, r) - default: - return nil, fmt.Errorf("invalid array type: %d", t) - } - if err != nil { - return nil, err - } - - if a.values != nil { - a.values[i] = e - } - } - - return a, nil -} - -func readGGUFArray(llm *gguf, r io.Reader) (*array, error) { - if llm.Version == 1 { - return readGGUFV1Array(llm, r) - } - - t, err := readGGUF[uint32](llm, r) - if err != nil { - return nil, err - } - - n, err := readGGUF[uint64](llm, r) - if err != nil { - return nil, err - } - - a := &array{size: int(n)} - if llm.canCollectArray(int(n)) { - a.values = make([]any, int(n)) - } - - for i := range n { - var e any - switch t { - case ggufTypeUint8: - e, err = readGGUF[uint8](llm, r) - case ggufTypeInt8: - e, err = readGGUF[int8](llm, r) - case ggufTypeUint16: - e, err = readGGUF[uint16](llm, r) - case ggufTypeInt16: - e, err = readGGUF[int16](llm, r) - case ggufTypeUint32: - e, err = readGGUF[uint32](llm, r) - case ggufTypeInt32: - e, err = readGGUF[int32](llm, r) - case ggufTypeUint64: - e, err = 
readGGUF[uint64](llm, r) - case ggufTypeInt64: - e, err = readGGUF[int64](llm, r) - case ggufTypeFloat32: - e, err = readGGUF[float32](llm, r) - case ggufTypeFloat64: - e, err = readGGUF[float64](llm, r) - case ggufTypeBool: - e, err = readGGUF[bool](llm, r) - case ggufTypeString: - if a.values != nil { - e, err = readGGUFString(llm, r) - } else { - err = discardGGUFString(llm, r) - } - default: - return nil, fmt.Errorf("invalid array type: %d", t) - } - if err != nil { - return nil, err - } - - if a.values != nil { - a.values[i] = e - } - } - - return a, nil -} - -// writeGGUFArray writes a slice s of type E to the write with a gguf type of t -func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error { - if err := binary.Write(w, binary.LittleEndian, ggufTypeArray); err != nil { - return err - } - - if err := binary.Write(w, binary.LittleEndian, t); err != nil { - return err - } - - if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil { - return err - } - - return binary.Write(w, binary.LittleEndian, s) -} - -func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error { - if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil { - return err - } - - keys := maps.Keys(kv) - slices.Sort(keys) - - for _, key := range keys { - if err := ggufWriteKV(ws, key, kv[key]); err != nil { - return err - } - } - - slices.SortStableFunc(ts, func(a, b Tensor) int { - if i, j := a.block(), b.block(); i < 0 && j > 0 { - return 1 - } else if i > 0 && j < 0 { - return -1 - } else { - return cmp.Compare(i, j) - } - }) - - var s uint64 - for _, t := range ts { - t.Offset = s - if err := ggufWriteTensorInfo(ws, t); err != nil { - return err - } - s += t.Size() - } - - var alignment int64 = 32 - for _, t := range ts { - if err := ggufWriteTensor(ws, t, alignment); err != nil { - return err - } - } - - return nil -} - -func ggufWriteKV(ws io.WriteSeeker, k string, v any) error { - slog.Debug(k, "type", fmt.Sprintf("%T", v)) - if err := binary.Write(ws, binary.LittleEndian, uint64(len(k))); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, []byte(k)); err != nil { - return err - } - - var err error - switch v := v.(type) { - case uint32: - err = writeGGUF(ws, ggufTypeUint32, v) - case float32: - err = writeGGUF(ws, ggufTypeFloat32, v) - case bool: - err = writeGGUF(ws, ggufTypeBool, v) - case string: - err = writeGGUFString(ws, v) - case []int32: - err = writeGGUFArray(ws, ggufTypeInt32, v) - case []uint32: - err = writeGGUFArray(ws, ggufTypeUint32, v) - case []float32: - err = writeGGUFArray(ws, ggufTypeFloat32, v) - case []string: - if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil { - return err - } - - for _, e := range v { - if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil { - return err - } - } - default: - return fmt.Errorf("improper type for '%s'", k) - } - - return err -} - -func 
ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error { - slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset) - if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, []byte(t.Name)); err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, uint32(len(t.Shape))); err != nil { - return err - } - - for i := range len(t.Shape) { - if err := binary.Write(ws, binary.LittleEndian, t.Shape[len(t.Shape)-i-1]); err != nil { - return err - } - } - - if err := binary.Write(ws, binary.LittleEndian, t.Kind); err != nil { - return err - } - - return binary.Write(ws, binary.LittleEndian, t.Offset) -} - -func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error { - offset, err := ws.Seek(0, io.SeekCurrent) - if err != nil { - return err - } - - if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil { - return err - } - - _, err = t.WriteTo(ws) - return err -} - -func ggufPadding(offset, align int64) int64 { - return (align - offset%align) % align -} diff --git a/llm/memory.go b/llm/memory.go index 766e9e444..fdfe798f9 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -11,18 +11,19 @@ import ( "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" + "github.com/ollama/ollama/fs/ggml" ) // This algorithm looks for a complete fit to determine if we need to unload other models -func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) { +func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options) (bool, uint64) { // Split up the GPUs by type and try them var estimatedVRAM uint64 for _, gpus := range allGpus.ByLibrary() { var layerCount int - estimate := EstimateGPULayers(gpus, ggml, projectors, opts) + estimate := EstimateGPULayers(gpus, f, projectors, opts) layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize if opts.NumGPU < 0 { - if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) { + if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) { return true, estimatedVRAM } } else { @@ -70,7 +71,7 @@ type MemoryEstimate struct { // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size // The GPUs provided must all be the same Library -func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate { +func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options) MemoryEstimate { // Graph size for a partial offload, applies to all GPUs var graphPartialOffload uint64 @@ -115,33 +116,31 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts.NumCtx = max(opts.NumCtx, 2048) } - layers := ggml.Tensors().Layers() + layers := f.Tensors().Layers() // add one layer worth of memory as a buffer if blk0, ok := layers["blk.0"]; ok { - layerSize = blk0.size() + layerSize = blk0.Size() } else { slog.Warn("model missing blk.0 layer size") } - fa := envconfig.FlashAttention() && - discover.GetGPUInfo().FlashAttentionSupported() && - ggml.SupportsFlashAttention() - var kvct string - if fa { + if envconfig.FlashAttention() && + discover.GetGPUInfo().FlashAttentionSupported() && + f.SupportsFlashAttention() { requested := 
strings.ToLower(envconfig.KvCacheType()) - if requested != "" && ggml.SupportsKVCacheType(requested) { + if requested != "" && f.SupportsKVCacheType(requested) { kvct = requested } } - kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct) + kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct) // KV is proportional to the number of layers - layerSize += kv / ggml.KV().BlockCount() + layerSize += kv / f.KV().BlockCount() if graphPartialOffload == 0 { - graphPartialOffload = ggml.KV().GQA() * kv / 6 + graphPartialOffload = f.KV().GQA() * kv / 6 } if graphFullOffload == 0 { graphFullOffload = graphPartialOffload @@ -156,12 +155,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, } if layer, ok := layers["output_norm"]; ok { - memoryLayerOutput += layer.size() + memoryLayerOutput += layer.Size() } if layer, ok := layers["output"]; ok { - memoryLayerOutput += layer.size() + memoryLayerOutput += layer.Size() } else if layer, ok := layers["token_embd"]; ok { - memoryLayerOutput += layer.size() + memoryLayerOutput += layer.Size() } // Output layer handled at the end if we have space @@ -211,11 +210,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, } // For all the layers, find where they can fit on the GPU(s) - for i := range int(ggml.KV().BlockCount()) { + for i := range int(f.KV().BlockCount()) { // Some models have inconsistent layer sizes if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { - layerSize = blk.size() - layerSize += kv / ggml.KV().BlockCount() + layerSize = blk.Size() + layerSize += kv / f.KV().BlockCount() } memoryWeights += layerSize @@ -238,10 +237,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, } } } - if layerCount >= int(ggml.KV().BlockCount()) { + if layerCount >= int(f.KV().BlockCount()) { fullyLoaded = true } else { - for i := layerCount; i < int(ggml.KV().BlockCount()); i++ { + for i := layerCount; i < int(f.KV().BlockCount()); i++ { overflow += layerSize } } @@ -259,7 +258,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, } } - if layerCount < int(ggml.KV().BlockCount())+1 { + if layerCount < int(f.KV().BlockCount())+1 { fullyLoaded = false overflow += memoryLayerOutput } @@ -311,7 +310,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, inferenceLibrary: gpus[0].Library, layersRequested: opts.NumGPU, - layersModel: int(ggml.KV().BlockCount()) + 1, + layersModel: int(f.KV().BlockCount()) + 1, availableList: availableList, kv: kv, allocationsList: allocationsList, @@ -409,13 +408,13 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) { } defer file.Close() - ggml, _, err := DecodeGGML(file, 0) + ggml, _, err := ggml.Decode(file, 0) if err != nil { return 0, 0 } for _, layer := range ggml.Tensors().Layers() { - weights += layer.size() + weights += layer.Size() } switch arch := ggml.KV().Architecture(); arch { diff --git a/llm/memory_test.go b/llm/memory_test.go index 04abaabee..e49d25410 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -11,6 +11,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/discover" + "github.com/ollama/ollama/fs/ggml" ) func TestEstimateGPULayers(t *testing.T) { @@ -23,7 +24,7 @@ func TestEstimateGPULayers(t *testing.T) { defer f.Close() inputLayerCount := 5 - tensors 
:= []Tensor{ + tensors := []ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, @@ -32,7 +33,7 @@ func TestEstimateGPULayers(t *testing.T) { {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, } assert.Len(t, tensors, inputLayerCount+1) - err = WriteGGUF(f, KV{ + err = ggml.WriteGGUF(f, ggml.KV{ "general.architecture": "llama", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), diff --git a/llm/server.go b/llm/server.go index 37c204678..134f5d8a3 100644 --- a/llm/server.go +++ b/llm/server.go @@ -28,6 +28,7 @@ import ( "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llama" "github.com/ollama/ollama/runners" ) @@ -72,7 +73,7 @@ type llmServer struct { // It collects array values for arrays with a size less than or equal to // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If // the maxArraySize is negative, all arrays are collected. -func LoadModel(model string, maxArraySize int) (*GGML, error) { +func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { if _, err := os.Stat(model); err != nil { return nil, err } @@ -83,13 +84,13 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) { } defer f.Close() - ggml, _, err := DecodeGGML(f, maxArraySize) + ggml, _, err := ggml.Decode(f, maxArraySize) return ggml, err } // NewLlamaServer will run a server for the given GPUs // The gpu list must be a single family. 
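
The memory-estimation hunks above now reach model metadata through the relocated fs/ggml package, and the NewLlamaServer change that follows threads the same *ggml.GGML handle through. Below is a minimal, hedged sketch of that access pattern, assuming a local GGUF file; the path and the printed summary are illustrative only, while ggml.Decode, KV().Architecture(), KV().BlockCount(), Tensors().Layers(), and Size() are the calls the diff migrates to.

    package main

    import (
    	"fmt"
    	"log"
    	"os"

    	"github.com/ollama/ollama/fs/ggml"
    )

    func main() {
    	// "model.gguf" is a placeholder path, not a file from this change set.
    	f, err := os.Open("model.gguf")
    	if err != nil {
    		log.Fatal(err)
    	}
    	defer f.Close()

    	// Decode replaces the former llm.DecodeGGML; 0 keeps the default
    	// limit on collected array values noted in the LoadModel comment.
    	g, _, err := ggml.Decode(f, 0)
    	if err != nil {
    		log.Fatal(err)
    	}

    	fmt.Println("architecture:", g.KV().Architecture())
    	fmt.Println("blocks:", g.KV().BlockCount())

    	// Sum per-layer weight sizes the same way the estimator and
    	// projectorMemoryRequirements walk Tensors().Layers().
    	var weights uint64
    	for name, layer := range g.Tensors().Layers() {
    		weights += layer.Size()
    		fmt.Println(name, layer.Size())
    	}
    	fmt.Println("total weights (bytes):", weights)
    }
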
-func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { +func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { var err error var cpuRunner string var estimate MemoryEstimate @@ -109,9 +110,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter } if len(gpus) == 1 && gpus[0].Library == "cpu" { cpuRunner = runners.ServerForCpu() - estimate = EstimateGPULayers(gpus, ggml, projectors, opts) + estimate = EstimateGPULayers(gpus, f, projectors, opts) } else { - estimate = EstimateGPULayers(gpus, ggml, projectors, opts) + estimate = EstimateGPULayers(gpus, f, projectors, opts) switch { case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory: @@ -212,7 +213,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter fa = false } - if fa && !ggml.SupportsFlashAttention() { + if fa && !f.SupportsFlashAttention() { slog.Warn("flash attention enabled but not supported by model") fa = false } @@ -225,7 +226,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter // Flash Attention also supports kv cache quantization // Enable if the requested and kv cache type is supported by the model - if kvct != "" && ggml.SupportsKVCacheType(kvct) { + if kvct != "" && f.SupportsKVCacheType(kvct) { params = append(params, "--kv-cache-type", kvct) } else { slog.Warn("kv cache type not supported by model", "type", kvct) @@ -238,7 +239,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter for _, g := range gpus { if g.Library == "metal" && uint64(opts.NumGPU) > 0 && - uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 { + uint64(opts.NumGPU) < f.KV().BlockCount()+1 { opts.UseMMap = new(bool) *opts.UseMMap = false } @@ -330,7 +331,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter estimate: estimate, numParallel: numParallel, sem: semaphore.NewWeighted(int64(numParallel)), - totalLayers: ggml.KV().BlockCount() + 1, + totalLayers: f.KV().BlockCount() + 1, gpus: gpus, done: make(chan error, 1), } diff --git a/ml/backend/ggml/backend.go b/ml/backend/ggml/backend.go index 302df5b8d..50a01e818 100644 --- a/ml/backend/ggml/backend.go +++ b/ml/backend/ggml/backend.go @@ -29,7 +29,7 @@ type Backend struct { } func New(r io.ReadSeeker) (ml.Backend, error) { - f, _, err := ggml.DecodeGGML(r, -1) + f, _, err := ggml.Decode(r, -1) if err != nil { return nil, err } diff --git a/server/images.go b/server/images.go index 29877db33..d9fc314a1 100644 --- a/server/images.go +++ b/server/images.go @@ -25,8 +25,8 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llama" - "github.com/ollama/ollama/llm" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/model" @@ -89,7 +89,7 @@ func (m *Model) CheckCapabilities(caps ...Capability) error { defer f.Close() // TODO(mxyng): decode the GGML into model to avoid doing this multiple times - ggml, _, err := llm.DecodeGGML(f, 0) + ggml, _, err := ggml.Decode(f, 0) if err != nil { slog.Error("couldn't decode ggml", "error", err) continue @@ -429,7 +429,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio 
baseLayer.MediaType == "application/vnd.ollama.image.model" && baseLayer.GGML != nil && baseLayer.GGML.Name() == "gguf" { - want, err := llm.ParseFileType(quantization) + want, err := ggml.ParseFileType(quantization) if err != nil { return err } @@ -465,7 +465,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio return err } - ggml, _, err := llm.DecodeGGML(temp, 0) + ggml, _, err := ggml.Decode(temp, 0) if err != nil { return err } diff --git a/server/model.go b/server/model.go index 4926d6ce2..94fdb3c20 100644 --- a/server/model.go +++ b/server/model.go @@ -18,7 +18,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/convert" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/model" ) @@ -27,7 +27,7 @@ var intermediateBlobs map[string]string = make(map[string]string) type layerGGML struct { Layer - *llm.GGML + *ggml.GGML } func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { @@ -67,7 +67,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe } defer blob.Close() - ggml, _, err := llm.DecodeGGML(blob, 0) + ggml, _, err := ggml.Decode(blob, 0) if err != nil { return nil, err } @@ -112,7 +112,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML switch command { case "adapter": - var baseModel *llm.GGML + var baseModel *ggml.GGML for _, l := range baseLayers { if l.GGML != nil { baseModel = l.GGML @@ -150,7 +150,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML } defer bin.Close() - ggml, _, err := llm.DecodeGGML(bin, 0) + ggml, _, err := ggml.Decode(bin, 0) if err != nil { return nil, err } @@ -184,7 +184,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, var offset int64 for offset < stat.Size() { - ggml, n, err := llm.DecodeGGML(file, 0) + ggml, n, err := ggml.Decode(file, 0) if errors.Is(err, io.EOF) { break } else if err != nil { @@ -263,7 +263,7 @@ func detectContentType(r io.Reader) (string, error) { return "", err } - if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" { + if contentType := ggml.DetectContentType(b.Bytes()); contentType != "" { return contentType, nil } diff --git a/server/model_test.go b/server/model_test.go index 47c4728ed..88192bf10 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -13,7 +13,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/template" ) @@ -148,7 +148,7 @@ func TestParseFromFileFromLayer(t *testing.T) { t.Fatalf("failed to open file: %v", err) } defer file.Close() - if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + if err := ggml.WriteGGUF(file, ggml.KV{"general.architecture": "gemma"}, []ggml.Tensor{}); err != nil { t.Fatalf("failed to write gguf: %v", err) } @@ -201,7 +201,7 @@ func TestParseLayerFromCopy(t *testing.T) { defer file2.Close() for range 5 { - if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil { + if err := ggml.WriteGGUF(file2, ggml.KV{"general.architecture": "gemma"}, []ggml.Tensor{}); err != nil { t.Fatalf("failed to write gguf: %v", err) } } diff --git a/server/routes.go b/server/routes.go index 593d372e5..2693b767b 100644 --- 
a/server/routes.go +++ b/server/routes.go @@ -29,6 +29,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/parser" @@ -870,7 +871,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { return resp, nil } -func getKVData(digest string, verbose bool) (llm.KV, error) { +func getKVData(digest string, verbose bool) (ggml.KV, error) { maxArraySize := 0 if verbose { maxArraySize = -1 diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 09521753f..497a68751 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -16,12 +16,12 @@ import ( "github.com/gin-gonic/gin" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) var stream bool = false -func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string { +func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) string { t.Helper() f, err := os.CreateTemp(t.TempDir(), "") @@ -30,7 +30,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string { } defer f.Close() - if err := llm.WriteGGUF(f, kv, ti); err != nil { + if err := ggml.WriteGGUF(f, kv, ti); err != nil { t.Fatal(err) } @@ -581,7 +581,7 @@ func TestCreateDetectTemplate(t *testing.T) { t.Run("matched", func(t *testing.T) { w := createRequest(t, s.CreateHandler, api.CreateRequest{ Name: "test", - Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ + Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, ggml.KV{ "tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", }, nil)), Stream: &stream, diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index 4bde55bb4..1ab193f7e 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -17,6 +17,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/discover" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" ) @@ -46,8 +47,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error return } -func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { - return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { +func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { + return func(gpus discover.GpuInfoList, model string, f *ggml.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return mock, nil } } @@ -77,7 +78,7 @@ func TestGenerateChat(t *testing.T) { getGpuFn: discover.GetGPUInfo, getCpuFn: discover.GetCPUInfo, reschedDelay: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) { + loadFn: func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -101,7 +102,7 @@ func TestGenerateChat(t 
*testing.T) { {{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} {{- end }} {{ end }}""" -`, createBinFile(t, llm.KV{ +`, createBinFile(t, ggml.KV{ "general.architecture": "llama", "llama.block_count": uint32(1), "llama.context_length": uint32(8192), @@ -111,7 +112,7 @@ func TestGenerateChat(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []llm.Tensor{ + }, []ggml.Tensor{ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -156,10 +157,10 @@ func TestGenerateChat(t *testing.T) { t.Run("missing capabilities chat", func(t *testing.T) { w := createRequest(t, s.CreateHandler, api.CreateRequest{ Model: "bert", - Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ + Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, ggml.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []llm.Tensor{})), + }, []ggml.Tensor{})), Stream: &stream, }) @@ -610,7 +611,7 @@ func TestGenerate(t *testing.T) { getGpuFn: discover.GetGPUInfo, getCpuFn: discover.GetCPUInfo, reschedDelay: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) { + loadFn: func(req *LlmRequest, _ *ggml.GGML, gpus discover.GpuInfoList, numParallel int) { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -629,7 +630,7 @@ func TestGenerate(t *testing.T) { {{- if .System }}System: {{ .System }} {{ end }} {{- if .Prompt }}User: {{ .Prompt }} {{ end }} {{- if .Response }}Assistant: {{ .Response }} {{ end }}""" -`, createBinFile(t, llm.KV{ +`, createBinFile(t, ggml.KV{ "general.architecture": "llama", "llama.block_count": uint32(1), "llama.context_length": uint32(8192), @@ -639,7 +640,7 @@ func TestGenerate(t *testing.T) { "tokenizer.ggml.tokens": []string{""}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []llm.Tensor{ + }, []ggml.Tensor{ {Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, {Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))}, @@ -684,10 +685,10 @@ func TestGenerate(t *testing.T) { t.Run("missing capabilities generate", func(t *testing.T) { w := createRequest(t, s.CreateHandler, api.CreateRequest{ Model: "bert", - Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{ + Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, ggml.KV{ "general.architecture": "bert", "bert.pooling_type": uint32(0), - }, []llm.Tensor{})), + }, []ggml.Tensor{})), Stream: &stream, }) diff --git a/server/routes_test.go b/server/routes_test.go index 1daf36f1a..fb68d1116 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -21,7 +21,7 @@ import ( "unicode" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/types/model" @@ -612,8 +612,8 @@ func TestShow(t *testing.T) { Name: "show-model", Modelfile: fmt.Sprintf( "FROM %s\nFROM %s", - createBinFile(t, llm.KV{"general.architecture": 
"test"}, nil), - createBinFile(t, llm.KV{"general.type": "projector", "general.architecture": "clip"}, nil), + createBinFile(t, ggml.KV{"general.architecture": "test"}, nil), + createBinFile(t, ggml.KV{"general.type": "projector", "general.architecture": "clip"}, nil), ), }) diff --git a/server/sched.go b/server/sched.go index 0da84182e..563f2aad7 100644 --- a/server/sched.go +++ b/server/sched.go @@ -18,6 +18,7 @@ import ( "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" ) @@ -41,8 +42,8 @@ type Scheduler struct { loaded map[string]*runnerRef loadedMu sync.Mutex - loadFn func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) - newServerFn func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) + loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) + newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) getGpuFn func() discover.GpuInfoList getCpuFn func() discover.GpuInfoList reschedDelay time.Duration @@ -409,7 +410,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm }() } -func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) { +func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) { if numParallel < 1 { numParallel = 1 } @@ -417,12 +418,12 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoL if req.sessionDuration != nil { sessionDuration = req.sessionDuration.Duration } - llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) + llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) if err != nil { // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to // check for model compatibility - if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") { + if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") { err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. 
If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName) } slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err) @@ -685,7 +686,7 @@ func (a ByDuration) Less(i, j int) bool { // If the model can not be fit fully within the available GPU(s) nil is returned // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust // opts.NumCtx accordingly -func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList { +func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList { var estimatedVRAM uint64 var numParallelToTry []int @@ -710,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu req.opts.NumCtx = req.origNumCtx * p if !envconfig.SchedSpread() { for _, g := range sgl { - if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { + if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM)) *numParallel = p return []discover.GpuInfo{g} @@ -726,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu // Now try all the GPUs for _, p := range numParallelToTry { req.opts.NumCtx = req.origNumCtx * p - if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { + if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM)) *numParallel = p return sgl @@ -737,7 +738,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu } // If multiple Libraries are detected, pick the Library which loads the most layers for the model -func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList { +func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList { if *numParallel <= 0 { *numParallel = 1 req.opts.NumCtx = req.origNumCtx @@ -749,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover. 
var bestEstimate uint64 var bestFit int for i, gl := range byLibrary { - _, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts) + _, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts) if estimatedVRAM > bestEstimate { bestEstimate = estimatedVRAM bestFit = i @@ -822,9 +823,9 @@ func (s *Scheduler) expireRunner(model *Model) { // If other runners are loaded, make sure the pending request will fit in system memory // If not, pick a runner to unload, else return nil and the request can be loaded -func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef { +func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef { slog.Debug("evaluating if CPU model load will fit in available system memory") - estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts) + estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts) if estimate.TotalSize <= gpus[0].FreeMemory { slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory)) return nil diff --git a/server/sched_test.go b/server/sched_test.go index 72baf15a2..81f4a95f6 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -15,6 +15,7 @@ import ( "github.com/ollama/ollama/app/lifecycle" "github.com/ollama/ollama/discover" "github.com/ollama/ollama/format" + "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" ) @@ -37,7 +38,7 @@ func TestLoad(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond) defer done() s := InitScheduler(ctx) - var ggml *llm.GGML // value not used in tests + var f *ggml.GGML // value not used in tests req := &LlmRequest{ ctx: ctx, model: &Model{ModelPath: "foo"}, @@ -47,11 +48,11 @@ func TestLoad(t *testing.T) { sessionDuration: &api.Duration{Duration: 2 * time.Second}, } // Fail to load model first - s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return nil, errors.New("something failed to load model blah") } gpus := discover.GpuInfoList{} - s.load(req, ggml, gpus, 0) + s.load(req, f, gpus, 0) require.Empty(t, req.successCh) require.Len(t, req.errCh, 1) s.loadedMu.Lock() @@ -61,10 +62,10 @@ func TestLoad(t *testing.T) { require.Contains(t, err.Error(), "this model may be incompatible") server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}} - s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return server, nil } - s.load(req, ggml, gpus, 0) + s.load(req, f, gpus, 0) select { case err := <-req.errCh: require.NoError(t, err) @@ -78,7 +79,7 @@ func TestLoad(t *testing.T) { req.model.ModelPath = "dummy_model_path" server.waitResp = errors.New("wait failure") - 
s.load(req, ggml, gpus, 0) + s.load(req, f, gpus, 0) select { case err := <-req.errCh: require.Contains(t, err.Error(), "wait failure") @@ -99,10 +100,10 @@ type reqBundle struct { ctxDone func() srv *mockLlm req *LlmRequest - ggml *llm.GGML + f *ggml.GGML } -func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { +func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return scenario.srv, nil } @@ -115,7 +116,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est require.NoError(t, err) defer f.Close() - require.NoError(t, llm.WriteGGUF(f, llm.KV{ + require.NoError(t, ggml.WriteGGUF(f, ggml.KV{ "general.architecture": "llama", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), @@ -125,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est "tokenizer.ggml.tokens": []string{" "}, "tokenizer.ggml.scores": []float32{0}, "tokenizer.ggml.token_type": []int32{0}, - }, []llm.Tensor{ + }, []ggml.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, })) @@ -133,7 +134,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est fname := f.Name() model := &Model{Name: modelName, ModelPath: fname} - b.ggml, err = llm.LoadModel(model.ModelPath, 0) + b.f, err = llm.LoadModel(model.ModelPath, 0) require.NoError(t, err) if duration == nil { @@ -174,7 +175,7 @@ func TestRequestsSameModelSameRequest(t *testing.T) { a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}) b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}) b.req.model = a.req.model - b.ggml = a.ggml + b.f = a.f s.newServerFn = a.newServer slog.Info("a") @@ -218,7 +219,7 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) { b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}) tmpModel := *a.req.model b.req.model = &tmpModel - b.ggml = a.ggml + b.f = a.f s.newServerFn = a.newServer slog.Info("a") @@ -419,13 +420,13 @@ func TestExpireRunner(t *testing.T) { sessionDuration: &api.Duration{Duration: 2 * time.Minute}, } - var ggml *llm.GGML + var f *ggml.GGML gpus := discover.GpuInfoList{} server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}} - s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return server, nil } - s.load(req, ggml, gpus, 0) + s.load(req, f, gpus, 0) select { case err := <-req.errCh: @@ -729,9 +730,9 @@ func TestHomogeneousGPUs(t *testing.T) { } s.getCpuFn = getCpuFn a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}) - s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, 
projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { require.Len(t, gpus, 1) - return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel) + return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel) } slog.Info("a") s.pendingReqCh <- a.req diff --git a/template/template_test.go b/template/template_test.go index 616bef6a8..ba1046500 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -14,7 +14,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/fs/ggml" ) func TestNamed(t *testing.T) { @@ -33,7 +33,7 @@ func TestNamed(t *testing.T) { for k, v := range ss { t.Run(k, func(t *testing.T) { - kv := llm.KV{"tokenizer.chat_template": v} + kv := ggml.KV{"tokenizer.chat_template": v} s := kv.ChatTemplate() r, err := Named(s) if err != nil {
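
The test changes above build throwaway GGUF blobs with ggml.WriteGGUF and decode them again to exercise KV helpers such as ChatTemplate(). A self-contained sketch of that round trip follows, assuming a temporary file and placeholder metadata values; it relies only on calls already present in the hunks (WriteGGUF, Decode, KV().ChatTemplate()).

    package main

    import (
    	"log"
    	"os"

    	"github.com/ollama/ollama/fs/ggml"
    )

    func main() {
    	// A temp file stands in for the blobs the tests create.
    	f, err := os.CreateTemp("", "example-*.gguf")
    	if err != nil {
    		log.Fatal(err)
    	}
    	defer os.Remove(f.Name())

    	// Metadata-only GGUF, mirroring the createBinFile test helper;
    	// the key values here are placeholders.
    	kv := ggml.KV{
    		"general.architecture":    "llama",
    		"tokenizer.chat_template": "{{ .Prompt }}",
    	}
    	if err := ggml.WriteGGUF(f, kv, []ggml.Tensor{}); err != nil {
    		log.Fatal(err)
    	}
    	f.Close()

    	r, err := os.Open(f.Name())
    	if err != nil {
    		log.Fatal(err)
    	}
    	defer r.Close()

    	// Read it back through the same decoder the server code uses.
    	g, _, err := ggml.Decode(r, 0)
    	if err != nil {
    		log.Fatal(err)
    	}
    	log.Println("chat template:", g.KV().ChatTemplate())
    }
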