diff --git a/convert/convert_gemma3.go b/convert/convert_gemma3.go
index fab5fcd85..be65fdb0a 100644
--- a/convert/convert_gemma3.go
+++ b/convert/convert_gemma3.go
@@ -10,6 +10,7 @@ type gemma3Model struct {
 	gemmaModel
 	Architecture string
 	TextModel    struct {
+		HeadDim          uint32 `json:"head_dim"`
 		HiddenSize       uint32 `json:"hidden_size"`
 		HiddenLayers     uint32 `json:"num_hidden_layers"`
 		IntermediateSize uint32 `json:"intermediate_size"`
@@ -36,15 +37,45 @@ type gemma3Model struct {
 	SlidingWindow uint32 `json:"sliding_window"`
 }
 
+const (
+	gemma4BLayerCount  = 34
+	gemma12BLayerCount = 48
+	gemma27BLayerCount = 62
+)
+
 func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma3"
 
+	numBlocks := cmp.Or(p.HiddenLayers, p.TextModel.HiddenLayers)
+	kv["gemma3.block_count"] = numBlocks
+
+	var (
+		numHeads   uint32
+		numKVHeads uint32
+	)
+
+	switch numBlocks {
+	case gemma4BLayerCount:
+		numHeads = 8
+		numKVHeads = 4
+	case gemma12BLayerCount:
+		numHeads = 16
+		numKVHeads = 8
+	case gemma27BLayerCount:
+		numHeads = 32
+		numKVHeads = 16
+	default:
+		numHeads = p.NumAttentionHeads
+		numKVHeads = p.NumKeyValueHeads
+	}
+
+	kv["gemma3.attention.head_count"] = numHeads
+	kv["gemma3.attention.head_count_kv"] = numKVHeads
+
 	switch p.Architecture {
 	case "Gemma3ForCausalLM":
 		kv["gemma3.context_length"] = p.MaxPositionEmbeddings
-		kv["gemma3.attention.head_count"] = p.NumAttentionHeads
-		kv["gemma3.attention.head_count_kv"] = p.NumKeyValueHeads
 		kv["gemma3.text.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
 		kv["gemma3.attention.key_length"] = p.HeadDim
 		kv["gemma3.attention.value_length"] = p.HeadDim
@@ -53,11 +84,9 @@ func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
 		kv["gemma3.text.rope.local.freq_base"] = p.RopeLocalTheta
 		kv["gemma3.text.rope.global.freq_base"] = p.RopeGlobalTheta
 		kv["gemma3.embedding_length"] = p.HiddenSize
-		kv["gemma3.block_count"] = p.HiddenLayers
 		kv["gemma3.text.feed_forward_length"] = p.IntermediateSize
 	default:
 		kv["gemma3.embedding_length"] = p.TextModel.HiddenSize
-		kv["gemma3.block_count"] = p.TextModel.HiddenLayers
 		kv["gemma3.text.feed_forward_length"] = p.TextModel.IntermediateSize
 		kv["gemma3.text.attention.sliding_window"] = p.TextModel.SlidingWindow
 		kv["gemma3.vision.block_count"] = p.VisionModel.NumHiddenLayers
@@ -68,11 +97,10 @@ func (p *gemma3Model) KV(t *Tokenizer) ggml.KV {
 		kv["gemma3.vision.num_channels"] = cmp.Or(p.VisionModel.NumChannels, 3)
 		kv["gemma3.vision.attention.head_count"] = p.VisionModel.NumAttentionHeads
 		kv["gemma3.vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionModel.LayerNormEpsilon, 1e-6)
+		kv["gemma3.attention.key_length"] = cmp.Or(p.TextModel.HeadDim, 256)
+		kv["gemma3.attention.value_length"] = cmp.Or(p.TextModel.HeadDim, 256)
 	}
 
-	kv["tokenizer.ggml.bos_token_id"] = uint32(2)
-	kv["tokenizer.ggml.eot_token_id"] = uint32(1)
-
 	return kv
 }
 
diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go
index 9a09bf1fe..765fb111f 100644
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -33,7 +33,7 @@ type TextModel struct {
 
 const (
 	gemmaGlobalCacheCount = 6
-	gemma27BLayerCount    = 46
+	gemma27BLayerCount    = 62
 )
 
 const (
@@ -42,6 +42,8 @@ const (
 )
 
 func newTextModel(c ml.Config) *TextModel {
+	numBlocks := int(c.Uint("block_count"))
+
 	m := TextModel{
 		SentencePieceModel: model.NewSentencePieceModel(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@@ -53,11 +55,11 @@ func newTextModel(c ml.Config) *TextModel {
 				EOS: int32(c.Uint("tokenizer.ggml.eos_token_id")),
 			},
 		),
-		Layers: make([]TextLayer, c.Uint("block_count")),
+		Layers: make([]TextLayer, numBlocks),
 		TextOptions: &TextOptions{
 			hiddenSize: int(c.Uint("embedding_length")),
-			numHeads:   int(c.Uint("attention.head_count", 8)),
-			numKVHeads: int(c.Uint("attention.head_count_kv", 4)),
+			numHeads:   int(c.Uint("attention.head_count")),
+			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			attnKeyLen: int(c.Uint("attention.key_length", 256)),
 			attnValLen: int(c.Uint("attention.value_length", 256)),
 			eps:        c.Float("text.attention.layer_norm_rms_epsilon", 1e-06),
@@ -68,6 +70,10 @@ func newTextModel(c ml.Config) *TextModel {
 		},
 	}
 
+	if numBlocks == gemma27BLayerCount {
+		m.largeModelScaling = true
+	}
+
 	return &m
 }
 
@@ -177,10 +183,6 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 		hiddenState = hiddenState.Set(ctx, visionOutputs, offset*hiddenState.Stride(0))
 	}
 
-	if len(m.Layers) == gemma27BLayerCount {
-		m.TextOptions.largeModelScaling = true
-	}
-
 	for i, layer := range m.Layers {
 		// gemma alternates between the sliding window (local) and causal (global)
 		// kv cache every 6 layers
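
For reviewers, the head-count inference added to `gemma3Model.KV` can be read in isolation as a small pure function. The sketch below is a minimal restatement of that switch under the diff's assumptions, not code from this change; the helper name `headsForBlockCount` is hypothetical. It shows that the known Gemma 3 layer counts (34, 48, 62) map to fixed head counts, and any other block count falls back to the values read from the HF config.

```go
package main

import "fmt"

// Layer counts for the Gemma 3 text models, mirroring the constants
// introduced in convert/convert_gemma3.go by this diff.
const (
	gemma4BLayerCount  = 34
	gemma12BLayerCount = 48
	gemma27BLayerCount = 62
)

// headsForBlockCount is a hypothetical helper (not part of the PR) that
// reproduces the switch in gemma3Model.KV: given the number of transformer
// blocks, return the attention head count and KV head count to write into
// the GGUF metadata, falling back to the config-provided values otherwise.
func headsForBlockCount(numBlocks, cfgHeads, cfgKVHeads uint32) (uint32, uint32) {
	switch numBlocks {
	case gemma4BLayerCount:
		return 8, 4
	case gemma12BLayerCount:
		return 16, 8
	case gemma27BLayerCount:
		return 32, 16
	default:
		return cfgHeads, cfgKVHeads
	}
}

func main() {
	// 62 blocks (27B): fixed 32 heads / 16 KV heads, regardless of config values.
	heads, kvHeads := headsForBlockCount(gemma27BLayerCount, 0, 0)
	fmt.Println(heads, kvHeads) // 32 16

	// Unknown block count: fall back to whatever the config provides.
	heads, kvHeads = headsForBlockCount(40, 12, 6)
	fmt.Println(heads, kvHeads) // 12 6
}
```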