From 4ae4f47b16fd75e10aa12e0f6c22001df5410d78 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 20 Aug 2025 15:39:18 -0700
Subject: [PATCH] gpt-oss: convert from hugging face format (#11907)

---
 convert/convert_gptoss.go | 100 +++++++++++++++++++++++++-------------
 1 file changed, 66 insertions(+), 34 deletions(-)

diff --git a/convert/convert_gptoss.go b/convert/convert_gptoss.go
index bd362169bd..c5a691d3d5 100644
--- a/convert/convert_gptoss.go
+++ b/convert/convert_gptoss.go
@@ -15,19 +15,24 @@ import (
 
 type gptossModel struct {
 	ModelParameters
-	HiddenLayers         uint32  `json:"num_hidden_layers"`
-	HiddenSize           uint32  `json:"hidden_size"`
-	IntermediateSize     uint32  `json:"intermediate_size"`
-	AttentionHeads       uint32  `json:"num_attention_heads"`
-	KeyValueHeads        uint32  `json:"num_key_value_heads"`
-	HeadDim              uint32  `json:"head_dim"`
-	Experts              uint32  `json:"num_experts"`
-	ExpertsPerToken      uint32  `json:"experts_per_token"`
-	RMSNormEpsilon       float32 `json:"rms_norm_eps"`
-	InitialContextLength uint32  `json:"initial_context_length"`
-	RopeTheta            float32 `json:"rope_theta"`
-	RopeScalingFactor    float32 `json:"rope_scaling_factor"`
-	SlidingWindow        uint32  `json:"sliding_window"`
+	HiddenLayers          uint32  `json:"num_hidden_layers"`
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	AttentionHeads        uint32  `json:"num_attention_heads"`
+	KeyValueHeads         uint32  `json:"num_key_value_heads"`
+	HeadDim               uint32  `json:"head_dim"`
+	Experts               uint32  `json:"num_experts"`
+	LocalExperts          uint32  `json:"num_local_experts"`
+	ExpertsPerToken       uint32  `json:"experts_per_token"`
+	RMSNormEpsilon        float32 `json:"rms_norm_eps"`
+	InitialContextLength  uint32  `json:"initial_context_length"`
+	RopeTheta             float32 `json:"rope_theta"`
+	RopeScalingFactor     float32 `json:"rope_scaling_factor"`
+	RopeScaling           struct {
+		Factor float32 `json:"factor"`
+	} `json:"rope_scaling"`
+	SlidingWindow uint32 `json:"sliding_window"`
 }
 
 var _ ModelConverter = (*gptossModel)(nil)
@@ -36,11 +41,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "gptoss"
 	kv["general.file_type"] = uint32(4)
-	kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
+	kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
 	kv["gptoss.block_count"] = m.HiddenLayers
 	kv["gptoss.embedding_length"] = m.HiddenSize
 	kv["gptoss.feed_forward_length"] = m.IntermediateSize
-	kv["gptoss.expert_count"] = m.Experts
+	kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
 	kv["gptoss.expert_used_count"] = m.ExpertsPerToken
 	kv["gptoss.attention.head_count"] = m.AttentionHeads
 	kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -49,7 +54,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
 	kv["gptoss.attention.sliding_window"] = m.SlidingWindow
 	kv["gptoss.rope.freq_base"] = m.RopeTheta
-	kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
+	kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
 	kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
 	kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
 	kv["tokenizer.ggml.add_bos_token"] = false
@@ -92,6 +97,11 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 
 	for name, mxfp4 := range mxfp4s {
 		dims := mxfp4.blocks.Shape()
+
+		if !strings.HasSuffix(name, ".weight") {
+			name += ".weight"
+		}
+
 		out = append(out, &ggml.Tensor{
 			Name: name,
 			Kind: uint32(ggml.TensorTypeMXFP4),
@@ -104,25 +114,47 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 }
 
 func (m *gptossModel) Replacements() []string {
-	return []string{
-		// noop replacements so other replacements will not be applied
-		".blocks", ".blocks",
-		".scales", ".scales",
-		// real replacements
-		"block", "blk",
-		"attn.norm", "attn_norm",
-		"attn.qkv", "attn_qkv",
-		"attn.sinks", "attn_sinks",
-		"attn.out", "attn_out",
-		"mlp.norm", "ffn_norm",
-		"mlp.gate", "ffn_gate_inp",
-		"mlp.mlp1_", "ffn_gate_up_exps.",
-		"mlp.mlp2_", "ffn_down_exps.",
-		"embedding", "token_embd",
-		"norm", "output_norm",
-		"unembedding", "output",
-		"scale", "weight",
+	var replacements []string
+	if m.MaxPositionEmbeddings > 0 {
+		// hf flavored model
+		replacements = []string{
+			"lm_head", "output",
+			"model.embed_tokens", "token_embd",
+			"model.layers", "blk",
+			"input_layernorm", "attn_norm",
+			"self_attn.q_proj", "attn_q",
+			"self_attn.k_proj", "attn_k",
+			"self_attn.v_proj", "attn_v",
+			"self_attn.o_proj", "attn_out",
+			"self_attn.sinks", "attn_sinks",
+			"post_attention_layernorm", "ffn_norm",
+			"mlp.router", "ffn_gate_inp",
+			"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
+			"mlp.experts.down_proj_", "ffn_down_exps.",
+			"model.norm", "output_norm",
+		}
+	} else {
+		replacements = []string{
+			// noop replacements so other replacements will not be applied
+			".blocks", ".blocks",
+			".scales", ".scales",
+			// real replacements
+			"block", "blk",
+			"attn.norm", "attn_norm",
+			"attn.qkv", "attn_qkv",
+			"attn.sinks", "attn_sinks",
+			"attn.out", "attn_out",
+			"mlp.norm", "ffn_norm",
+			"mlp.gate", "ffn_gate_inp",
+			"mlp.mlp1_", "ffn_gate_up_exps.",
+			"mlp.mlp2_", "ffn_down_exps.",
+			"embedding", "token_embd",
+			"norm", "output_norm",
+			"unembedding", "output",
+			"scale", "weight",
+		}
 	}
+	return replacements
 }
 
 type mxfp4 struct {
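The fallback pattern used throughout KV() above relies on cmp.Or from the Go standard library (Go 1.22+), which returns its first non-zero argument: HF-style config fields (max_position_embeddings, num_local_experts, rope_scaling.factor) win when they are set, and the original gpt-oss fields are used otherwise. Below is a minimal standalone sketch of that behavior, separate from the patch; the variable names mirror the struct fields in the diff and the values are illustrative, not taken from any real checkpoint.

// Sketch only (not part of the patch): how cmp.Or arbitrates between the two
// config flavors. Values below are illustrative, not from a real config.
package main

import (
	"cmp"
	"fmt"
)

func main() {
	// HF checkpoints populate max_position_embeddings and num_local_experts;
	// the original gpt-oss config populates initial_context_length,
	// rope_scaling_factor, and num_experts instead, leaving the others zero.
	var (
		maxPositionEmbeddings uint32  = 131072 // zero when converting the original format
		initialContextLength  uint32  = 4096
		ropeScalingFactor     float32 = 32.0
		experts               uint32  = 0 // zero here, so the HF-style field wins
		localExperts          uint32  = 32
	)

	// cmp.Or returns the first non-zero argument.
	contextLength := cmp.Or(maxPositionEmbeddings, uint32(ropeScalingFactor*float32(initialContextLength)))
	expertCount := cmp.Or(experts, localExperts)

	fmt.Println(contextLength, expertCount) // 131072 32
}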