gpt-oss: convert from hugging face format (#11907)

Author: Michael Yang
Date: 2025-08-20 15:39:18 -07:00
Committed by: GitHub
Parent: 073fa31df5
Commit: 4ae4f47b16


@@ -15,19 +15,24 @@ import (
 type gptossModel struct {
     ModelParameters
     HiddenLayers          uint32  `json:"num_hidden_layers"`
-    HiddenSize            uint32  `json:"hidden_size"`
-    IntermediateSize      uint32  `json:"intermediate_size"`
-    AttentionHeads        uint32  `json:"num_attention_heads"`
-    KeyValueHeads         uint32  `json:"num_key_value_heads"`
-    HeadDim               uint32  `json:"head_dim"`
-    Experts               uint32  `json:"num_experts"`
-    ExpertsPerToken       uint32  `json:"experts_per_token"`
-    RMSNormEpsilon        float32 `json:"rms_norm_eps"`
-    InitialContextLength  uint32  `json:"initial_context_length"`
-    RopeTheta             float32 `json:"rope_theta"`
-    RopeScalingFactor     float32 `json:"rope_scaling_factor"`
-    SlidingWindow         uint32  `json:"sliding_window"`
+    MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+    HiddenSize            uint32  `json:"hidden_size"`
+    IntermediateSize      uint32  `json:"intermediate_size"`
+    AttentionHeads        uint32  `json:"num_attention_heads"`
+    KeyValueHeads         uint32  `json:"num_key_value_heads"`
+    HeadDim               uint32  `json:"head_dim"`
+    Experts               uint32  `json:"num_experts"`
+    LocalExperts          uint32  `json:"num_local_experts"`
+    ExpertsPerToken       uint32  `json:"experts_per_token"`
+    RMSNormEpsilon        float32 `json:"rms_norm_eps"`
+    InitialContextLength  uint32  `json:"initial_context_length"`
+    RopeTheta             float32 `json:"rope_theta"`
+    RopeScalingFactor     float32 `json:"rope_scaling_factor"`
+    RopeScaling           struct {
+        Factor float32 `json:"factor"`
+    } `json:"rope_scaling"`
+    SlidingWindow uint32 `json:"sliding_window"`
 }

 var _ ModelConverter = (*gptossModel)(nil)
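For reference, a minimal sketch of how the new fields pick up an HF-flavored config.json via encoding/json. The struct below is a trimmed-down stand-in for gptossModel, and the config values are illustrative, not taken from a real checkpoint.

package main

import (
    "encoding/json"
    "fmt"
)

// Trimmed-down copy of the converter struct, keeping only the
// HF-flavored fields added in this commit.
type gptossConfig struct {
    HiddenLayers          uint32 `json:"num_hidden_layers"`
    MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
    LocalExperts          uint32 `json:"num_local_experts"`
    RopeScaling           struct {
        Factor float32 `json:"factor"`
    } `json:"rope_scaling"`
}

func main() {
    // Hypothetical HF-style config snippet (values made up).
    data := []byte(`{
        "num_hidden_layers": 24,
        "max_position_embeddings": 131072,
        "num_local_experts": 32,
        "rope_scaling": {"factor": 32.0}
    }`)

    var cfg gptossConfig
    if err := json.Unmarshal(data, &cfg); err != nil {
        panic(err)
    }

    // A non-zero MaxPositionEmbeddings is what the converter uses
    // to detect an HF-flavored checkpoint.
    fmt.Println(cfg.MaxPositionEmbeddings > 0, cfg.LocalExperts, cfg.RopeScaling.Factor)
}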
@@ -36,11 +41,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
     kv := m.ModelParameters.KV(t)
     kv["general.architecture"] = "gptoss"
     kv["general.file_type"] = uint32(4)
-    kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
+    kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
     kv["gptoss.block_count"] = m.HiddenLayers
     kv["gptoss.embedding_length"] = m.HiddenSize
     kv["gptoss.feed_forward_length"] = m.IntermediateSize
-    kv["gptoss.expert_count"] = m.Experts
+    kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
     kv["gptoss.expert_used_count"] = m.ExpertsPerToken
     kv["gptoss.attention.head_count"] = m.AttentionHeads
     kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -49,7 +54,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5) kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
kv["gptoss.attention.sliding_window"] = m.SlidingWindow kv["gptoss.attention.sliding_window"] = m.SlidingWindow
kv["gptoss.rope.freq_base"] = m.RopeTheta kv["gptoss.rope.freq_base"] = m.RopeTheta
kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|> kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
kv["tokenizer.ggml.add_bos_token"] = false kv["tokenizer.ggml.add_bos_token"] = false
@@ -92,6 +97,11 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
     for name, mxfp4 := range mxfp4s {
         dims := mxfp4.blocks.Shape()

+        if !strings.HasSuffix(name, ".weight") {
+            name += ".weight"
+        }
+
         out = append(out, &ggml.Tensor{
             Name: name,
             Kind: uint32(ggml.TensorTypeMXFP4),
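A minimal sketch of the suffix normalization added above: HF-flavored tensor names can arrive without a trailing ".weight", so one is appended before the MXFP4 tensor is emitted; already-suffixed names pass through unchanged. The tensor names below are made up for illustration.

package main

import (
    "fmt"
    "strings"
)

func main() {
    for _, name := range []string{
        "blk.0.ffn_gate_up_exps",     // hypothetical HF-derived name, suffix missing
        "blk.0.ffn_down_exps.weight", // already suffixed, left untouched
    } {
        if !strings.HasSuffix(name, ".weight") {
            name += ".weight"
        }
        fmt.Println(name) // both end in ".weight"
    }
}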
@@ -104,25 +114,47 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 }

 func (m *gptossModel) Replacements() []string {
-    return []string{
-        // noop replacements so other replacements will not be applied
-        ".blocks", ".blocks",
-        ".scales", ".scales",
-        // real replacements
-        "block", "blk",
-        "attn.norm", "attn_norm",
-        "attn.qkv", "attn_qkv",
-        "attn.sinks", "attn_sinks",
-        "attn.out", "attn_out",
-        "mlp.norm", "ffn_norm",
-        "mlp.gate", "ffn_gate_inp",
-        "mlp.mlp1_", "ffn_gate_up_exps.",
-        "mlp.mlp2_", "ffn_down_exps.",
-        "embedding", "token_embd",
-        "norm", "output_norm",
-        "unembedding", "output",
-        "scale", "weight",
+    var replacements []string
+    if m.MaxPositionEmbeddings > 0 {
+        // hf flavored model
+        replacements = []string{
+            "lm_head", "output",
+            "model.embed_tokens", "token_embd",
+            "model.layers", "blk",
+            "input_layernorm", "attn_norm",
+            "self_attn.q_proj", "attn_q",
+            "self_attn.k_proj", "attn_k",
+            "self_attn.v_proj", "attn_v",
+            "self_attn.o_proj", "attn_out",
+            "self_attn.sinks", "attn_sinks",
+            "post_attention_layernorm", "ffn_norm",
+            "mlp.router", "ffn_gate_inp",
+            "mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
+            "mlp.experts.down_proj_", "ffn_down_exps.",
+            "model.norm", "output_norm",
+        }
+    } else {
+        replacements = []string{
+            // noop replacements so other replacements will not be applied
+            ".blocks", ".blocks",
+            ".scales", ".scales",
+            // real replacements
+            "block", "blk",
+            "attn.norm", "attn_norm",
+            "attn.qkv", "attn_qkv",
+            "attn.sinks", "attn_sinks",
+            "attn.out", "attn_out",
+            "mlp.norm", "ffn_norm",
+            "mlp.gate", "ffn_gate_inp",
+            "mlp.mlp1_", "ffn_gate_up_exps.",
+            "mlp.mlp2_", "ffn_down_exps.",
+            "embedding", "token_embd",
+            "norm", "output_norm",
+            "unembedding", "output",
+            "scale", "weight",
+        }
     }
+    return replacements
 }

 type mxfp4 struct {
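A minimal sketch of how the HF replacement pairs rewrite tensor names into the ggml scheme, assuming the converter consumes the old/new pairs with something like strings.NewReplacer (an assumption for illustration, not confirmed by this diff). Only a subset of the pairs is used, and the input names are made up.

package main

import (
    "fmt"
    "strings"
)

func main() {
    // A subset of the HF-flavored replacement pairs from Replacements().
    hf := strings.NewReplacer(
        "lm_head", "output",
        "model.embed_tokens", "token_embd",
        "model.layers", "blk",
        "input_layernorm", "attn_norm",
        "self_attn.q_proj", "attn_q",
        "post_attention_layernorm", "ffn_norm",
        "mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
    )

    for _, name := range []string{
        "model.layers.0.self_attn.q_proj.weight",          // -> blk.0.attn_q.weight
        "model.layers.0.mlp.experts.gate_up_proj_blocks",  // -> blk.0.ffn_gate_up_exps.blocks
        "lm_head.weight",                                  // -> output.weight
    } {
        fmt.Println(hf.Replace(name))
    }
}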