gpt-oss: convert from hugging face format (#11907)
@@ -15,19 +15,24 @@ import (
 type gptossModel struct {
 	ModelParameters
-	HiddenLayers         uint32  `json:"num_hidden_layers"`
-	HiddenSize           uint32  `json:"hidden_size"`
-	IntermediateSize     uint32  `json:"intermediate_size"`
-	AttentionHeads       uint32  `json:"num_attention_heads"`
-	KeyValueHeads        uint32  `json:"num_key_value_heads"`
-	HeadDim              uint32  `json:"head_dim"`
-	Experts              uint32  `json:"num_experts"`
-	ExpertsPerToken      uint32  `json:"experts_per_token"`
-	RMSNormEpsilon       float32 `json:"rms_norm_eps"`
-	InitialContextLength uint32  `json:"initial_context_length"`
-	RopeTheta            float32 `json:"rope_theta"`
-	RopeScalingFactor    float32 `json:"rope_scaling_factor"`
-	SlidingWindow        uint32  `json:"sliding_window"`
+	HiddenLayers          uint32  `json:"num_hidden_layers"`
+	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
+	HiddenSize            uint32  `json:"hidden_size"`
+	IntermediateSize      uint32  `json:"intermediate_size"`
+	AttentionHeads        uint32  `json:"num_attention_heads"`
+	KeyValueHeads         uint32  `json:"num_key_value_heads"`
+	HeadDim               uint32  `json:"head_dim"`
+	Experts               uint32  `json:"num_experts"`
+	LocalExperts          uint32  `json:"num_local_experts"`
+	ExpertsPerToken       uint32  `json:"experts_per_token"`
+	RMSNormEpsilon        float32 `json:"rms_norm_eps"`
+	InitialContextLength  uint32  `json:"initial_context_length"`
+	RopeTheta             float32 `json:"rope_theta"`
+	RopeScalingFactor     float32 `json:"rope_scaling_factor"`
+	RopeScaling           struct {
+		Factor float32 `json:"factor"`
+	} `json:"rope_scaling"`
+	SlidingWindow uint32 `json:"sliding_window"`
 }
 
 var _ ModelConverter = (*gptossModel)(nil)
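The new fields let a single struct decode both config layouts: the gpt-oss reference release reports initial_context_length, num_experts, and rope_scaling_factor, while the Hugging Face export reports max_position_embeddings, num_local_experts, and a nested rope_scaling.factor. A minimal sketch of how one struct absorbs either config.json; the sample values below are illustrative, not taken from a real checkpoint:

package main

import (
	"encoding/json"
	"fmt"
)

// cfg reuses the same JSON tags as the converter's gptossModel struct;
// only the fields needed for this illustration are included.
type cfg struct {
	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
	InitialContextLength  uint32  `json:"initial_context_length"`
	Experts               uint32  `json:"num_experts"`
	LocalExperts          uint32  `json:"num_local_experts"`
	RopeScalingFactor     float32 `json:"rope_scaling_factor"`
	RopeScaling           struct {
		Factor float32 `json:"factor"`
	} `json:"rope_scaling"`
}

func main() {
	// gpt-oss reference config: only the original field names are present.
	reference := `{"initial_context_length": 4096, "num_experts": 32, "rope_scaling_factor": 32.0}`
	// Hugging Face flavored config: the alternate field names are present.
	hf := `{"max_position_embeddings": 131072, "num_local_experts": 32, "rope_scaling": {"factor": 32.0}}`

	for _, raw := range []string{reference, hf} {
		var c cfg
		if err := json.Unmarshal([]byte(raw), &c); err != nil {
			panic(err)
		}
		fmt.Printf("%+v\n", c)
	}
}

Fields missing from a given config simply decode to their zero value, which is what the cmp.Or fallbacks in the next hunks rely on.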
@@ -36,11 +41,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "gptoss"
 	kv["general.file_type"] = uint32(4)
-	kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
+	kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
 	kv["gptoss.block_count"] = m.HiddenLayers
 	kv["gptoss.embedding_length"] = m.HiddenSize
 	kv["gptoss.feed_forward_length"] = m.IntermediateSize
-	kv["gptoss.expert_count"] = m.Experts
+	kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
 	kv["gptoss.expert_used_count"] = m.ExpertsPerToken
 	kv["gptoss.attention.head_count"] = m.AttentionHeads
 	kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -49,7 +54,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
 	kv["gptoss.attention.sliding_window"] = m.SlidingWindow
 	kv["gptoss.rope.freq_base"] = m.RopeTheta
-	kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
+	kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
 	kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
 	kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
 	kv["tokenizer.ggml.add_bos_token"] = false
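Each cmp.Or call above returns its first argument that is not the zero value, so the Hugging Face fields only take effect when the reference-config fields are absent (and therefore decoded as 0). A small self-contained sketch of that fallback behavior; the literal values are made up for illustration:

package main

import (
	"cmp"
	"fmt"
)

func main() {
	// Reference config present: Experts is non-zero and wins.
	fmt.Println(cmp.Or(uint32(32), uint32(0))) // 32

	// HF config only: Experts decoded as 0, so LocalExperts is used.
	fmt.Println(cmp.Or(uint32(0), uint32(32))) // 32

	// Same idea for the rope scaling factor fallback.
	fmt.Println(cmp.Or(float32(0), float32(32.0))) // 32
}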
@@ -92,6 +97,11 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	for name, mxfp4 := range mxfp4s {
 		dims := mxfp4.blocks.Shape()
 
+		if !strings.HasSuffix(name, ".weight") {
+			name += ".weight"
+		}
+
 		out = append(out, &ggml.Tensor{
 			Name: name,
 			Kind: uint32(ggml.TensorTypeMXFP4),
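The added check keeps GGUF tensor names uniform: the repacked MXFP4 expert tensors get a trailing ".weight" only when it is not already present. A minimal sketch of that normalization; the helper name and example tensor names are hypothetical, not taken from the converter:

package main

import (
	"fmt"
	"strings"
)

// ensureWeightSuffix mirrors the check added above: append ".weight"
// unless the name already ends with it.
func ensureWeightSuffix(name string) string {
	if !strings.HasSuffix(name, ".weight") {
		name += ".weight"
	}
	return name
}

func main() {
	fmt.Println(ensureWeightSuffix("blk.0.ffn_gate_up_exps"))     // blk.0.ffn_gate_up_exps.weight
	fmt.Println(ensureWeightSuffix("blk.0.ffn_down_exps.weight")) // unchanged
}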
@@ -104,25 +114,47 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 }
 
 func (m *gptossModel) Replacements() []string {
-	return []string{
-		// noop replacements so other replacements will not be applied
-		".blocks", ".blocks",
-		".scales", ".scales",
-		// real replacements
-		"block", "blk",
-		"attn.norm", "attn_norm",
-		"attn.qkv", "attn_qkv",
-		"attn.sinks", "attn_sinks",
-		"attn.out", "attn_out",
-		"mlp.norm", "ffn_norm",
-		"mlp.gate", "ffn_gate_inp",
-		"mlp.mlp1_", "ffn_gate_up_exps.",
-		"mlp.mlp2_", "ffn_down_exps.",
-		"embedding", "token_embd",
-		"norm", "output_norm",
-		"unembedding", "output",
-		"scale", "weight",
+	var replacements []string
+	if m.MaxPositionEmbeddings > 0 {
+		// hf flavored model
+		replacements = []string{
+			"lm_head", "output",
+			"model.embed_tokens", "token_embd",
+			"model.layers", "blk",
+			"input_layernorm", "attn_norm",
+			"self_attn.q_proj", "attn_q",
+			"self_attn.k_proj", "attn_k",
+			"self_attn.v_proj", "attn_v",
+			"self_attn.o_proj", "attn_out",
+			"self_attn.sinks", "attn_sinks",
+			"post_attention_layernorm", "ffn_norm",
+			"mlp.router", "ffn_gate_inp",
+			"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
+			"mlp.experts.down_proj_", "ffn_down_exps.",
+			"model.norm", "output_norm",
+		}
+	} else {
+		replacements = []string{
+			// noop replacements so other replacements will not be applied
+			".blocks", ".blocks",
+			".scales", ".scales",
+			// real replacements
+			"block", "blk",
+			"attn.norm", "attn_norm",
+			"attn.qkv", "attn_qkv",
+			"attn.sinks", "attn_sinks",
+			"attn.out", "attn_out",
+			"mlp.norm", "ffn_norm",
+			"mlp.gate", "ffn_gate_inp",
+			"mlp.mlp1_", "ffn_gate_up_exps.",
+			"mlp.mlp2_", "ffn_down_exps.",
+			"embedding", "token_embd",
+			"norm", "output_norm",
+			"unembedding", "output",
+			"scale", "weight",
+		}
 	}
+	return replacements
 }
 
 type mxfp4 struct {
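The returned pairs are consumed as ordered old/new substring substitutions when source tensor names are mapped to GGUF names; MaxPositionEmbeddings > 0 is used as the tell that the checkpoint is the Hugging Face export rather than the reference release. Whether the converter applies the pairs via strings.NewReplacer exactly like this is an assumption; the sketch below only illustrates how the hf-flavored table rewrites a couple of hypothetical safetensors names:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// A few pairs from the hf-flavored table above.
	r := strings.NewReplacer(
		"model.layers", "blk",
		"model.embed_tokens", "token_embd",
		"self_attn.q_proj", "attn_q",
		"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
	)

	// Hypothetical source tensor names, for illustration only.
	for _, name := range []string{
		"model.layers.0.self_attn.q_proj.weight",
		"model.embed_tokens.weight",
	} {
		fmt.Println(name, "->", r.Replace(name))
	}
}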