From 394b69dece6a7c7e26e31f5f3ad05cf71b5f66f5 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 2 Apr 2025 12:52:09 -0700
Subject: [PATCH] mistral3 quantization

---
 llama/llama.cpp/src/llama-arch.cpp            |  17 +++
 llama/llama.cpp/src/llama-arch.h              |   1 +
 llama/llama.cpp/src/llama-model.cpp           |   3 +
 llama/llama.cpp/src/llama-quant.cpp           |   9 +-
 ...tch => 0021-add-model-quantizations.patch} | 102 ++++++++++++++----
 5 files changed, 104 insertions(+), 28 deletions(-)
 rename llama/patches/{0021-gemma3-quantization.patch => 0021-add-model-quantizations.patch} (52%)

diff --git a/llama/llama.cpp/src/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp
index b443fcd3f..13a0a9888 100644
--- a/llama/llama.cpp/src/llama-arch.cpp
+++ b/llama/llama.cpp/src/llama-arch.cpp
@@ -65,6 +65,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_CHAMELEON,        "chameleon"        },
     { LLM_ARCH_SOLAR,            "solar"            },
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_MISTRAL3,         "mistral3"         },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
@@ -1371,6 +1372,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
         },
     },
+    {
+        LLM_ARCH_MISTRAL3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+        }
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
diff --git a/llama/llama.cpp/src/llama-arch.h b/llama/llama.cpp/src/llama-arch.h
index aad92a5d2..8476ae0a1 100644
--- a/llama/llama.cpp/src/llama-arch.h
+++ b/llama/llama.cpp/src/llama-arch.h
@@ -69,6 +69,7 @@ enum llm_arch {
     LLM_ARCH_CHAMELEON,
     LLM_ARCH_SOLAR,
     LLM_ARCH_WAVTOKENIZER_DEC,
+    LLM_ARCH_MISTRAL3,
     LLM_ARCH_UNKNOWN,
 };
 
diff --git a/llama/llama.cpp/src/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp
index 701830418..db4f2685d 100644
--- a/llama/llama.cpp/src/llama-model.cpp
+++ b/llama/llama.cpp/src/llama-model.cpp
@@ -1277,6 +1277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
             } break;
+        case LLM_ARCH_MISTRAL3: break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
@@ -3537,6 +3538,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
                     output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
                 } break;
+            case LLM_ARCH_MISTRAL3: break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -4015,6 +4017,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
         case LLM_ARCH_GRANITE_MOE:
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_SOLAR:
+        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
diff --git a/llama/llama.cpp/src/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp
index d2f3a5108..ebcbafa1c 100644
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@@ -738,13 +738,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
         // don't quantize vision stuff
-        quantize &= name.find("v.blk.") == std::string::npos;
-
-        quantize &= name.find("mm.mm_input_projection.weight") == std::string::npos;
-        quantize &= name.find("mm.mm_soft_emb_norm.weight") == std::string::npos;
-        quantize &= name.find("v.patch_embedding.weight") == std::string::npos;
-        quantize &= name.find("v.position_embedding.weight") == std::string::npos;
-        quantize &= name.find("v.post_layernorm.weight") == std::string::npos;
+        quantize &= name.find("v.") == std::string::npos;
+        quantize &= name.find("mm.") == std::string::npos;
 
         // quantize only 2D and 3D tensors (experts)
         quantize &= (ggml_n_dims(tensor) >= 2);
diff --git a/llama/patches/0021-gemma3-quantization.patch b/llama/patches/0021-add-model-quantizations.patch
similarity index 52%
rename from llama/patches/0021-gemma3-quantization.patch
rename to llama/patches/0021-add-model-quantizations.patch
index 4f6dbc11b..cdc35a412 100644
--- a/llama/patches/0021-gemma3-quantization.patch
+++ b/llama/patches/0021-add-model-quantizations.patch
@@ -1,17 +1,19 @@
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Patrick Devine
 Date: Fri, 14 Mar 2025 16:33:23 -0700
-Subject: [PATCH] gemma3 quantization
+Subject: [PATCH] add model quantizations
 
+- gemma3
+- mistral3
 ---
- src/llama-arch.cpp  | 19 +++++++++++++++++++
- src/llama-arch.h    |  1 +
- src/llama-model.cpp |  7 +++++++
- src/llama-quant.cpp |  9 +++++++++
- 4 files changed, 36 insertions(+)
+ src/llama-arch.cpp  | 36 ++++++++++++++++++++++++++++++++++++
+ src/llama-arch.h    |  2 ++
+ src/llama-model.cpp | 10 ++++++++++
+ src/llama-quant.cpp |  4 ++++
+ 4 files changed, 52 insertions(+)
 
 diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
-index b6f20286..b443fcd3 100644
+index b6f20286..13a0a988 100644
 --- a/src/llama-arch.cpp
 +++ b/src/llama-arch.cpp
 @@ -37,6 +37,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -22,7 +24,15 @@
      { LLM_ARCH_STARCODER2,       "starcoder2"       },
      { LLM_ARCH_MAMBA,            "mamba"            },
      { LLM_ARCH_XVERSE,           "xverse"           },
-@@ -804,6 +805,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+@@ -64,6 +65,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+     { LLM_ARCH_CHAMELEON,        "chameleon"        },
+     { LLM_ARCH_SOLAR,            "solar"            },
+     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
++    { LLM_ARCH_MISTRAL3,         "mistral3"         },
+     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
+ };
+ 
+@@ -804,6 +806,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
          },
      },
@@ -47,8 +57,31 @@
      {
          LLM_ARCH_STARCODER2,
          {
+@@ -1352,6 +1372,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
+             { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
+         },
+     },
++    {
++        LLM_ARCH_MISTRAL3,
++        {
++            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
++            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
++            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
++            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
++            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
++            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
++            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
++            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
++            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
++            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
++            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
++        }
++    },
+     {
+         LLM_ARCH_UNKNOWN,
+         {
 diff --git a/src/llama-arch.h b/src/llama-arch.h
-index ec742224..aad92a5d 100644
+index ec742224..8476ae0a 100644
 --- a/src/llama-arch.h
 +++ b/src/llama-arch.h
 @@ -41,6 +41,7 @@ enum llm_arch {
@@ -59,8 +92,16 @@
      LLM_ARCH_STARCODER2,
      LLM_ARCH_MAMBA,
      LLM_ARCH_XVERSE,
+@@ -68,6 +69,7 @@ enum llm_arch {
+     LLM_ARCH_CHAMELEON,
+     LLM_ARCH_SOLAR,
+     LLM_ARCH_WAVTOKENIZER_DEC,
++    LLM_ARCH_MISTRAL3,
+     LLM_ARCH_UNKNOWN,
+ };
+ 
 diff --git a/src/llama-model.cpp b/src/llama-model.cpp
-index ab1a07d1..70183041 100644
+index ab1a07d1..db4f2685 100644
 --- a/src/llama-model.cpp
 +++ b/src/llama-model.cpp
 @@ -878,6 +878,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -73,7 +114,15 @@
          case LLM_ARCH_STARCODER2:
              {
                  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-@@ -2537,6 +2540,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+@@ -1274,6 +1277,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
+                 ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
+                 ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
+             } break;
++        case LLM_ARCH_MISTRAL3: break;
+         default: throw std::runtime_error("unsupported model architecture");
+     }
+ 
+@@ -2537,6 +2541,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                      layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                  }
              } break;
@@ -83,7 +132,23 @@
          case LLM_ARCH_STARCODER2:
              {
                  tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-@@ -4029,6 +4035,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
+@@ -3531,6 +3538,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
+                     output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
+                     output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
+                 } break;
++            case LLM_ARCH_MISTRAL3: break;
+             default:
+                 throw std::runtime_error("unknown architecture");
+         }
+@@ -4009,6 +4017,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
+         case LLM_ARCH_GRANITE_MOE:
+         case LLM_ARCH_CHAMELEON:
+         case LLM_ARCH_SOLAR:
++        case LLM_ARCH_MISTRAL3:
+             return LLAMA_ROPE_TYPE_NORM;
+ 
+         // the pairs of head values are offset by n_rot/2
+@@ -4029,6 +4038,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
          case LLM_ARCH_PHIMOE:
          case LLM_ARCH_GEMMA:
          case LLM_ARCH_GEMMA2:
@@ -92,21 +157,16 @@
          case LLM_ARCH_OPENELM:
          case LLM_ARCH_GPTNEOX:
 diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
-index 6eb1da08..d2f3a510 100644
+index 6eb1da08..ebcbafa1 100644
 --- a/src/llama-quant.cpp
 +++ b/src/llama-quant.cpp
-@@ -737,6 +737,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
+@@ -737,6 +737,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
          // This used to be a regex, but <regex> has an extreme cost to compile times.
          bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
 +        // don't quantize vision stuff
-+        quantize &= name.find("v.blk.") == std::string::npos;
-+
-+        quantize &= name.find("mm.mm_input_projection.weight") == std::string::npos;
-+        quantize &= name.find("mm.mm_soft_emb_norm.weight") == std::string::npos;
-+        quantize &= name.find("v.patch_embedding.weight") == std::string::npos;
-+        quantize &= name.find("v.position_embedding.weight") == std::string::npos;
-+        quantize &= name.find("v.post_layernorm.weight") == std::string::npos;
++        quantize &= name.find("v.") == std::string::npos;
++        quantize &= name.find("mm.") == std::string::npos;
 +
          // quantize only 2D and 3D tensors (experts)
          quantize &= (ggml_n_dims(tensor) >= 2);