/** * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file * * MIT License * * Copyright (c) 2023-2024 The ggml authors * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include "llama.h" #include "llama-arch.h" #include "llama-hparams.h" #include "llama-vocab.h" #include "llama-mmap.h" #include "ggml-cpp.h" #include #include // available models // TODO: this enum does not follow the enum naming convention enum llm_type { MODEL_UNKNOWN, MODEL_14M, MODEL_17M, MODEL_22M, MODEL_33M, MODEL_60M, MODEL_70M, MODEL_80M, MODEL_109M, MODEL_137M, MODEL_160M, MODEL_220M, MODEL_250M, MODEL_270M, MODEL_335M, MODEL_410M, MODEL_450M, MODEL_770M, MODEL_780M, MODEL_0_5B, MODEL_1B, MODEL_1_3B, MODEL_1_4B, MODEL_1_5B, MODEL_1_6B, MODEL_2B, MODEL_2_8B, MODEL_3B, MODEL_4B, MODEL_6B, MODEL_6_9B, MODEL_7B, MODEL_8B, MODEL_9B, MODEL_11B, MODEL_12B, MODEL_13B, MODEL_14B, MODEL_15B, MODEL_16B, MODEL_20B, MODEL_22B, MODEL_30B, MODEL_32B, MODEL_34B, MODEL_35B, MODEL_40B, MODEL_65B, MODEL_70B, MODEL_90B, MODEL_236B, MODEL_314B, MODEL_671B, MODEL_SMALL, MODEL_MEDIUM, MODEL_LARGE, MODEL_XL, MODEL_A1_7B, MODEL_A2_7B, MODEL_8x7B, MODEL_8x22B, MODEL_16x12B, MODEL_10B_128x3_66B, MODEL_57B_A14B, MODEL_27B, }; struct llama_layer_posnet { // resnet struct ggml_tensor * norm1 = nullptr; struct ggml_tensor * norm1_b = nullptr; struct ggml_tensor * conv1 = nullptr; struct ggml_tensor * conv1_b = nullptr; struct ggml_tensor * norm2 = nullptr; struct ggml_tensor * norm2_b = nullptr; struct ggml_tensor * conv2 = nullptr; struct ggml_tensor * conv2_b = nullptr; // attention struct ggml_tensor * attn_norm = nullptr; struct ggml_tensor * attn_norm_b = nullptr; struct ggml_tensor * attn_q = nullptr; struct ggml_tensor * attn_q_b = nullptr; struct ggml_tensor * attn_k = nullptr; struct ggml_tensor * attn_k_b = nullptr; struct ggml_tensor * attn_v = nullptr; struct ggml_tensor * attn_v_b = nullptr; struct ggml_tensor * attn_o = nullptr; struct ggml_tensor * attn_o_b = nullptr; // normalize struct ggml_tensor * norm = nullptr; struct ggml_tensor * norm_b = nullptr; }; struct llama_layer_convnext { struct ggml_tensor * dw = nullptr; struct ggml_tensor * dw_b = nullptr; struct ggml_tensor * norm = nullptr; struct ggml_tensor * norm_b = nullptr; struct ggml_tensor * pw1 = nullptr; struct ggml_tensor * pw1_b = nullptr; struct ggml_tensor * pw2 = nullptr; struct ggml_tensor * pw2_b = nullptr; struct ggml_tensor * gamma = nullptr; }; struct llama_layer { // normalization struct ggml_tensor * attn_norm = nullptr; struct ggml_tensor * attn_norm_b = nullptr; struct ggml_tensor * attn_norm_2 = nullptr; struct ggml_tensor * attn_norm_2_b = nullptr; struct ggml_tensor * attn_q_norm = nullptr; struct ggml_tensor * attn_q_norm_b = nullptr; struct ggml_tensor * attn_k_norm = nullptr; struct ggml_tensor * attn_k_norm_b = nullptr; struct ggml_tensor * attn_out_norm = nullptr; struct ggml_tensor * attn_out_norm_b = nullptr; struct ggml_tensor * attn_q_a_norm = nullptr; struct ggml_tensor * attn_kv_a_norm = nullptr; struct ggml_tensor * attn_sub_norm = nullptr; struct ggml_tensor * attn_post_norm = nullptr; struct ggml_tensor * ffn_sub_norm = nullptr; struct ggml_tensor * attn_norm_cross = nullptr; struct ggml_tensor * attn_norm_enc = nullptr; // attention struct ggml_tensor * wq = nullptr; struct ggml_tensor * wk = nullptr; struct ggml_tensor * wv = nullptr; struct ggml_tensor * wo = nullptr; struct ggml_tensor * wqkv = nullptr; struct ggml_tensor * wq_a = nullptr; struct ggml_tensor * wq_b = nullptr; struct ggml_tensor * wkv_a_mqa = nullptr; struct ggml_tensor * wkv_b = nullptr; struct ggml_tensor * wq_cross = nullptr; struct ggml_tensor * wk_cross = nullptr; struct ggml_tensor * wv_cross = nullptr; struct ggml_tensor * wo_cross = nullptr; struct ggml_tensor * wq_enc = nullptr; struct ggml_tensor * wk_enc = nullptr; struct ggml_tensor * wv_enc = nullptr; struct ggml_tensor * wo_enc = nullptr; // attention bias struct ggml_tensor * bq = nullptr; struct ggml_tensor * bk = nullptr; struct ggml_tensor * bv = nullptr; struct ggml_tensor * bo = nullptr; struct ggml_tensor * bqkv = nullptr; // relative position bias struct ggml_tensor * attn_rel_b = nullptr; struct ggml_tensor * attn_rel_b_enc = nullptr; struct ggml_tensor * attn_rel_b_cross = nullptr; // normalization struct ggml_tensor * ffn_norm = nullptr; struct ggml_tensor * ffn_norm_b = nullptr; struct ggml_tensor * ffn_post_norm = nullptr; struct ggml_tensor * layer_out_norm = nullptr; struct ggml_tensor * layer_out_norm_b = nullptr; struct ggml_tensor * ffn_norm_exps = nullptr; struct ggml_tensor * ffn_norm_enc = nullptr; // ff struct ggml_tensor * ffn_gate = nullptr; // w1 struct ggml_tensor * ffn_down = nullptr; // w2 struct ggml_tensor * ffn_up = nullptr; // w3 struct ggml_tensor * ffn_gate_enc = nullptr; struct ggml_tensor * ffn_down_enc = nullptr; struct ggml_tensor * ffn_up_enc = nullptr; // ff MoE struct ggml_tensor * ffn_gate_inp = nullptr; struct ggml_tensor * ffn_gate_exps = nullptr; struct ggml_tensor * ffn_down_exps = nullptr; struct ggml_tensor * ffn_up_exps = nullptr; // ff shared expert (shexp) struct ggml_tensor * ffn_gate_inp_shexp = nullptr; struct ggml_tensor * ffn_gate_shexp = nullptr; struct ggml_tensor * ffn_down_shexp = nullptr; struct ggml_tensor * ffn_up_shexp = nullptr; // ff bias struct ggml_tensor * ffn_gate_b = nullptr; struct ggml_tensor * ffn_down_b = nullptr; // b2 struct ggml_tensor * ffn_up_b = nullptr; // b3 struct ggml_tensor * ffn_act = nullptr; struct ggml_tensor * ffn_exp_probs_b = nullptr; // mamba proj struct ggml_tensor * ssm_in = nullptr; struct ggml_tensor * ssm_x = nullptr; struct ggml_tensor * ssm_dt = nullptr; struct ggml_tensor * ssm_out = nullptr; // mamba struct ggml_tensor * ssm_conv1d = nullptr; struct ggml_tensor * ssm_a = nullptr; struct ggml_tensor * ssm_d = nullptr; // mamba bias struct ggml_tensor * ssm_conv1d_b = nullptr; struct ggml_tensor * ssm_dt_b = nullptr; // rwkv struct ggml_tensor * time_mix_w1 = nullptr; struct ggml_tensor * time_mix_w2 = nullptr; struct ggml_tensor * time_mix_lerp_x = nullptr; struct ggml_tensor * time_mix_lerp_w = nullptr; struct ggml_tensor * time_mix_lerp_k = nullptr; struct ggml_tensor * time_mix_lerp_v = nullptr; struct ggml_tensor * time_mix_lerp_r = nullptr; struct ggml_tensor * time_mix_lerp_g = nullptr; struct ggml_tensor * time_mix_first = nullptr; struct ggml_tensor * time_mix_decay = nullptr; struct ggml_tensor * time_mix_decay_w1 = nullptr; struct ggml_tensor * time_mix_decay_w2 = nullptr; struct ggml_tensor * time_mix_key = nullptr; struct ggml_tensor * time_mix_value = nullptr; struct ggml_tensor * time_mix_receptance = nullptr; struct ggml_tensor * time_mix_gate = nullptr; struct ggml_tensor * time_mix_ln = nullptr; struct ggml_tensor * time_mix_ln_b = nullptr; struct ggml_tensor * time_mix_output = nullptr; struct ggml_tensor * channel_mix_lerp_k = nullptr; struct ggml_tensor * channel_mix_lerp_r = nullptr; struct ggml_tensor * channel_mix_key = nullptr; struct ggml_tensor * channel_mix_receptance = nullptr; struct ggml_tensor * channel_mix_value = nullptr; // long rope factors struct ggml_tensor * rope_long = nullptr; struct ggml_tensor * rope_short = nullptr; struct ggml_tensor * rope_freqs = nullptr; // bitnet scale struct ggml_tensor * wq_scale = nullptr; struct ggml_tensor * wk_scale = nullptr; struct ggml_tensor * wv_scale = nullptr; struct ggml_tensor * wo_scale = nullptr; struct ggml_tensor * ffn_gate_scale = nullptr; struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr; struct ggml_tensor * bskcn_tv = nullptr; // cross attention struct ggml_tensor * cross_attn_k_norm = nullptr; struct ggml_tensor * cross_attn_k_proj = nullptr; struct ggml_tensor * cross_attn_o_proj = nullptr; struct ggml_tensor * cross_attn_q_norm = nullptr; struct ggml_tensor * cross_attn_q_proj = nullptr; struct ggml_tensor * cross_attn_v_proj = nullptr; struct ggml_tensor * cross_attn_attn_gate = nullptr; struct ggml_tensor * cross_attn_mlp_gate = nullptr; struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; }; struct llama_model { llm_type type = MODEL_UNKNOWN; llm_arch arch = LLM_ARCH_UNKNOWN; llama_ftype ftype = LLAMA_FTYPE_ALL_F32; std::string name = "n/a"; llama_hparams hparams = {}; llama_vocab vocab; struct ggml_tensor * tok_embd = nullptr; struct ggml_tensor * type_embd = nullptr; struct ggml_tensor * pos_embd = nullptr; struct ggml_tensor * tok_norm = nullptr; struct ggml_tensor * tok_norm_b = nullptr; struct ggml_tensor * output_norm = nullptr; struct ggml_tensor * output_norm_b = nullptr; struct ggml_tensor * output = nullptr; struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; // classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; struct ggml_tensor * conv1d = nullptr; struct ggml_tensor * conv1d_b = nullptr; std::vector layers; // gguf metadata std::unordered_map gguf_kv; llama_split_mode split_mode; int main_gpu; int n_gpu_layers; std::vector rpc_servers; // list of devices used in this model std::vector devices; // lists of buffer types used for each layer using buft_list_t = std::vector>; buft_list_t cpu_buft_list; std::map gpu_buft_list; struct layer_dev { ggml_backend_dev_t dev; buft_list_t * buft_list; }; layer_dev dev_input = {}; layer_dev dev_output = {}; std::vector dev_layer; // contexts where the model tensors metadata is stored std::vector ctxs; // the model memory buffers for the tensor data std::vector bufs; // model memory mapped files llama_mmaps mappings; // objects representing data potentially being locked in memory llama_mlocks mlock_bufs; llama_mlocks mlock_mmaps; // for quantize-stats only std::vector> tensors_by_name; int64_t t_load_us = 0; int64_t t_start_us = 0; // total number of parameters in the model uint64_t n_elements = 0; // total size of all the tensors in the model in bytes size_t n_bytes = 0; }; const char * llm_type_name(llm_type type); std::string llama_model_arch_name (const llama_model & model); std::string llama_model_type_name (const llama_model & model); std::string llama_model_ftype_name(const llama_model & model); template bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) { ggml_init_params params = { /*.mem_size =*/ ggml_tensor_overhead()*8, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; ggml_context_ptr ctx { ggml_init(params) }; if (!ctx) { throw std::runtime_error("failed to create ggml context"); } ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) }; ggml_tensor * op_tensor = fn(ctx.get()); for (int i = 0; i < GGML_MAX_SRC; i++) { if (op_tensor->src[i] != nullptr) { op_tensor->src[i]->buffer = buf.get(); } } bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor); return op_supported; } template ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) { for (const auto & cur : buft_list) { ggml_backend_dev_t cur_dev = cur.first; ggml_backend_buffer_type_t cur_buft = cur.second; if (buft_supported(cur_buft, cur_dev, fn)) { return cur_buft; } } throw std::runtime_error("no suitable buffer type found"); } // used by llama_adapter_cvec ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il); // used by llama_adapter_lora struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name); size_t llama_model_max_nodes(const llama_model & model); struct llama_model_loader; // TODO: become llama_model methods void llm_load_stats (llama_model_loader & ml, llama_model & model); void llm_load_arch (llama_model_loader & ml, llama_model & model); void llm_load_hparams (llama_model_loader & ml, llama_model & model); void llm_load_vocab (llama_model_loader & ml, llama_model & model); void llm_load_print_meta(llama_model_loader & ml, llama_model & model);