Mirror of https://github.com/ollama/ollama.git (synced 2025-11-12 00:37:41 +01:00)
Update GGML to b6646 (#12245)
Notable EOLs with this change:
- macOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
llama/llama.cpp/include/llama.h (vendored) · 199 changed lines
@@ -64,8 +64,6 @@ extern "C" {
 
     typedef struct llama_memory_i * llama_memory_t;
 
-    struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
-
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
     typedef int32_t llama_seq_id;
@@ -181,6 +179,14 @@ extern "C" {
         LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
     };
 
+    enum llama_flash_attn_type {
+        LLAMA_FLASH_ATTN_TYPE_AUTO     = -1,
+        LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+        LLAMA_FLASH_ATTN_TYPE_ENABLED  = 1,
+    };
+
+    LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -200,7 +206,7 @@ extern "C" {
         llama_token_data * data;
         size_t size;
         int64_t selected;  // this is the index in the data array (i.e. not the token id)
-        bool sorted;
+        bool sorted;       // note: do not assume the data is sorted - always check this flag
     } llama_token_data_array;
 
     typedef bool (*llama_progress_callback)(float progress, void * user_data);
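The new comment on `sorted` matters for custom samplers. A minimal sketch (not part of this commit; the helper name is made up) of respecting the flag when picking the top candidate:

    #include "llama.h"

    // pick the highest-logit candidate without assuming the array is ordered
    static llama_token pick_top_token(const llama_token_data_array * cur_p) {
        if (cur_p->sorted) {
            // sorted descending by logit, so the best candidate is at index 0
            return cur_p->data[0].id;
        }
        size_t best = 0;
        for (size_t i = 1; i < cur_p->size; ++i) {
            if (cur_p->data[i].logit > cur_p->data[best].logit) {
                best = i;
            }
        }
        return cur_p->data[best].id;
    }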
@@ -305,6 +311,7 @@ extern "C" {
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type    attention_type;    // attention type to use for embeddings
+        enum llama_flash_attn_type   flash_attn_type;   // when to enable Flash Attention
 
         // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float     rope_freq_base;   // RoPE base frequency, 0 = from model
@@ -314,7 +321,7 @@ extern "C" {
         float     yarn_beta_fast;   // YaRN low correction dim
         float     yarn_beta_slow;   // YaRN high correction dim
         uint32_t  yarn_orig_ctx;    // YaRN original context size
-        float     defrag_thold;     // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
+        float     defrag_thold;     // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
@@ -331,7 +338,6 @@ extern "C" {
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // use flash attention [EXPERIMENTAL]
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
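Taken together with the new `llama_flash_attn_type` enum above, the removal of `bool flash_attn` changes how callers opt in. A hedged migration sketch (the helper name is illustrative, not from the commit):

    #include <stdio.h>
    #include "llama.h"

    static struct llama_context_params make_ctx_params(void) {
        struct llama_context_params params = llama_context_default_params();
        // previously: params.flash_attn = true;
        params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // let the backend decide; _ENABLED forces it on
        printf("flash attention: %s\n", llama_flash_attn_type_name(params.flash_attn_type));
        return params;
    }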
@@ -469,8 +475,6 @@ extern "C" {
     LLAMA_API llama_memory_t           llama_get_memory  (const struct llama_context * ctx);
     LLAMA_API enum llama_pooling_type  llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
 
-    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
-
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type      llama_model_rope_type(const struct llama_model * model);
 
@@ -557,10 +561,32 @@ extern "C" {
             struct llama_model * model,
             const char * path_lora);
 
+    // Functions to access the adapter's GGUF metadata scalar values
+    //  - The functions return the length of the string on success, or -1 on failure
+    //  - The output string is always null-terminated and cleared on failure
+    //  - When retrieving a string, an extra byte must be allocated to account for the null terminator
+    //  - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+
+    // Get metadata key name by index
+    LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
 
+    // Get the invocation tokens if the current lora is an alora
+    LLAMA_API uint64_t            llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
+    LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens  (const struct llama_adapter_lora * adapter);
+
     // The following functions operate on a llama_context, hence the naming: llama_verb_...
 
     // Add a loaded LoRA adapter to given context
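A small usage sketch for the new adapter metadata accessors (assumes `adapter` came from `llama_adapter_lora_init`; buffer sizes are arbitrary):

    #include <stdio.h>
    #include "llama.h"

    static void dump_adapter_meta(const struct llama_adapter_lora * adapter) {
        const int32_t n = llama_adapter_meta_count(adapter);
        for (int32_t i = 0; i < n; ++i) {
            char key[128];
            char val[256]; // sized with room for the null terminator, per the comments above
            if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) < 0) {
                continue; // -1 on failure, buffer is cleared
            }
            if (llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) < 0) {
                continue;
            }
            printf("%s = %s\n", key, val);
        }
    }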
@@ -667,111 +693,6 @@ extern "C" {
     // Check if the memory supports shifting
     LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
 
-    //
-    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
-    //
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
-            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
-            "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
-
-    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    DEPRECATED(LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx),
-            "Use llama_memory_clear() instead");
-
-    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
-    //   seq_id < 0 : match any sequence
-    //   p0 < 0     : [0,  p1]
-    //   p1 < 0     : [p0, inf)
-    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1),
-            "Use llama_memory_seq_rm() instead");
-
-    // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
-    //   p0 < 0 : [0,  p1]
-    //   p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id_src,
-                    llama_seq_id   seq_id_dst,
-                       llama_pos   p0,
-                       llama_pos   p1),
-            "Use llama_memory_seq_cp() instead");
-
-    // Removes all tokens that do not belong to the specified sequence
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "Use llama_memory_seq_keep() instead");
-
-    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   p0 < 0 : [0,  p1]
-    //   p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                       llama_pos   delta),
-            "Use llama_memory_seq_add() instead");
-
-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly:
-    //   - lazily on next llama_decode()
-    //   p0 < 0 : [0,  p1]
-    //   p1 < 0 : [p0, inf)
-    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d),
-            "Use llama_memory_seq_div() instead");
-
-    // Returns the smallest position present in the KV cache for the specified sequence
-    // This is typically non-zero only for SWA caches
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "Use llama_memory_seq_pos_min() instead");
-
-    // Returns the largest position present in the KV cache for the specified sequence
-    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
-    // Return -1 if the sequence is empty
-    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id),
-            "Use llama_memory_seq_pos_max() instead");
-
-    // Defragment the KV cache
-    // This will be applied:
-    //   - lazily on next llama_decode()
-    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
-            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
-
-    // Check if the context supports KV cache shifting
-    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
-            "use llama_memory_can_shift() instead");
-
-    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
-            "simply remove this call, updates are applied lazily on the next llama_decode()");
-
     //
     // State / sessions
     //
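The whole deprecated `llama_kv_self_*` surface is removed; the replacements named in the deprecation strings all hang off `llama_get_memory()`. A hedged sketch of the typical rewrite (the helper name is made up):

    #include "llama.h"

    // drop everything from position p0 onwards for one sequence
    static void truncate_sequence(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0) {
        llama_memory_t mem = llama_get_memory(ctx);
        // previously: llama_kv_self_seq_rm(ctx, seq_id, p0, -1);
        llama_memory_seq_rm(mem, seq_id, p0, -1); // p1 < 0 means [p0, inf)
        // explicit llama_kv_self_defrag()/llama_kv_self_update() calls are simply dropped:
        // the context applies these lazily on the next llama_decode()
    }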
@@ -870,6 +791,29 @@ extern "C" {
                      size_t   n_token_capacity,
                      size_t * n_token_count_out);
 
+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+           llama_state_seq_flags   flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                          size_t   size,
+                    llama_seq_id   seq_id,
+           llama_state_seq_flags   flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                          size_t   size,
+                    llama_seq_id   dest_seq_id,
+           llama_state_seq_flags   flags);
+
     //
     // Decoding
     //
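A hedged sketch of the new `*_ext` state calls (the helper is illustrative and error handling is kept minimal), snapshotting only the SWA portion of a sequence via the new flag:

    #include <stdlib.h>
    #include "llama.h"

    static int snapshot_seq_swa(struct llama_context * ctx, llama_seq_id seq_id,
                                uint8_t ** out_buf, size_t * out_size) {
        const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_SWA_ONLY;

        const size_t size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
        uint8_t * buf = malloc(size);
        if (buf == NULL) {
            return -1;
        }
        const size_t written = llama_state_seq_get_data_ext(ctx, buf, size, seq_id, flags);
        if (written == 0) {
            free(buf);
            return -1;
        }
        // restore later with: llama_state_seq_set_data_ext(ctx, buf, written, seq_id, flags);
        *out_buf  = buf;
        *out_size = written;
        return 0;
    }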
@@ -1216,11 +1160,6 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
     LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 
-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax(void),
-        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
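With the deprecated `llama_sampler_init_softmax` now removed outright, a hedged sketch of the usual replacement is to build the chain without it, since the dist sampler normalizes the candidates it samples from (parameter values here are arbitrary):

    #include "llama.h"

    static struct llama_sampler * make_sampler(void) {
        struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
        llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
        return chain; // free with llama_sampler_free() when done
    }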
@@ -1390,24 +1329,25 @@ extern "C" {
     //
     // Performance utils
     //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
     struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
+        // ms == milliseconds
+        double t_start_ms;  // absolute start time
+        double t_load_ms;   // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms;   // time needed for generating tokens
 
-        int32_t n_p_eval;
-        int32_t n_eval;
-        int32_t n_reused; // number of times a ggml compute graph had been reused
+        int32_t n_p_eval;   // number of prompt tokens
+        int32_t n_eval;     // number of generated tokens
+        int32_t n_reused;   // number of times a ggml compute graph had been reused
     };
 
     struct llama_perf_sampler_data {
-        double t_sample_ms;
+        double t_sample_ms; // time needed for sampling in ms
 
-        int32_t n_sample;
+        int32_t n_sample;   // number of sampled tokens
     };
 
     LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
@@ -1419,6 +1359,9 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);
 
+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
     //
     // training
     //
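A short sketch tying the newly documented perf fields to the new memory breakdown call (the function name is illustrative, and the header still steers third-party apps toward their own measurements):

    #include <stdio.h>
    #include "llama.h"

    static void report_perf(const struct llama_context * ctx) {
        const struct llama_perf_context_data d = llama_perf_context(ctx);
        if (d.t_eval_ms > 0.0) {
            printf("decode: %d tokens, %.2f tok/s\n", d.n_eval, 1e3 * d.n_eval / d.t_eval_ms);
        }
        printf("prompt: %d tokens in %.2f ms (graphs reused: %d)\n", d.n_p_eval, d.t_p_eval_ms, d.n_reused);
        llama_memory_breakdown_print(ctx); // logged per device via LLAMA_LOG
    }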
@@ -1437,6 +1380,8 @@ extern "C" {
 
         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+
+        enum ggml_opt_optimizer_type optimizer_type;
     };
 
     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);