Mirror of https://github.com/ollama/ollama.git (synced 2025-11-11 23:07:55 +01:00)
Update GGML to b6646 (#12245)
Notable EOLs with this change:
- macOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
llama/llama.cpp/src/llama-context.h (vendored): 36 changed lines
@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;
 
+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;
 
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model   = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -46,10 +54,8 @@ struct llama_context {
 
     llama_memory_t get_memory() const;
 
-    // return true of the KV cache was updated
-    // TODO: remove
-    bool kv_self_update(bool optimize);
-    void kv_self_defrag_sched();
+    // return true if the memory was updated
+    bool memory_update(bool optimize);
 
     enum llama_pooling_type pooling_type() const;
 
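The KV-cache-specific kv_self_update()/kv_self_defrag_sched() pair above is collapsed into a single memory_update() entry point. A minimal migration sketch for an internal call site, assuming a valid llama_context * named ctx (a hypothetical variable, not part of this diff):

    // before this change:
    //   ctx->kv_self_update(/*optimize=*/true);
    //   ctx->kv_self_defrag_sched();
    // after this change:
    if (ctx->memory_update(/*optimize=*/true)) {
        // returned true: the memory module was modified, so any cached state
        // derived from it may be stale
    }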
@@ -111,9 +117,9 @@ struct llama_context {
     size_t state_get_data(      uint8_t * dst, size_t size);
     size_t state_set_data(const uint8_t * src, size_t size);
 
-    size_t state_seq_get_size(llama_seq_id seq_id);
-    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size);
-    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
+    size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size, llama_state_seq_flags flags);
+    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
 
     bool state_load_file(
             const char * filepath,
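The per-sequence state (de)serialization methods now take an explicit llama_state_seq_flags argument. A minimal sketch of snapshotting one sequence with the new signatures; it assumes an initialized llama_context * and passes 0 for the flags, and the helper name snapshot_seq is made up for illustration:

    #include <cstdint>
    #include <vector>

    #include "llama-context.h" // internal header shown in this diff

    // hypothetical helper: serialize one sequence's state into a byte buffer
    static std::vector<uint8_t> snapshot_seq(llama_context * ctx, llama_seq_id seq_id) {
        const llama_state_seq_flags flags = 0; // assumption: no special flags

        std::vector<uint8_t> buf(ctx->state_seq_get_size(seq_id, flags));
        ctx->state_seq_get_data(seq_id, buf.data(), buf.size(), flags);
        return buf;
    }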
@@ -146,12 +152,15 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //
 
     void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
 
+    // TODO: more flexible combinations of logical/physical batch size and context size
     void opt_epoch(
             ggml_opt_dataset_t      dataset,
             ggml_opt_result_t       result_train,
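The new memory_breakdown() accessor reports, per backend buffer type, the bytes attributed to model weights, context state, and temporary compute buffers (the llama_memory_breakdown_data struct added above). A minimal debug-print sketch, assuming an initialized llama_context; ggml_backend_buft_name() is ggml's existing helper for a buffer type's name, and the function below is made up for illustration:

    #include <cstdio>

    #include "llama-context.h" // internal header shown in this diff

    // hypothetical debug helper: dump per-buffer-type memory usage
    static void print_memory_breakdown(const llama_context & ctx) {
        for (const auto & [buft, mb] : ctx.memory_breakdown()) {
            std::printf("%s: model=%zu bytes, context=%zu bytes, compute=%zu bytes\n",
                    ggml_backend_buft_name(buft), mb.model, mb.context, mb.compute);
        }
    }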
@@ -197,7 +206,7 @@ public:
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);
 
     // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
+    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
 
 private:
     llm_graph_params graph_params(
@@ -212,8 +221,8 @@ private:
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i  & io);
 
-    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
-    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id);
+    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id, llama_state_seq_flags flags);
 
     //
     // members
@@ -229,9 +238,6 @@ private:
 
     std::unique_ptr<llama_memory_i> memory;
 
-    // TODO: temporary, until the llama_kv_self_defrag() API is removed
-    bool memory_force_optimize = false;
-
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t  logits_size = 0; // capacity (of floats) for logits
     float * logits      = nullptr;
@@ -287,10 +293,6 @@ private:
 
     bool has_evaluated_once = false;
 
-    // env: LLAMA_SET_ROWS (temporary)
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
     // env: LLAMA_GRAPH_REUSE_DISABLE
     bool graph_reuse_disable = false;
 