Mirror of https://github.com/ollama/ollama.git (synced 2025-11-11 23:07:55 +01:00)
Update GGML to b6646 (#12245)
Notable EOLs with this change:
- macOS v12 and v13 are no longer supported (v14+ required)
- AMD gfx900 and gfx906 are no longer supported
llama/llama.cpp/src/llama-context.h (vendored): 36 changed lines
@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;
 
+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;
 
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model   = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -46,10 +54,8 @@ struct llama_context {
 
     llama_memory_t get_memory() const;
 
-    // return true of the KV cache was updated
-    // TODO: remove
-    bool kv_self_update(bool optimize);
-    void kv_self_defrag_sched();
+    // return true if the memory was updated
+    bool memory_update(bool optimize);
 
     enum llama_pooling_type pooling_type() const;
 
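The KV-cache-specific kv_self_update()/kv_self_defrag_sched() pair above is collapsed into a single memory_update() entry point. A minimal migration sketch for an internal call site, assuming a valid llama_context * named ctx (a hypothetical variable, not part of this diff):

    // before this change:
    //   ctx->kv_self_update(/*optimize=*/true);
    //   ctx->kv_self_defrag_sched();
    // after this change:
    if (ctx->memory_update(/*optimize=*/true)) {
        // returned true: the memory module was modified, so any cached state
        // derived from it may be stale
    }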
@@ -111,9 +117,9 @@ struct llama_context {
     size_t state_get_data(      uint8_t * dst, size_t size);
     size_t state_set_data(const uint8_t * src, size_t size);
 
-    size_t state_seq_get_size(llama_seq_id seq_id);
-    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size);
-    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
+    size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size, llama_state_seq_flags flags);
+    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
 
     bool state_load_file(
             const char * filepath,
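The per-sequence state (de)serialization methods now take an explicit llama_state_seq_flags argument. A minimal sketch of snapshotting one sequence with the new signatures; it assumes an initialized llama_context * and passes 0 for the flags, and the helper name snapshot_seq is made up for illustration:

    #include <cstdint>
    #include <vector>

    #include "llama-context.h" // internal header shown in this diff

    // hypothetical helper: serialize one sequence's state into a byte buffer
    static std::vector<uint8_t> snapshot_seq(llama_context * ctx, llama_seq_id seq_id) {
        const llama_state_seq_flags flags = 0; // assumption: no special flags

        std::vector<uint8_t> buf(ctx->state_seq_get_size(seq_id, flags));
        ctx->state_seq_get_data(seq_id, buf.data(), buf.size(), flags);
        return buf;
    }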
@@ -146,12 +152,15 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //
 
     void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
 
+    // TODO: more flexible combinations of logical/physical batch size and context size
     void opt_epoch(
             ggml_opt_dataset_t      dataset,
             ggml_opt_result_t       result_train,
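The new memory_breakdown() accessor reports, per backend buffer type, the bytes attributed to model weights, context state, and temporary compute buffers (the llama_memory_breakdown_data struct added above). A minimal debug-print sketch, assuming an initialized llama_context; ggml_backend_buft_name() is ggml's existing helper for a buffer type's name, and the function below is made up for illustration:

    #include <cstdio>

    #include "llama-context.h" // internal header shown in this diff

    // hypothetical debug helper: dump per-buffer-type memory usage
    static void print_memory_breakdown(const llama_context & ctx) {
        for (const auto & [buft, mb] : ctx.memory_breakdown()) {
            std::printf("%s: model=%zu bytes, context=%zu bytes, compute=%zu bytes\n",
                    ggml_backend_buft_name(buft), mb.model, mb.context, mb.compute);
        }
    }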
@@ -197,7 +206,7 @@ public:
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);
 
     // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
+    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);
 
 private:
     llm_graph_params graph_params(
@@ -212,8 +221,8 @@ private:
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i  & io);
 
-    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
-    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id);
+    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id, llama_state_seq_flags flags);
 
     //
     // members
@@ -229,9 +238,6 @@ private:
 
     std::unique_ptr<llama_memory_i> memory;
 
-    // TODO: temporary, until the llama_kv_self_defrag() API is removed
-    bool memory_force_optimize = false;
-
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t  logits_size = 0; // capacity (of floats) for logits
     float * logits      = nullptr;
@@ -287,10 +293,6 @@ private:
 
     bool has_evaluated_once = false;
 
-    // env: LLAMA_SET_ROWS (temporary)
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
     // env: LLAMA_GRAPH_REUSE_DISABLE
     bool graph_reuse_disable = false;
 