ggml: Enable op_offload to improve partial offload performance

When a model is partially offloaded to system RAM, we can either
do the calculations on the CPU or we can temporarily transfer the
data to the GPU to do the calculations there. Small batches tend
to be better on the CPU, large batches on the GPU.
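Intuitively, offloading an op whose weights sit in system RAM pays a
fixed transfer cost for the whole weight tensor, while compute cost
grows with the number of tokens. A back-of-envelope C++ sketch with
made-up numbers (the bandwidth and per-token costs below are
illustrative assumptions, not measurements from this change):

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Hypothetical cost model: sending the op to the GPU pays a one-time
// PCIe transfer of the weights; the CPU cost scales with batch size.
static bool offload_wins(int64_t weight_bytes, int64_t batch_size) {
    const double pcie_bytes_per_ms = 16e6; // assume ~16 GB/s host-to-device
    const double cpu_ms_per_token  = 0.5;  // assumed CPU cost per token
    const double gpu_ms_per_token  = 0.02; // assumed GPU cost per token

    const double cpu_ms = cpu_ms_per_token * (double) batch_size;
    const double gpu_ms = (double) weight_bytes / pcie_bytes_per_ms +
                          gpu_ms_per_token * (double) batch_size;
    return gpu_ms < cpu_ms;
}

int main() {
    const int64_t weights = 256ll * 1024 * 1024; // a 256 MiB weight tensor
    for (int64_t batch : {1, 8, 32, 512}) {
        printf("batch %4lld -> %s\n", (long long) batch,
               offload_wins(weights, batch) ? "GPU" : "CPU");
    }
    return 0;
}

With these numbers the crossover lands around a few dozen tokens:
single-token generation stays on the CPU, while large prompt
processing batches are worth the transfer.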

The llamarunner used the GPU in most cases and the ollamarunner
used the CPU. As a result, the ollamarunner saw an improvement in
token generation performance but took a large (3-10x) hit in
prompt processing.

There is an existing heuristic to dynamically switch between these
two modes, but in practice it doesn't have enough information to
make that decision accurately. This passes authoritative batch size
data down to the check so that it can get the best of both worlds.
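
For illustration, a sketch of the shape of that check, not the
literal patch: upstream ggml guesses the batch size from an op's
second dimension, and this change threads an explicit hint through
the scheduler instead. The threshold of 32 matches the minimum batch
size the upstream CUDA backend uses for offload; the function below
is a simplified stand-in.

#include <cstdint>

// Sketch only: decide whether a CPU-resident op is worth offloading.
// batch_size_hint is the authoritative value supplied by the runner;
// -1 means "no hint", falling back to the shape-based heuristic.
static bool should_offload_to_gpu(int64_t op_ne1, int batch_size_hint) {
    const int64_t min_batch_size = 32; // threshold used upstream
    if (batch_size_hint >= 0) {
        return batch_size_hint >= min_batch_size; // authoritative path
    }
    return op_ne1 >= min_batch_size; // guess from the tensor shape
}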

Fixes #12037
Author:    Jesse Gross
Date:      2025-10-27 16:32:05 -07:00
Committer: Jesse Gross
Parent:    26465fb85f
Commit:    afaf7ce8c3

15 changed files with 405 additions and 128 deletions


@@ -16,7 +16,7 @@ must be recreated with no-alloc set to false before loading data.
5 files changed, 310 insertions(+), 44 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 229bf387..1ff53ed0 100644
+index 2763f2bd6..b3b5b356a 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -305,6 +305,7 @@ extern "C" {
@@ -26,9 +26,9 @@ index 229bf387..1ff53ed0 100644
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
// Provide a hint on the batch size to optimize processing (uses heuristics if unset)
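
For context, a hypothetical caller of the extended API in this hunk
might look like the following. Only ggml_backend_sched_new_ext and
the batch size hint comment are visible above; the setter name
ggml_backend_sched_set_batch_size is inferred from that comment, and
the surrounding setup details are assumptions.

#include "ggml-backend.h"

// Assumed usage sketch: create a scheduler with op_offload enabled,
// then pass the real batch size so the offload check has
// authoritative data instead of a heuristic.
static ggml_backend_sched_t make_sched(ggml_backend_t * backends,
                                       ggml_backend_buffer_type_t * bufts,
                                       int n_backends, int n_tokens) {
    ggml_backend_sched_t sched = ggml_backend_sched_new_ext(
        backends, bufts, n_backends,
        GGML_DEFAULT_GRAPH_SIZE,
        /*parallel=*/false,
        /*op_offload=*/true,     // allow moving CPU-resident ops to the GPU
        /*alloc_buffers=*/true); // allocate compute buffers eagerly

    ggml_backend_sched_set_batch_size(sched, n_tokens); // assumed setter; -1 keeps heuristics
    return sched;
}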
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
-index 6792ba98..3c3f22fc 100644
+index 0f5b03cef..7bdf9d81f 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" {
@@ -75,7 +75,7 @@ index 6792ba98..3c3f22fc 100644
struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index cb2b9956..6ef5eeaf 100644
+index 41eef3b5f..c81a2e48a 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
@@ -121,8 +121,8 @@ index cb2b9956..6ef5eeaf 100644
void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
-@@ -723,6 +743,12 @@ struct ggml_backend_sched {
+@@ -725,6 +745,12 @@ struct ggml_backend_sched {
     bool op_offload;
+    int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics
     int debug;
+
@@ -134,7 +134,7 @@ index cb2b9956..6ef5eeaf 100644
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
-@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
+@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel,
bool op_offload) {
@@ -152,7 +152,7 @@ index cb2b9956..6ef5eeaf 100644
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
-@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
+@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
}
}
@@ -162,11 +162,12 @@ index cb2b9956..6ef5eeaf 100644
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
sched->batch_size = -1;
+ sched->alloc_buffers = alloc_buffers;
ggml_backend_sched_reset(sched);
-@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
+@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
@@ -177,7 +178,7 @@ index cb2b9956..6ef5eeaf 100644
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
-@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
+@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false;
}
@@ -202,7 +203,7 @@ index cb2b9956..6ef5eeaf 100644
ggml_backend_sched_reset(sched);
return true;
-@@ -1813,7 +1875,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
+@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@@ -218,7 +219,7 @@ index cb2b9956..6ef5eeaf 100644
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
-index e0abde54..28d6bcd7 100644
+index e0abde542..28d6bcd71 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@
@@ -299,7 +300,7 @@ index e0abde54..28d6bcd7 100644
+ }
};
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 827e3205..811462c7 100644
+index f4d4a4267..ac70dcac8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
@@ -540,7 +541,7 @@ index 827e3205..811462c7 100644
};
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-@@ -3011,6 +3073,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
+@@ -3003,6 +3065,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@@ -548,7 +549,7 @@ index 827e3205..811462c7 100644
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
-@@ -3026,6 +3089,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+@@ -3018,6 +3081,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}
@@ -560,15 +561,15 @@ index 827e3205..811462c7 100644
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
-@@ -3152,6 +3220,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+@@ -3144,6 +3212,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+ cuda_ctx->pool_set_alloc(true);
ggml_cuda_set_device(cuda_ctx->device);
-@@ -3231,6 +3300,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+@@ -3223,6 +3292,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
}
@@ -640,7 +641,7 @@ index 827e3205..811462c7 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-@@ -3271,6 +3405,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
+@@ -3263,6 +3397,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,