ggml: Enable op_offload to improve partial offload performance

When a model is partially offloaded to system RAM, we can either
do the calculations on the CPU or we can temporarily transfer the
data to the GPU to do the calculations there. Small batches tend
to be better on the CPU, large batches on the GPU.
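Intuitively, offloading an op whose weights sit in system RAM pays a
fixed transfer cost for the whole weight tensor, while compute cost
grows with the number of tokens. A back-of-envelope C++ sketch with
made-up numbers (the bandwidth and per-token costs below are
illustrative assumptions, not measurements from this change):

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Hypothetical cost model: sending the op to the GPU pays a one-time
// PCIe transfer of the weights; the CPU cost scales with batch size.
static bool offload_wins(int64_t weight_bytes, int64_t batch_size) {
    const double pcie_bytes_per_ms = 16e6; // assume ~16 GB/s host-to-device
    const double cpu_ms_per_token  = 0.5;  // assumed CPU cost per token
    const double gpu_ms_per_token  = 0.02; // assumed GPU cost per token

    const double cpu_ms = cpu_ms_per_token * (double) batch_size;
    const double gpu_ms = (double) weight_bytes / pcie_bytes_per_ms +
                          gpu_ms_per_token * (double) batch_size;
    return gpu_ms < cpu_ms;
}

int main() {
    const int64_t weights = 256ll * 1024 * 1024; // a 256 MiB weight tensor
    for (int64_t batch : {1, 8, 32, 512}) {
        printf("batch %4lld -> %s\n", (long long) batch,
               offload_wins(weights, batch) ? "GPU" : "CPU");
    }
    return 0;
}

With these numbers the crossover lands around a few dozen tokens:
single-token generation stays on the CPU, while large prompt
processing batches are worth the transfer.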

The llamarunner used the GPU in most cases and the ollamarunner
used the CPU. As a result, the ollamarunner saw an improvement in
token generation performance but took a large (3-10x) hit in
prompt processing.

There is an existing heuristic to dynamically switch between these
two modes, but in practice it doesn't have enough information to
make that decision accurately. This passes authoritative batch size
data down to the check so that it can get the best of both worlds.
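
For illustration, a sketch of the shape of that check, not the
literal patch: upstream ggml guesses the batch size from an op's
second dimension, and this change threads an explicit hint through
the scheduler instead. The threshold of 32 matches the minimum batch
size the upstream CUDA backend uses for offload; the function below
is a simplified stand-in.

#include <cstdint>

// Sketch only: decide whether a CPU-resident op is worth offloading.
// batch_size_hint is the authoritative value supplied by the runner;
// -1 means "no hint", falling back to the shape-based heuristic.
static bool should_offload_to_gpu(int64_t op_ne1, int batch_size_hint) {
    const int64_t min_batch_size = 32; // threshold used upstream
    if (batch_size_hint >= 0) {
        return batch_size_hint >= min_batch_size; // authoritative path
    }
    return op_ne1 >= min_batch_size; // guess from the tensor shape
}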

Fixes #12037
Author:    Jesse Gross
Date:      2025-10-27 16:32:05 -07:00
Committer: Jesse Gross
Parent:    26465fb85f
Commit:    afaf7ce8c3

15 changed files with 405 additions and 128 deletions


@@ -16,7 +16,7 @@ must be recreated with no-alloc set to false before loading data.
5 files changed, 310 insertions(+), 44 deletions(-)
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 229bf387..1ff53ed0 100644
+index 2763f2bd6..b3b5b356a 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -305,6 +305,7 @@ extern "C" {
@@ -26,9 +26,9 @@ index 229bf387..1ff53ed0 100644
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
// Provide a hint on the batch size to optimize processing (uses heuristics if unset)
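
For context, a hypothetical caller of the extended API in this hunk
might look like the following. Only ggml_backend_sched_new_ext and
the batch size hint comment are visible above; the setter name
ggml_backend_sched_set_batch_size is inferred from that comment, and
the surrounding setup details are assumptions.

#include "ggml-backend.h"

// Assumed usage sketch: create a scheduler with op_offload enabled,
// then pass the real batch size so the offload check has
// authoritative data instead of a heuristic.
static ggml_backend_sched_t make_sched(ggml_backend_t * backends,
                                       ggml_backend_buffer_type_t * bufts,
                                       int n_backends, int n_tokens) {
    ggml_backend_sched_t sched = ggml_backend_sched_new_ext(
        backends, bufts, n_backends,
        GGML_DEFAULT_GRAPH_SIZE,
        /*parallel=*/false,
        /*op_offload=*/true,     // allow moving CPU-resident ops to the GPU
        /*alloc_buffers=*/true); // allocate compute buffers eagerly

    ggml_backend_sched_set_batch_size(sched, n_tokens); // assumed setter; -1 keeps heuristics
    return sched;
}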
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
-index 6792ba98..3c3f22fc 100644
+index 0f5b03cef..7bdf9d81f 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" {
@@ -75,7 +75,7 @@ index 6792ba98..3c3f22fc 100644
struct ggml_backend {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
-index cb2b9956..6ef5eeaf 100644
+index 41eef3b5f..c81a2e48a 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
@@ -121,8 +121,8 @@ index cb2b9956..6ef5eeaf 100644
void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
-@@ -723,6 +743,12 @@ struct ggml_backend_sched {
+@@ -725,6 +745,12 @@ struct ggml_backend_sched {
     bool op_offload;
+    int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics
     int debug;
+
@@ -134,7 +134,7 @@ index cb2b9956..6ef5eeaf 100644
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
-@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
+@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
size_t graph_size,
bool parallel,
bool op_offload) {
@@ -152,7 +152,7 @@ index cb2b9956..6ef5eeaf 100644
GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
-@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
+@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
}
}
@@ -162,11 +162,12 @@ index cb2b9956..6ef5eeaf 100644
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
sched->op_offload = op_offload;
sched->batch_size = -1;
+ sched->alloc_buffers = alloc_buffers;
ggml_backend_sched_reset(sched);
-@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
+@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
for (int c = 0; c < sched->n_copies; c++) {
ggml_backend_event_free(sched->events[b][c]);
}
@@ -177,7 +178,7 @@ index cb2b9956..6ef5eeaf 100644
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
-@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
+@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
return false;
}
@@ -202,7 +203,7 @@ index cb2b9956..6ef5eeaf 100644
ggml_backend_sched_reset(sched);
return true;
-@@ -1813,7 +1875,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
+@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
@@ -218,7 +219,7 @@ index cb2b9956..6ef5eeaf 100644
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
-index e0abde54..28d6bcd7 100644
+index e0abde542..28d6bcd71 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@
@@ -299,7 +300,7 @@ index e0abde54..28d6bcd7 100644
+ }
};
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 827e3205..811462c7 100644
+index f4d4a4267..ac70dcac8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
@@ -540,7 +541,7 @@ index 827e3205..811462c7 100644
};
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-@@ -3011,6 +3073,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
+@@ -3003,6 +3065,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
@@ -548,7 +549,7 @@ index 827e3205..811462c7 100644
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
-@@ -3026,6 +3089,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+@@ -3018,6 +3081,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}
@@ -560,15 +561,15 @@ index 827e3205..811462c7 100644
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
-@@ -3152,6 +3220,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
+@@ -3144,6 +3212,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+ cuda_ctx->pool_set_alloc(true);
ggml_cuda_set_device(cuda_ctx->device);
-@@ -3231,6 +3300,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
+@@ -3223,6 +3292,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
return GGML_STATUS_SUCCESS;
}
@@ -640,7 +641,7 @@ index 827e3205..811462c7 100644
static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-@@ -3271,6 +3405,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
+@@ -3263,6 +3397,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,