mirror of
https://github.com/ollama/ollama.git
synced 2025-11-10 19:18:06 +01:00
On the llama runner, after the recent GGML bump, a new log line incorrectly reports 0 MiB free as a result of our patch that removes memory from the device props. This change adjusts the llama.cpp code to fetch the actual free memory of the active device.
142 lines
6.7 KiB
Diff
142 lines
6.7 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Jesse Gross <jesse@ollama.com>
|
|
Date: Wed, 27 Aug 2025 14:39:48 -0700
|
|
Subject: [PATCH] ggml: Enable resetting backend devices
|
|
|
|
Touching a CUDA device causes the allocation of a primary context
|
|
with CUDA data structures (~300 MB of VRAM). If a device is
|
|
unused then it can be reset to free these data structures.
|
|
---
|
|
ggml/include/ggml-backend.h | 1 +
|
|
ggml/src/ggml-backend-impl.h | 4 ++++
|
|
ggml/src/ggml-backend.cpp | 8 ++++++++
|
|
ggml/src/ggml-cuda/ggml-cuda.cu | 16 +++++++++++++++-
|
|
ggml/src/ggml-cuda/vendors/hip.h | 1 +
|
|
src/llama.cpp | 4 +++-
|
|
6 files changed, 32 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
|
index 1ff53ed03..ba181d09d 100644
|
|
--- a/ggml/include/ggml-backend.h
|
|
+++ b/ggml/include/ggml-backend.h
|
|
@@ -178,6 +178,7 @@ extern "C" {
|
|
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
|
|
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
|
|
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
|
|
+ GGML_API void ggml_backend_dev_reset(ggml_backend_dev_t device);
|
|
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
|
|
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
|
|
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
|
|
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
|
|
index 3c3f22fc0..43c91d9f2 100644
|
|
--- a/ggml/src/ggml-backend-impl.h
|
|
+++ b/ggml/src/ggml-backend-impl.h
|
|
@@ -195,6 +195,10 @@ extern "C" {
|
|
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
|
|
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
|
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
|
+
|
|
+ // (optional) reset device, clearing existing allocations and context
|
|
+ // the caller must ensure that there are no outstanding buffers, as these will become invalid
|
|
+ void (*reset)(ggml_backend_dev_t dev);
|
|
};
|
|
|
|
struct ggml_backend_device {
|
|
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
|
index 6ef5eeafa..0b757af59 100644
|
|
--- a/ggml/src/ggml-backend.cpp
|
|
+++ b/ggml/src/ggml-backend.cpp
|
|
@@ -526,6 +526,14 @@ ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * par
|
|
return device->iface.init_backend(device, params);
|
|
}
|
|
|
|
+void ggml_backend_dev_reset(ggml_backend_dev_t device) {
|
|
+ if (device->iface.reset == NULL) {
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ device->iface.reset(device);
|
|
+}
|
|
+
|
|
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
|
|
GGML_ASSERT(device);
|
|
return device->iface.get_buffer_type(device);
|
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
index 811462c79..87c6c34a4 100644
|
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
@@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
|
|
return id;
|
|
}
|
|
|
|
+void ggml_cuda_reset_device(int device) {
|
|
+ ggml_cuda_set_device(device);
|
|
+ CUDA_CHECK(cudaDeviceReset());
|
|
+}
|
|
+
|
|
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
|
ggml_cuda_set_device(device);
|
|
cudaError_t err;
|
|
@@ -3515,7 +3520,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
|
props->id = ggml_backend_cuda_device_get_id(dev);
|
|
props->type = ggml_backend_cuda_device_get_type(dev);
|
|
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
|
- ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
|
+
|
|
+ // Memory reporting is disabled to avoid allocation of a CUDA primary context (~300 MB per device).
|
|
+ // If you need the memory data, call ggml_backend_dev_memory() explicitly.
|
|
+ props->memory_total = props->memory_free = 0;
|
|
|
|
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
|
#ifdef GGML_CUDA_NO_PEER_COPY
|
|
@@ -3948,6 +3956,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
|
|
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
|
}
|
|
|
|
+static void ggml_backend_cuda_device_reset(ggml_backend_dev_t dev) {
|
|
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
|
+ ggml_cuda_reset_device(ctx->device);
|
|
+}
|
|
+
|
|
static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
|
/* .get_name = */ ggml_backend_cuda_device_get_name,
|
|
/* .get_description = */ ggml_backend_cuda_device_get_description,
|
|
@@ -3964,6 +3977,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
|
|
/* .event_new = */ ggml_backend_cuda_device_event_new,
|
|
/* .event_free = */ ggml_backend_cuda_device_event_free,
|
|
/* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
|
|
+ /* .reset = */ ggml_backend_cuda_device_reset,
|
|
};
|
|
|
|
// backend reg
|
|
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
|
|
index 890c10364..1f06be80e 100644
|
|
--- a/ggml/src/ggml-cuda/vendors/hip.h
|
|
+++ b/ggml/src/ggml-cuda/vendors/hip.h
|
|
@@ -45,6 +45,7 @@
|
|
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
|
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
|
#define cudaDeviceProp hipDeviceProp_t
|
|
+#define cudaDeviceReset hipDeviceReset
|
|
#define cudaDeviceSynchronize hipDeviceSynchronize
|
|
#define cudaError_t hipError_t
|
|
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
|
index fe5a7a835..d821a96a0 100644
|
|
--- a/src/llama.cpp
|
|
+++ b/src/llama.cpp
|
|
@@ -267,10 +267,12 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|
for (auto * dev : model->devices) {
|
|
ggml_backend_dev_props props;
|
|
ggml_backend_dev_get_props(dev, &props);
|
|
+ size_t memory_free, memory_total;
|
|
+ ggml_backend_dev_memory(dev, &memory_free, &memory_total);
|
|
LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
|
|
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
|
props.device_id ? props.device_id : "unknown id",
|
|
- props.memory_free/1024/1024);
|
|
+ memory_free/1024/1024);
|
|
}
|
|
|
|
const int status = llama_model_load(path_model, splits, *model, params);
|