mirror of
https://github.com/ollama/ollama.git
synced 2025-11-10 22:07:45 +01:00
* feat: Bump llama.cpp to df1b612 Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix(mtmd): Correctly encode text chunks during mtmd tokenization There can be text chunks that appear interspersed with the image embeddings that contain template delimiter tokens for some models. These need to be correctly translated to text tokens. Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * tests: Use MtmdChunk in image_test Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * style: Fix unnecessary conversion linting Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix(ggml): Revert changes to ggml_hip.cpp These changes were done largely by our code assistant and are likely wrong Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Revert changes in mem_nvml.cpp Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Update sync point to 1deee0 This brings in several more optimization commits and model support for EmbeddingGemma Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Update patches for 1deee0 Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: sync for bump to 1deee0 Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Bad patch updates with errant `+` Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Bump llama.cpp/ggml to 7049736 Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: format-patches after latest bump Branch: LlamaCPPBump-GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
161 lines
7.4 KiB
Diff
161 lines
7.4 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Jesse Gross <jesse@ollama.com>
|
|
Date: Thu, 24 Apr 2025 14:48:51 -0700
|
|
Subject: [PATCH] ggml: Export GPU UUIDs
|
|
|
|
This enables matching up devices and information reported by the backend
|
|
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
|
|
---
|
|
ggml/include/ggml-backend.h | 1 +
|
|
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++---
|
|
ggml/src/ggml-metal/ggml-metal.cpp | 1 +
|
|
3 files changed, 63 insertions(+), 6 deletions(-)
|
|
|
|
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
|
index c54ff98b..229bf387 100644
|
|
--- a/ggml/include/ggml-backend.h
|
|
+++ b/ggml/include/ggml-backend.h
|
|
@@ -158,6 +158,7 @@ extern "C" {
|
|
const char * description;
|
|
// device free memory in bytes
|
|
size_t memory_free;
|
|
+ const char * id;
|
|
// device total memory in bytes
|
|
size_t memory_total;
|
|
// device type
|
|
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
index c0b1e4c1..5b852f69 100644
|
|
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
@@ -183,6 +183,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
|
}
|
|
#endif // defined(GGML_USE_HIP)
|
|
|
|
+static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) { // builds a printable device ID from prop.uuid; some paths use device_num instead
|
|
+ char id[64]; // scratch buffer; the longest format below is "GPU-" plus 36 characters
|
|
+
|
|
+#if !defined(GGML_USE_HIP) // non-HIP builds: print the 16 uuid bytes as hex
|
|
+ snprintf(id, sizeof(id),
|
|
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", // 8-4-4-4-12 hex digit groups
|
|
+ (unsigned char)prop.uuid.bytes[0], // cast so each byte prints as exactly two hex digits (presumably bytes is plain char - confirm)
|
|
+ (unsigned char)prop.uuid.bytes[1],
|
|
+ (unsigned char)prop.uuid.bytes[2],
|
|
+ (unsigned char)prop.uuid.bytes[3],
|
|
+ (unsigned char)prop.uuid.bytes[4],
|
|
+ (unsigned char)prop.uuid.bytes[5],
|
|
+ (unsigned char)prop.uuid.bytes[6],
|
|
+ (unsigned char)prop.uuid.bytes[7],
|
|
+ (unsigned char)prop.uuid.bytes[8],
|
|
+ (unsigned char)prop.uuid.bytes[9],
|
|
+ (unsigned char)prop.uuid.bytes[10],
|
|
+ (unsigned char)prop.uuid.bytes[11],
|
|
+ (unsigned char)prop.uuid.bytes[12],
|
|
+ (unsigned char)prop.uuid.bytes[13],
|
|
+ (unsigned char)prop.uuid.bytes[14],
|
|
+ (unsigned char)prop.uuid.bytes[15]
|
|
+ );
|
|
+#else // GGML_USE_HIP
|
|
+#ifdef _WIN32 // HIP on Windows: the ID is just the decimal device index
|
|
+ snprintf(id, sizeof(id), "%d", device_num);
|
|
+#else // HIP on other platforms: parse the uuid bytes as hexadecimal text
|
|
+ try {
|
|
+ std::string uuid = std::string(prop.uuid.bytes, 16); // NOTE(review): assumes the runtime stores hex text in these 16 bytes - confirm
|
|
+
|
|
+ size_t pos = 0;
|
|
+ unsigned long long v = stoull(uuid, &pos, 16); // base-16 parse; pos receives the number of characters consumed
|
|
+ if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-')) // reject a zero value, a partial parse, or a leading '-'
|
|
+ throw std::invalid_argument("invalid uuid");
|
|
+
|
|
+ snprintf(id, sizeof(id), "GPU-%016llx", v); // "GPU-" plus 16 zero-padded lowercase hex digits
|
|
+ } catch (const std::exception &e) { // covers the throw above and exceptions raised by stoull itself
|
|
+ snprintf(id, sizeof(id), "%d", device_num); // fall back to the decimal device index
|
|
+ }
|
|
+#endif // _WIN32
|
|
+#endif // !defined(GGML_USE_HIP)
|
|
+
|
|
+ return id; // the char buffer is copied into the returned std::string
|
|
+}
|
|
+
|
|
static ggml_cuda_device_info ggml_cuda_init() {
|
|
ggml_cuda_device_info info = {};
|
|
|
|
@@ -249,22 +294,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
|
info.devices[id].cc += prop.minor * 0x10;
|
|
}
|
|
}
|
|
- GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
|
|
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n",
|
|
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
|
|
- device_vmm ? "yes" : "no", prop.warpSize);
|
|
+ device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str());
|
|
#elif defined(GGML_USE_MUSA)
|
|
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
|
|
info.devices[id].warp_size = 32;
|
|
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
|
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
|
|
info.devices[id].cc += prop.minor * 0x10;
|
|
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
|
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
|
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
|
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
|
+ ggml_cuda_parse_uuid(prop, id).c_str());
|
|
#else
|
|
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
|
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
|
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
|
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
|
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
|
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
|
+ ggml_cuda_parse_uuid(prop, id).c_str());
|
|
std::string device_name(prop.name);
|
|
if (device_name == "NVIDIA GeForce MX450") {
|
|
turing_devices_without_mma.push_back({ id, device_name });
|
|
@@ -3276,6 +3323,7 @@ struct ggml_backend_cuda_device_context {
|
|
std::string name;
|
|
std::string description;
|
|
std::string pci_bus_id;
|
|
+ std::string id;
|
|
};
|
|
|
|
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
|
@@ -3288,6 +3336,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
|
return ctx->description.c_str();
|
|
}
|
|
|
|
+static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { // returns the ID string stored in the device's context
|
|
+ ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; // dev->context is cast to the CUDA device context type
|
|
+ return ctx->id.c_str(); // points into ctx->id, so it is only valid while that string is alive and unmodified
|
|
+}
|
|
+
|
|
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
|
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
|
ggml_cuda_set_device(ctx->device);
|
|
@@ -3304,6 +3357,7 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
|
|
|
|
props->name = ggml_backend_cuda_device_get_name(dev);
|
|
props->description = ggml_backend_cuda_device_get_description(dev);
|
|
+ props->id = ggml_backend_cuda_device_get_id(dev);
|
|
props->type = ggml_backend_cuda_device_get_type(dev);
|
|
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
|
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
|
@@ -3873,6 +3927,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
|
cudaDeviceProp prop;
|
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
|
dev_ctx->description = prop.name;
|
|
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
|
|
|
|
char pci_bus_id[16] = {};
|
|
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
|
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
|
|
index bf096227..f2ff9f32 100644
|
|
--- a/ggml/src/ggml-metal/ggml-metal.cpp
|
|
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
|
|
@@ -538,6 +538,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
|
static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
|
props->name = ggml_backend_metal_device_get_name(dev);
|
|
props->description = ggml_backend_metal_device_get_description(dev);
|
|
+ props->id = "0";
|
|
props->type = ggml_backend_metal_device_get_type(dev);
|
|
|
|
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
|