Adapted rocm support to cgo based llama.cpp

2025-09-30 19:43:30 +02:00 · 2023-11-29 11:00:37 -08:00
parent f8ef4439e9
commit 35934b2e05
37 changed files with 1688 additions and 658 deletions
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -0,0 +1,119 @@
+//go:build linux || windows
+
+package gpu
+
+/*
+#include "gpu_info.h"
+
+*/
+import "C"
+import (
+	"fmt"
+	"log"
+	"sync"
+	"unsafe"
+
+	"github.com/jmorganca/ollama/api"
+)
+
+// handles holds the dynamically loaded GPU management library handles.
+// initGPUHandles populates at most one of the two fields; both remain nil
+// when neither library could be loaded.
+type handles struct {
+	cuda *C.cuda_handle_t
+	rocm *C.rocm_handle_t
+}
+
+var (
+	// gpuMutex is held by GetGPUInfo while it lazily initializes and then
+	// queries gpuHandles.
+	gpuMutex sync.Mutex
+	// gpuHandles stays nil until the first call to initGPUHandles.
+	gpuHandles *handles
+)
+
+// initGPUHandles probes for a GPU management library and records the result
+// in gpuHandles. CUDA is tried first; ROCm is only tried when CUDA fails to
+// initialize. When both fail, gpuHandles is left with both fields nil.
+//
+// Note: gpuMutex must already be held
+func initGPUHandles() {
+	log.Printf("Detecting GPU type")
+	gpuHandles = &handles{nil, nil}
+
+	var cudaResp C.cuda_init_resp_t
+	C.cuda_init(&cudaResp)
+	if cudaResp.err == nil {
+		log.Printf("Nvidia GPU detected")
+		ch := cudaResp.ch
+		gpuHandles.cuda = &ch
+		return
+	}
+	// The error string is allocated on the C side, so release it with C.free.
+	log.Printf("CUDA not detected: %s", C.GoString(cudaResp.err))
+	C.free(unsafe.Pointer(cudaResp.err))
+
+	var rocmResp C.rocm_init_resp_t
+	C.rocm_init(&rocmResp)
+	if rocmResp.err != nil {
+		log.Printf("ROCm not detected: %s", C.GoString(rocmResp.err))
+		C.free(unsafe.Pointer(rocmResp.err))
+		return
+	}
+	log.Printf("Radeon GPU detected")
+	rh := rocmResp.rh
+	gpuHandles.rocm = &rh
+}
+
+// GetGPUInfo reports the detected driver ("CUDA", "ROCM" or "CPU") together
+// with the total and free memory obtained from the matching C helper. The
+// handles are initialized on first use under gpuMutex. A lookup error is
+// logged and the memory fields are returned as whatever the helper filled in.
+func GetGPUInfo() GpuInfo {
+	// TODO - consider exploring lspci (and equivalent on windows) to check for
+	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
+	gpuMutex.Lock()
+	defer gpuMutex.Unlock()
+	if gpuHandles == nil {
+		initGPUHandles()
+	}
+
+	var (
+		mem  C.mem_info_t
+		info GpuInfo
+	)
+	switch {
+	case gpuHandles.cuda != nil:
+		C.cuda_check_vram(*gpuHandles.cuda, &mem)
+		info.Driver = "CUDA"
+	case gpuHandles.rocm != nil:
+		C.rocm_check_vram(*gpuHandles.rocm, &mem)
+		info.Driver = "ROCM"
+	default:
+		C.cpu_check_ram(&mem)
+		info.Driver = "CPU"
+	}
+	if mem.err != nil {
+		// The message is owned by C; free it once it has been logged.
+		log.Printf("error looking up GPU memory: %s", C.GoString(mem.err))
+		C.free(unsafe.Pointer(mem.err))
+	}
+	info.FreeMemory = uint64(mem.free)
+	info.TotalMemory = uint64(mem.total)
+	return info
+}
+
+// CheckVRAM returns the free memory reported by GetGPUInfo when a non-CPU
+// driver was detected and that figure is non-zero; otherwise it returns an
+// error.
+func CheckVRAM() (int64, error) {
+	info := GetGPUInfo()
+	if info.Driver == "CPU" || info.FreeMemory == 0 {
+		// TODO - better handling of CPU based memory determination
+		return 0, fmt.Errorf("no GPU detected")
+	}
+	return int64(info.FreeMemory), nil
+}
+
+// NumGPU returns how many model layers to offload to the GPU.
+//
+// An explicit opts.NumGPU (anything other than -1) is returned unchanged.
+// Otherwise the result is 0 when GetGPUInfo reports the CPU driver, and else
+// 75% of the number of layers that fit in the reported free memory, where a
+// layer is estimated as fileSizeBytes / numLayer. Inputs that would make that
+// estimate zero or undefined yield 0 instead of a division-by-zero panic.
+func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
+	if opts.NumGPU != -1 {
+		return opts.NumGPU
+	}
+	info := GetGPUInfo()
+	if info.Driver == "CPU" {
+		return 0
+	}
+
+	// Both divisions below need a non-zero divisor: numLayer itself, and the
+	// per-layer size, which truncates to 0 when the file is smaller than the
+	// layer count.
+	if numLayer <= 0 || fileSizeBytes < numLayer {
+		log.Printf("invalid model dimensions (%d layers, %d bytes), not offloading to GPU", numLayer, fileSizeBytes)
+		return 0
+	}
+
+	/*
+		Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
+		We can store the model weights and the kv cache in vram,
+		to enable kv cache vram storage add two additional layers to the number of layers retrieved from the model file.
+	*/
+	bytesPerLayer := uint64(fileSizeBytes / numLayer)
+
+	// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
+	layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
+
+	// TODO - not sure on this part... if we can't fit all the layers, just fallback to CPU
+	// if int64(layers) < numLayer {
+	// 	log.Printf("%d MB VRAM available, insufficient to load current model (reuires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024))
+	// 	return 0
+	// }
+	log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", info.FreeMemory/(1024*1024), layers, numLayer)
+
+	return layers
+}
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -0,0 +1,34 @@
+//go:build darwin
+
+package gpu
+
+import "C"
+import (
+	"github.com/jmorganca/ollama/api"
+)
+
+// CheckVRAM is the macOS variant: it does not query any device and always
+// returns 0 with a nil error.
+func CheckVRAM() (int64, error) {
+	// TODO - assume metal, and return free memory?
+	return 0, nil
+
+}
+
+// GetGPUInfo is the macOS variant: it always reports the "METAL" driver and
+// leaves both memory figures at zero.
+func GetGPUInfo() GpuInfo {
+	// TODO - Metal vs. x86 macs...
+
+	// TotalMemory and FreeMemory are intentionally left at their zero values.
+	return GpuInfo{Driver: "METAL"}
+}
+
+// NumGPU is the macOS variant: it ignores all of its arguments and always
+// returns 1.
+func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
+	// default to enable metal on macOS
+	return 1
+}
+
+// nativeInit does nothing on macOS and always returns nil.
+func nativeInit() error {
+	return nil
+}
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -0,0 +1,49 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_H__
+#define __GPU_INFO_H__
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Portable wrappers around the platform's dynamic loader:
+//   LOAD_LIBRARY(lib, flags) - open a shared library
+//   LOAD_SYMBOL(handle, sym) - resolve a symbol in an opened library
+//   LOAD_ERR()               - text describing the most recent loader error
+//   UNLOAD_LIBRARY(handle)   - close the library
+#ifndef _WIN32
+#include <dlfcn.h>
+#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
+#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
+#define LOAD_ERR() dlerror()
+#define UNLOAD_LIBRARY(handle) dlclose(handle)
+#else
+#include <windows.h>
+// The flags argument is accepted for source compatibility and discarded.
+#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
+#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
+#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
+
+// TODO - refactor this with proper error message handling on windows
+// Formats GetLastError() as a hex string. The result lives in a static buffer
+// that is overwritten by the next call.
+inline static char *LOAD_ERR() {
+  // "0x" + up to 8 hex digits + NUL needs 11 bytes; an 8-byte buffer would
+  // truncate any code above 0xfffff.
+  static char errbuf[16];
+  snprintf(errbuf, sizeof(errbuf), "0x%lx", GetLastError());
+  return errbuf;
+}
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Result of a memory query, filled in by cpu_check_ram and the
+// cuda/rocm *_check_vram helpers.
+typedef struct mem_info {
+  uint64_t total;
+  uint64_t free;
+  char *err;  // If non-NULL, caller responsible for freeing
+} mem_info_t;
+
+// Fills resp with system RAM figures; used when no GPU library is available.
+void cpu_check_ram(mem_info_t *resp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#include "gpu_info_cuda.h"
+#include "gpu_info_rocm.h"
+
+#endif  // __GPU_INFO_H__
+#endif  // __APPLE__
--- a/gpu/gpu_info_cpu.c
+++ b/gpu/gpu_info_cpu.c
@@ -0,0 +1,42 @@
+#include "gpu_info.h"
+// Fallbacks for CPU mode
+
+#ifdef _WIN32
+#include <sysinfoapi.h>
+// Windows implementation: reports physical RAM via GlobalMemoryStatusEx.
+// On failure resp->err is set to a strdup'd error code string (caller frees)
+// and total/free are left at 0.
+void cpu_check_ram(mem_info_t *resp) {
+  resp->err = NULL;
+  resp->total = 0;
+  resp->free = 0;
+  MEMORYSTATUSEX info;
+  // GlobalMemoryStatusEx requires dwLength to be set to the structure size
+  // before the call; without it the call fails.
+  info.dwLength = sizeof(info);
+  if (GlobalMemoryStatusEx(&info) != 0) {
+    resp->total = info.ullTotalPhys;
+    resp->free = info.ullAvailPhys;
+  } else {
+    resp->err = strdup(LOAD_ERR());
+  }
+  return;
+}
+
+#elif __linux__
+#include <errno.h>
+#include <string.h>
+#include <sys/sysinfo.h>
+// Linux implementation: reports RAM via sysinfo(2). On failure resp->err is
+// set to a strdup'd strerror message (caller frees) and total/free are left
+// at 0.
+void cpu_check_ram(mem_info_t *resp) {
+  struct sysinfo info;
+  resp->err = NULL;
+  resp->total = 0;
+  resp->free = 0;
+  if (sysinfo(&info) != 0) {
+    resp->err = strdup(strerror(errno));
+  } else {
+    // totalram/freeram are counted in units of mem_unit bytes. Widen before
+    // multiplying so the product cannot wrap where unsigned long is 32 bits.
+    resp->total = (uint64_t)info.totalram * info.mem_unit;
+    resp->free = (uint64_t)info.freeram * info.mem_unit;
+  }
+  return;
+}
+
+#elif __APPLE__
+// TODO consider an Apple implementation that does something useful
+// mem_info_t cpu_check_ram() {
+//   mem_info_t resp = {0, 0, NULL};
+//   return resp;
+// }
+#else
+#error "Unsupported platform"
+#endif
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -0,0 +1,110 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include "gpu_info_cuda.h"
+
+#include <string.h>
+
+#ifndef _WIN32
+// NULL-terminated list of NVML library names/paths. cuda_init tries them in
+// order and stops at the first one that loads.
+const char *cuda_lib_paths[] = {
+    "libnvidia-ml.so",
+    "/usr/local/cuda/lib64/libnvidia-ml.so",
+    NULL,
+};
+#else
+// Windows candidates, same NULL-terminated convention.
+// NOTE(review): the empty-string entry looks like a placeholder for a second
+// location - confirm whether a real path was intended.
+const char *cuda_lib_paths[] = {
+    "nvml.dll",
+    "",
+    NULL,
+};
+#endif
+
+// cuda_init loads the NVML library from cuda_lib_paths and resolves the four
+// entry points stored in resp->ch.
+//
+// On success resp->err is NULL and resp->ch is ready for cuda_check_vram.
+// On failure resp->err is a strdup'd message the caller must free, and
+// resp->ch.handle is NULL.
+void cuda_init(cuda_init_resp_t *resp) {
+  resp->err = NULL;
+  // The load loop below only runs while handle is NULL, so start from a known
+  // state rather than relying on the caller having zeroed resp.
+  resp->ch.handle = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  // Symbol name -> slot in the handle struct that receives its address.
+  struct lookup {
+    char *s;
+    void **p;
+  } l[4] = {
+      {"nvmlInit_v2", (void *)&resp->ch.initFn},
+      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
+  };
+
+  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
+    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
+  }
+  if (!resp->ch.handle) {
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             cuda_lib_paths[0], LOAD_ERR());
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < 4; i++) {  // TODO - fix this to use a null terminated list
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    // Test the resolved symbol (*p). p itself is the address of the slot and
+    // is never NULL, so checking it would never detect a failed lookup.
+    if (!*l[i].p) {
+      // Build the message before unloading so LOAD_ERR() still describes the
+      // failed lookup.
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               LOAD_ERR());
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+  return;
+}
+
+// cuda_check_vram queries device 0 through the NVML entry points in h and
+// stores its total and free memory in resp.
+//
+// Each call runs init -> get device handle -> get memory info -> shutdown.
+// On any failure resp->err is set to a strdup'd message (caller frees). The
+// totals are 0 unless the memory lookup itself succeeded.
+void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  resp->total = 0;
+  resp->free = 0;
+  nvmlDevice_t device;
+  nvmlMemory_t memInfo = {0};
+  nvmlReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvml handle isn't initialized");
+    return;
+  }
+
+  ret = (*h.initFn)();
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO - handle multiple GPUs
+  ret = (*h.getHandle)(0, &device);
+  if (ret != NVML_SUCCESS) {
+    (*h.shutdownFn)();
+    snprintf(buf, buflen, "unable to get device handle: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  ret = (*h.getMemInfo)(device, &memInfo);
+  if (ret != NVML_SUCCESS) {
+    (*h.shutdownFn)();
+    snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  resp->total = memInfo.total;
+  resp->free = memInfo.free;
+
+  // A shutdown failure is reported, but the totals gathered above are kept.
+  ret = (*h.shutdownFn)();
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret);
+    resp->err = strdup(buf);
+  }
+
+  return;
+}
+#endif  // __APPLE__
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -0,0 +1,35 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_CUDA_H__
+#define __GPU_INFO_CUDA_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+// These are local stand-ins declared here instead of including a vendor
+// header.
+typedef enum nvmlReturn_enum {
+  NVML_SUCCESS = 0,
+  // Other values omitted for now...
+} nvmlReturn_t;
+typedef void *nvmlDevice_t;  // Opaque is sufficient
+// Filled in by the getMemInfo entry point.
+// NOTE(review): field order and widths presumably mirror the vendor's
+// nvmlMemory_t - confirm against the NVML header.
+typedef struct nvmlMemory_st {
+  unsigned long long total;
+  unsigned long long free;
+  unsigned long long used;
+} nvmlMemory_t;
+
+// Loaded NVML library plus the entry points cuda_init resolves from it.
+typedef struct cuda_handle {
+  void *handle;  // library handle returned by LOAD_LIBRARY
+  nvmlReturn_t (*initFn)(void);      // "nvmlInit_v2"
+  nvmlReturn_t (*shutdownFn)(void);  // "nvmlShutdown"
+  nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);  // "nvmlDeviceGetHandleByIndex"
+  nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);  // "nvmlDeviceGetMemoryInfo"
+} cuda_handle_t;
+
+// Output of cuda_init.
+typedef struct cuda_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  cuda_handle_t ch;
+} cuda_init_resp_t;
+
+// Loads the NVML library and fills resp; on failure resp->err is a
+// heap-allocated (strdup) message.
+void cuda_init(cuda_init_resp_t *resp);
+// Queries device 0 through ch and writes total/free memory (or err) to resp.
+void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
+
+#endif  // __GPU_INFO_CUDA_H__
+#endif  // __APPLE__
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -0,0 +1,111 @@
+#ifndef __APPLE__
+
+#include "gpu_info_rocm.h"
+
+#include <string.h>
+
+#ifndef _WIN32
+// NULL-terminated list of ROCm SMI library names/paths. rocm_init tries them
+// in order and stops at the first one that loads.
+const char *rocm_lib_paths[] = {
+    "librocm_smi64.so",
+    "/opt/rocm/lib/librocm_smi64.so",
+    NULL,
+};
+#else
+// TODO untested
+// Windows candidates, same NULL-terminated convention.
+const char *rocm_lib_paths[] = {
+    "rocm_smi64.dll",
+    "/opt/rocm/lib/rocm_smi64.dll",
+    NULL,
+};
+#endif
+
+// rocm_init loads the ROCm SMI library from rocm_lib_paths and resolves the
+// four entry points stored in resp->rh.
+//
+// On success resp->err is NULL and resp->rh is ready for rocm_check_vram.
+// On failure resp->err is a strdup'd message the caller must free, and
+// resp->rh.handle is NULL.
+void rocm_init(rocm_init_resp_t *resp) {
+  resp->err = NULL;
+  // The load loop below only runs while handle is NULL, so start from a known
+  // state rather than relying on the caller having zeroed resp.
+  resp->rh.handle = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+  // Symbol name -> slot in the handle struct that receives its address.
+  struct lookup {
+    char *s;
+    void **p;
+  } l[4] = {
+      {"rsmi_init", (void *)&resp->rh.initFn},
+      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
+      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
+      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
+      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
+  };
+
+  for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
+    resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
+  }
+  if (!resp->rh.handle) {
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Radeon GPUs: %s\n",
+             rocm_lib_paths[0], LOAD_ERR());
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < 4; i++) {
+    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
+    // Test the resolved symbol (*p). p itself is the address of the slot and
+    // is never NULL, so checking it would never detect a failed lookup.
+    if (!*l[i].p) {
+      // Build the message before unloading so LOAD_ERR() still describes the
+      // failed lookup.
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               LOAD_ERR());
+      UNLOAD_LIBRARY(resp->rh.handle);
+      // Do not leave a dangling handle behind once the library is closed.
+      resp->rh.handle = NULL;
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+  return;
+}
+
+// rocm_check_vram queries device 0 through the ROCm SMI entry points in h and
+// stores total VRAM and (total - used) as free memory in resp.
+//
+// Each call runs init -> total lookup -> usage lookup -> shutdown. On any
+// failure resp->err is set to a strdup'd message (caller frees) and the
+// totals stay 0.
+void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  resp->total = 0;
+  resp->free = 0;
+  // uint32_t num_devices;
+  // uint16_t device;
+  uint64_t totalMem = 0;
+  uint64_t usedMem = 0;
+  rsmi_status_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+
+  // Refuse to call through function pointers of a library that was never
+  // loaded.
+  if (h.handle == NULL) {
+    resp->err = strdup("rocm handle isn't initialized");
+    return;
+  }
+
+  ret = (*h.initFn)(0);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO - iterate through devices...  ret =
+  // rsmi_num_monitor_devices(&num_devices);
+
+  // ret = (*h.getHandle)(0, &device);
+  // if (ret != RSMI_STATUS_SUCCESS) {
+  //     printf("rocm vram device lookup failure: %d\n", ret);
+  //     return -1;
+  // }
+
+  // Get total memory - used memory for available memory
+  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    (*h.shutdownFn)();
+    snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    (*h.shutdownFn)();
+    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  (*h.shutdownFn)();
+  resp->total = totalMem;
+  resp->free = totalMem - usedMem;
+  return;
+}
+
+#endif  // __APPLE__
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -0,0 +1,36 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_ROCM_H__
+#define __GPU_INFO_ROCM_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+// These are local stand-ins declared here instead of including a vendor
+// header.
+typedef enum rsmi_status_return {
+  RSMI_STATUS_SUCCESS = 0,
+  // Other values omitted for now...
+} rsmi_status_t;
+
+// Memory pool selector passed to the total/usage lookups; only
+// RSMI_MEM_TYPE_VRAM is used by rocm_check_vram.
+// NOTE(review): values presumably mirror the vendor enum - confirm against
+// the ROCm SMI header.
+typedef enum rsmi_memory_type {
+  RSMI_MEM_TYPE_VRAM = 0,
+  RSMI_MEM_TYPE_VIS_VRAM,
+  RSMI_MEM_TYPE_GTT,
+} rsmi_memory_type_t;
+
+// Loaded ROCm SMI library plus the entry points rocm_init resolves from it.
+typedef struct rocm_handle {
+  void *handle;  // library handle returned by LOAD_LIBRARY
+  rsmi_status_t (*initFn)(uint64_t);  // "rsmi_init"
+  rsmi_status_t (*shutdownFn)(void);  // "rsmi_shut_down"
+  rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);  // "rsmi_dev_memory_total_get"
+  rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);  // "rsmi_dev_memory_usage_get"
+  // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
+} rocm_handle_t;
+
+// Output of rocm_init.
+typedef struct rocm_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  rocm_handle_t rh;
+} rocm_init_resp_t;
+
+// Loads the ROCm SMI library and fills resp; on failure resp->err is a
+// heap-allocated (strdup) message.
+void rocm_init(rocm_init_resp_t *resp);
+// Queries device 0 through rh and writes total/free memory (or err) to resp.
+void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
+
+#endif  // __GPU_INFO_ROCM_H__
+#endif  // __APPLE__
--- a/gpu/gpu_test.go
+++ b/gpu/gpu_test.go
@@ -0,0 +1,26 @@
+package gpu
+
+import (
+	"runtime"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestBasicGetGPUInfo checks that GetGPUInfo reports one of the known drivers
+// and, on Linux and Windows, non-zero total and free memory.
+func TestBasicGetGPUInfo(t *testing.T) {
+	info := GetGPUInfo()
+	// Compare against the exact set of driver names. A substring check
+	// against a joined string would also accept "" or fragments like "CU".
+	assert.Contains(t, []string{"CUDA", "ROCM", "CPU", "METAL"}, info.Driver)
+
+	switch runtime.GOOS {
+	case "darwin":
+		// TODO - remove this once MacOS returns some size for CPU
+		return
+	case "linux", "windows":
+		assert.Greater(t, info.TotalMemory, uint64(0))
+		assert.Greater(t, info.FreeMemory, uint64(0))
+	default:
+		return
+	}
+}
+
+// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -0,0 +1,10 @@
+package gpu
+
+// GpuInfo describes the compute driver detected by GetGPUInfo and the memory
+// it reported. All fields are omitted from JSON when they hold their zero
+// value.
+//
+// Beginning of an `ollama info` command
+type GpuInfo struct {
+	// Driver is the detected backend name; the values set in this package are
+	// "CUDA", "ROCM", "CPU" and "METAL".
+	Driver string `json:"driver,omitempty"`
+	// TotalMemory is the total memory reported for the selected driver.
+	TotalMemory uint64 `json:"total_memory,omitempty"`
+	// FreeMemory is the free memory reported for the selected driver.
+	FreeMemory uint64 `json:"free_memory,omitempty"`
+
+	// TODO add other useful attributes about the card here for discovery information
+}