Detect very old CUDA GPUs and fall back to CPU

If we try to load the CUDA library on a very old GPU, the library panics and crashes the server. This change checks the GPU's compute capability before the library is loaded, so we can gracefully fall back to CPU mode.
2025-10-11 14:53:37 +02:00 · 2024-01-06 21:40:04 -08:00
parent 57942b4676
commit d74ce6bd4f
3 changed files with 74 additions and 2 deletions
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -22,6 +22,7 @@ typedef struct cuda_handle {
  nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
  nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
  nvmlReturn_t (*getCount)(unsigned int *);
+  nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
 } cuda_handle_t;

 typedef struct cuda_init_resp {
@@ -29,8 +30,15 @@ typedef struct cuda_init_resp {
  cuda_handle_t ch;
 } cuda_init_resp_t;

+typedef struct cuda_compute_capability {  // Passed by pointer to cuda_compute_capability(); presumably filled in as its result.
+  char *err;  // Error message slot; presumably NULL on success -- ownership not visible here, TODO confirm.
+  int major;  // Compute capability major version.
+  int minor;  // Compute capability minor version.
+} cuda_compute_capability_t;
+
 void cuda_init(cuda_init_resp_t *resp);
 void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
+void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);  // Compute-capability query via handle ch; result presumably written to *cc (implementation not in this header -- confirm).

 #endif  // __GPU_INFO_CUDA_H__
 #endif  // __APPLE__