From 837379a94c03e505bbad965a31eb1aa7976edb3c Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 13 Aug 2025 15:43:33 -0700 Subject: [PATCH] discovery: fix cudart driver version (#11614) We prefer the nvcuda library, which reports driver versions. When we dropped cuda v11, we added a safety check for too-old drivers. What we missed was the cudart fallback discovery logic didn't have driver version wired up. This fixes cudart discovery to expose the driver version as well so we no longer reject all GPUs if nvcuda didn't work. --- discover/gpu.go | 2 ++ discover/gpu_info_cudart.c | 9 +++------ discover/gpu_info_cudart.h | 7 ++----- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/discover/gpu.go b/discover/gpu.go index 15bad44669..f6e3c9cb1c 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -263,6 +263,8 @@ func GetGPUInfo() GpuInfoList { var driverMinor int if cHandles.cudart != nil { C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo) + driverMajor = int(cHandles.cudart.driver_major) + driverMinor = int(cHandles.cudart.driver_minor) } else { C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo) driverMajor = int(cHandles.nvcuda.driver_major) diff --git a/discover/gpu_info_cudart.c b/discover/gpu_info_cudart.c index bc5115bfdd..76c17b9d8f 100644 --- a/discover/gpu_info_cudart.c +++ b/discover/gpu_info_cudart.c @@ -69,18 +69,15 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) { } int version = 0; - cudartDriverVersion_t driverVersion; - driverVersion.major = 0; - driverVersion.minor = 0; // Report driver version if we're in verbose mode, ignore errors ret = (*resp->ch.cudaDriverGetVersion)(&version); if (ret != CUDART_SUCCESS) { LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret); } else { - driverVersion.major = version / 1000; - driverVersion.minor = (version - (driverVersion.major * 1000)) / 10; - LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor); + resp->ch.driver_major = version / 1000; + resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10; + LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", resp->ch.driver_major, resp->ch.driver_minor); } ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices); diff --git a/discover/gpu_info_cudart.h b/discover/gpu_info_cudart.h index ff0c0af19f..893f3f7bd9 100644 --- a/discover/gpu_info_cudart.h +++ b/discover/gpu_info_cudart.h @@ -29,11 +29,6 @@ typedef struct cudartMemory_st { size_t used; } cudartMemory_t; -typedef struct cudartDriverVersion { - int major; - int minor; -} cudartDriverVersion_t; - typedef struct cudaUUID { unsigned char bytes[16]; } cudaUUID_t; @@ -123,6 +118,8 @@ typedef struct cudaDeviceProp { typedef struct cudart_handle { void *handle; uint16_t verbose; + int driver_major; + int driver_minor; cudartReturn_t (*cudaSetDevice)(int device); cudartReturn_t (*cudaDeviceSynchronize)(void); cudartReturn_t (*cudaDeviceReset)(void);