//go:build linux || windows

package discover

/*
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
#cgo windows LDFLAGS: -lpthread

#include "gpu_info.h"
*/
import "C"

import (
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"unsafe"

	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
)
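
// cudaHandles collects the NVIDIA libraries that may be loaded at runtime:
// the CUDA runtime (cudart), the CUDA driver (nvcuda), and NVML for
// management queries.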
type cudaHandles struct {
	deviceCount int
	cudart      *C.cudart_handle_t
	nvcuda      *C.nvcuda_handle_t
	nvml        *C.nvml_handle_t
}
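
// oneapiHandles tracks the Intel oneAPI management library and the device
// count summed across all of its drivers.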
type oneapiHandles struct {
	oneapi      *C.oneapi_handle_t
	deviceCount int
}

const (
	cudaMinimumMemory = 457 * format.MebiByte
	rocmMinimumMemory = 457 * format.MebiByte
	// TODO OneAPI minimum memory
)

var (
	gpuMutex      sync.Mutex
	bootstrapped  bool
	cpus          []CPUInfo
	cudaGPUs      []CudaGPUInfo
	nvcudaLibPath string
	cudartLibPath string
	oneapiLibPath string
	nvmlLibPath   string
	rocmGPUs      []RocmGPUInfo
	oneapiGPUs    []OneapiGPUInfo

	// If any discovered GPUs are incompatible, report why
	unsupportedGPUs []UnsupportedGPUInfo

	// Keep track of errors during bootstrapping so that if GPUs that were
	// expected to be present are missing, this may explain why
	bootstrapErrors []error
)

// With our current CUDA compile flags, GPUs with compute capability older
// than 5.0 will not work properly
// (string values used to allow ldflags overrides at build time)
var (
	CudaComputeMajorMin = "5"
	CudaComputeMinorMin = "0"
)
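
// The minimums above can be overridden at build time, e.g. (sketch; assumes
// this package resides at github.com/ollama/ollama/discover):
//
//	go build -ldflags "-X github.com/ollama/ollama/discover.CudaComputeMajorMin=6"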

var RocmComputeMajorMin = "9"

// TODO find a better way to detect iGPU instead of minimum memory
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
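
// initCudaHandles loads NVML first when a management library is available,
// then prefers the CUDA driver library (nvcuda) over the CUDA runtime
// (cudart) for device enumeration, caching whichever library paths succeed.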
// Note: gpuMutex must already be held
func initCudaHandles() *cudaHandles {
	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

	cHandles := &cudaHandles{}
	// Short circuit if we already know which library to use;
	// ignore bootstrap errors in this case since we already recorded them
	if nvmlLibPath != "" {
		cHandles.nvml, _, _ = loadNVMLMgmt([]string{nvmlLibPath})
		return cHandles
	}
	if nvcudaLibPath != "" {
		cHandles.deviceCount, cHandles.nvcuda, _, _ = loadNVCUDAMgmt([]string{nvcudaLibPath})
		return cHandles
	}
	if cudartLibPath != "" {
		cHandles.deviceCount, cHandles.cudart, _, _ = loadCUDARTMgmt([]string{cudartLibPath})
		return cHandles
	}

	slog.Debug("searching for GPU discovery libraries for NVIDIA")
	var cudartMgmtPatterns []string

	// Aligned with driver, we can't carry as payloads
	nvcudaMgmtPatterns := NvcudaGlobs
	cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(LibOllamaPath, "cuda_v*", CudartMgmtName))
	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)

	if len(NvmlGlobs) > 0 {
		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
		if len(nvmlLibPaths) > 0 {
			nvml, libPath, err := loadNVMLMgmt(nvmlLibPaths)
			if nvml != nil {
				slog.Debug("nvidia-ml loaded", "library", libPath)
				cHandles.nvml = nvml
				nvmlLibPath = libPath
			}
			if err != nil {
				bootstrapErrors = append(bootstrapErrors, err)
			}
		}
	}

	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
	if len(nvcudaLibPaths) > 0 {
		deviceCount, nvcuda, libPath, err := loadNVCUDAMgmt(nvcudaLibPaths)
		if nvcuda != nil {
			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
			cHandles.nvcuda = nvcuda
			cHandles.deviceCount = deviceCount
			nvcudaLibPath = libPath
			return cHandles
		}
		if err != nil {
			bootstrapErrors = append(bootstrapErrors, err)
		}
	}

	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
	if len(cudartLibPaths) > 0 {
		deviceCount, cudart, libPath, err := loadCUDARTMgmt(cudartLibPaths)
		if cudart != nil {
			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
			cHandles.cudart = cudart
			cHandles.deviceCount = deviceCount
			cudartLibPath = libPath
			return cHandles
		}
		if err != nil {
			bootstrapErrors = append(bootstrapErrors, err)
		}
	}

	return cHandles
}

// Note: gpuMutex must already be held
func initOneAPIHandles() *oneapiHandles {
	oHandles := &oneapiHandles{}

	// Short circuit if we already know which library to use;
	// ignore bootstrap errors in this case since we already recorded them
	if oneapiLibPath != "" {
		oHandles.deviceCount, oHandles.oneapi, _, _ = loadOneapiMgmt([]string{oneapiLibPath})
		return oHandles
	}

	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
	if len(oneapiLibPaths) > 0 {
		var err error
		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath, err = loadOneapiMgmt(oneapiLibPaths)
		if err != nil {
			bootstrapErrors = append(bootstrapErrors, err)
		}
	}

	return oHandles
}
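
// GetCPUInfo returns the CPU entry as a single-element GpuInfoList, running a
// full discovery pass first if one has not completed yet.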
func GetCPUInfo() GpuInfoList {
	gpuMutex.Lock()
	if !bootstrapped {
		gpuMutex.Unlock()
		GetGPUInfo()
	} else {
		gpuMutex.Unlock()
	}
	return GpuInfoList{cpus[0].GpuInfo}
}
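
// GetGPUInfo performs GPU discovery on first use and refreshes free memory on
// subsequent calls, returning one GpuInfo per supported device; if no GPUs
// are usable it falls back to a single CPU entry.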
func GetGPUInfo() GpuInfoList {
	// TODO - consider exploring lspci (and equivalent on windows) to check for
	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
	gpuMutex.Lock()
	defer gpuMutex.Unlock()

	needRefresh := true
	var cHandles *cudaHandles
	var oHandles *oneapiHandles
	defer func() {
		if cHandles != nil {
			if cHandles.cudart != nil {
				C.cudart_release(*cHandles.cudart)
			}
			if cHandles.nvcuda != nil {
				C.nvcuda_release(*cHandles.nvcuda)
			}
			if cHandles.nvml != nil {
				C.nvml_release(*cHandles.nvml)
			}
		}
		if oHandles != nil {
			if oHandles.oneapi != nil {
				// TODO - is this needed?
				C.oneapi_release(*oHandles.oneapi)
			}
		}
	}()

	if !bootstrapped {
		slog.Info("looking for compatible GPUs")
		cudaComputeMajorMin, err := strconv.Atoi(CudaComputeMajorMin)
		if err != nil {
			slog.Error("invalid CudaComputeMajorMin setting", "value", CudaComputeMajorMin, "error", err)
		}
		cudaComputeMinorMin, err := strconv.Atoi(CudaComputeMinorMin)
		if err != nil {
			slog.Error("invalid CudaComputeMinorMin setting", "value", CudaComputeMinorMin, "error", err)
		}
		bootstrapErrors = []error{}
		needRefresh = false
		var memInfo C.mem_info_t

		mem, err := GetCPUMem()
		if err != nil {
			slog.Warn("error looking up system memory", "error", err)
		}

		details, err := GetCPUDetails()
		if err != nil {
			slog.Warn("failed to lookup CPU details", "error", err)
		}
		cpus = []CPUInfo{
			{
				GpuInfo: GpuInfo{
					memInfo: mem,
					Library: "cpu",
					ID:      "0",
				},
				CPUs: details,
			},
		}

		// Load ALL libraries
		cHandles = initCudaHandles()

		// NVIDIA
		for i := range cHandles.deviceCount {
			if cHandles.cudart != nil || cHandles.nvcuda != nil {
				gpuInfo := CudaGPUInfo{
					GpuInfo: GpuInfo{
						Library: "cuda",
					},
					index: i,
				}
				var driverMajor int
				var driverMinor int
				if cHandles.cudart != nil {
					C.cudart_bootstrap(*cHandles.cudart, C.int(i), &memInfo)
				} else {
					C.nvcuda_bootstrap(*cHandles.nvcuda, C.int(i), &memInfo)
					driverMajor = int(cHandles.nvcuda.driver_major)
					driverMinor = int(cHandles.nvcuda.driver_minor)
				}
				if memInfo.err != nil {
					slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
					C.free(unsafe.Pointer(memInfo.err))
					continue
				}
				gpuInfo.TotalMemory = uint64(memInfo.total)
				gpuInfo.FreeMemory = uint64(memInfo.free)
				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
				gpuInfo.computeMajor = int(memInfo.major)
				gpuInfo.computeMinor = int(memInfo.minor)
				gpuInfo.MinimumMemory = cudaMinimumMemory
				gpuInfo.DriverMajor = driverMajor
				gpuInfo.DriverMinor = driverMinor
				variant := cudaVariant(gpuInfo)

				// Start with our bundled libraries
				if variant != "" {
					variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant)
					if _, err := os.Stat(variantPath); err == nil {
						// Put the variant directory first in the search path to avoid runtime linking to the wrong library
						gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...)
					}
				}
				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
				gpuInfo.Variant = variant

				if int(memInfo.major) < cudaComputeMajorMin || (int(memInfo.major) == cudaComputeMajorMin && int(memInfo.minor) < cudaComputeMinorMin) {
					unsupportedGPUs = append(unsupportedGPUs,
						UnsupportedGPUInfo{
							GpuInfo: gpuInfo.GpuInfo,
						})
					slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
					continue
				}

				// query the management library as well so we can record any skew between the two
				// which represents overhead on the GPU we must set aside on subsequent updates
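				// Worked example (assumed numbers): if cudart/nvcuda reported
				// 7.5 GiB free but NVML reports 8.0 GiB free, the 0.5 GiB skew
				// is recorded as OSOverhead and subtracted from NVML's reading
				// on later refreshes.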
				if cHandles.nvml != nil {
					uuid := C.CString(gpuInfo.ID)
					defer C.free(unsafe.Pointer(uuid))
					C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
					if memInfo.err != nil {
						slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
						C.free(unsafe.Pointer(memInfo.err))
					} else {
						if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory {
							gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory
							slog.Info("detected OS VRAM overhead",
								"id", gpuInfo.ID,
								"library", gpuInfo.Library,
								"compute", gpuInfo.Compute,
								"driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor),
								"name", gpuInfo.Name,
								"overhead", format.HumanBytes2(gpuInfo.OSOverhead),
							)
						}
					}
				}

				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
				cudaGPUs = append(cudaGPUs, gpuInfo)
			}
		}

		// Intel
		if envconfig.IntelGPU() {
			oHandles = initOneAPIHandles()
			if oHandles != nil && oHandles.oneapi != nil {
				for d := range oHandles.oneapi.num_drivers {
					if oHandles.oneapi == nil {
						// shouldn't happen
						slog.Warn("nil oneapi handle with driver count", "count", int(oHandles.oneapi.num_drivers))
						continue
					}
					devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
					for i := range devCount {
						gpuInfo := OneapiGPUInfo{
							GpuInfo: GpuInfo{
								Library: "oneapi",
							},
							driverIndex: int(d),
							gpuIndex:    int(i),
						}
						// TODO - split bootstrapping from updating free memory
						C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
						// TODO - convert this to MinimumMemory based on testing...
						var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
						memInfo.free = C.uint64_t(totalFreeMem)
						gpuInfo.TotalMemory = uint64(memInfo.total)
						gpuInfo.FreeMemory = uint64(memInfo.free)
						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
						gpuInfo.DependencyPath = []string{LibOllamaPath}
						oneapiGPUs = append(oneapiGPUs, gpuInfo)
					}
				}
			}
		}

		rocmGPUs, err = AMDGetGPUInfo()
		if err != nil {
			bootstrapErrors = append(bootstrapErrors, err)
		}
		bootstrapped = true
		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
			slog.Info("no compatible GPUs were discovered")
		}

		// TODO verify we have runners for the discovered GPUs, filter out any that aren't supported with good error messages
	}

	// For detected GPUs, load library if not loaded
	// Refresh free memory usage
	if needRefresh {
		mem, err := GetCPUMem()
		if err != nil {
			slog.Warn("error looking up system memory", "error", err)
		} else {
			slog.Debug("updating system memory data",
				slog.Group(
					"before",
					"total", format.HumanBytes2(cpus[0].TotalMemory),
					"free", format.HumanBytes2(cpus[0].FreeMemory),
					"free_swap", format.HumanBytes2(cpus[0].FreeSwap),
				),
				slog.Group(
					"now",
					"total", format.HumanBytes2(mem.TotalMemory),
					"free", format.HumanBytes2(mem.FreeMemory),
					"free_swap", format.HumanBytes2(mem.FreeSwap),
				),
			)
			cpus[0].FreeMemory = mem.FreeMemory
			cpus[0].FreeSwap = mem.FreeSwap
		}

		var memInfo C.mem_info_t
		if cHandles == nil && len(cudaGPUs) > 0 {
			cHandles = initCudaHandles()
		}
		for i, gpu := range cudaGPUs {
			if cHandles.nvml != nil {
				uuid := C.CString(gpu.ID)
				defer C.free(unsafe.Pointer(uuid))
				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
			} else if cHandles.cudart != nil {
				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
			} else if cHandles.nvcuda != nil {
				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
				memInfo.used = memInfo.total - memInfo.free
			} else {
				// shouldn't happen
				slog.Warn("no valid cuda library loaded to refresh vram usage")
				break
			}
			if memInfo.err != nil {
				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
				C.free(unsafe.Pointer(memInfo.err))
				continue
			}
			if memInfo.free == 0 {
				slog.Warn("error looking up nvidia GPU memory")
				continue
			}
			if cHandles.nvml != nil && gpu.OSOverhead > 0 {
				// When using the management library update based on recorded overhead
				memInfo.free -= C.uint64_t(gpu.OSOverhead)
			}
			slog.Debug("updating cuda memory data",
				"gpu", gpu.ID,
				"name", gpu.Name,
				"overhead", format.HumanBytes2(gpu.OSOverhead),
				slog.Group(
					"before",
					"total", format.HumanBytes2(gpu.TotalMemory),
					"free", format.HumanBytes2(gpu.FreeMemory),
				),
				slog.Group(
					"now",
					"total", format.HumanBytes2(uint64(memInfo.total)),
					"free", format.HumanBytes2(uint64(memInfo.free)),
					"used", format.HumanBytes2(uint64(memInfo.used)),
				),
			)
			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
		}

		if oHandles == nil && len(oneapiGPUs) > 0 {
			oHandles = initOneAPIHandles()
		}
		for i, gpu := range oneapiGPUs {
			if oHandles.oneapi == nil {
				// shouldn't happen
				slog.Warn("nil oneapi handle with device count", "count", oHandles.deviceCount)
				continue
			}
			C.oneapi_check_vram(*oHandles.oneapi, C.int(gpu.driverIndex), C.int(gpu.gpuIndex), &memInfo)
			// TODO - convert this to MinimumMemory based on testing...
			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
			memInfo.free = C.uint64_t(totalFreeMem)
			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
		}

		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
		if err != nil {
			slog.Debug("problem refreshing ROCm free memory", "error", err)
		}
	}

	resp := []GpuInfo{}
	for _, gpu := range cudaGPUs {
		resp = append(resp, gpu.GpuInfo)
	}
	for _, gpu := range rocmGPUs {
		resp = append(resp, gpu.GpuInfo)
	}
	for _, gpu := range oneapiGPUs {
		resp = append(resp, gpu.GpuInfo)
	}
	if len(resp) == 0 {
		resp = append(resp, cpus[0].GpuInfo)
	}
	return resp
}
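
// A sketch of typical usage (hypothetical caller, not part of this file):
//
//	gpus := GetGPUInfo()
//	for _, g := range gpus {
//		slog.Info("gpu", "id", g.ID, "library", g.Library,
//			"free", format.HumanBytes2(g.FreeMemory))
//	}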

func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
	gpuLibPaths := []string{}
	slog.Debug("Searching for GPU library", "name", baseLibName)

	// search our bundled libraries first
	patterns := []string{filepath.Join(LibOllamaPath, baseLibName)}

	var ldPaths []string
	switch runtime.GOOS {
	case "windows":
		ldPaths = strings.Split(os.Getenv("PATH"), string(os.PathListSeparator))
	case "linux":
		ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), string(os.PathListSeparator))
	}

	// then search the system's library path (LD_LIBRARY_PATH, or PATH on windows)
	for _, p := range ldPaths {
		p, err := filepath.Abs(p)
		if err != nil {
			continue
		}
		patterns = append(patterns, filepath.Join(p, baseLibName))
	}

	// finally, search the default patterns provided by the caller
	patterns = append(patterns, defaultPatterns...)
	slog.Debug("gpu library search", "globs", patterns)
	for _, pattern := range patterns {
		// Nvidia PhysX known to return bogus results
		if strings.Contains(pattern, "PhysX") {
			slog.Debug("skipping PhysX cuda library path", "path", pattern)
			continue
		}
		// Ignore glob discovery errors
		matches, _ := filepath.Glob(pattern)
		for _, match := range matches {
			// Resolve any links so we don't try the same lib multiple times
			// and weed out any dups across globs
			libPath := match
			tmp := match
			var err error
			for ; err == nil; tmp, err = os.Readlink(libPath) {
				if !filepath.IsAbs(tmp) {
					tmp = filepath.Join(filepath.Dir(libPath), tmp)
				}
				libPath = tmp
			}
			new := true
			for _, cmp := range gpuLibPaths {
				if cmp == libPath {
					new = false
					break
				}
			}
			if new {
				gpuLibPaths = append(gpuLibPaths, libPath)
			}
		}
	}
	slog.Debug("discovered GPU libraries", "paths", gpuLibPaths)
	return gpuLibPaths
}
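
// Example invocation (sketch; "libcuda.so*" as the driver library name is an
// assumption about the platform-specific globs defined elsewhere):
//
//	paths := FindGPULibs("libcuda.so*", NvcudaGlobs)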

// Bootstrap the runtime library
// Returns: num devices, handle, libPath, error
func loadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string, error) {
	var resp C.cudart_init_resp_t
	resp.ch.verbose = getVerboseState()
	var err error
	for _, libPath := range cudartLibPaths {
		lib := C.CString(libPath)
		defer C.free(unsafe.Pointer(lib))
		C.cudart_init(lib, &resp)
		if resp.err != nil {
			err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err))
			slog.Debug(err.Error())
			C.free(unsafe.Pointer(resp.err))
		} else {
			err = nil
			return int(resp.num_devices), &resp.ch, libPath, err
		}
	}
	return 0, nil, "", err
}

// Bootstrap the driver library
// Returns: num devices, handle, libPath, error
func loadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string, error) {
	var resp C.nvcuda_init_resp_t
	resp.ch.verbose = getVerboseState()
	var err error
	for _, libPath := range nvcudaLibPaths {
		lib := C.CString(libPath)
		defer C.free(unsafe.Pointer(lib))
		C.nvcuda_init(lib, &resp)
		if resp.err != nil {
			// Decide what log level based on the type of error message to help users understand why
			switch resp.cudaErr {
			case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
				err = fmt.Errorf("version mismatch between driver and cuda driver library - reboot or upgrade may be required: library %s", libPath)
				slog.Warn(err.Error())
			case C.CUDA_ERROR_NO_DEVICE:
				err = fmt.Errorf("no nvidia devices detected by library %s", libPath)
				slog.Info(err.Error())
			case C.CUDA_ERROR_UNKNOWN:
				err = fmt.Errorf("unknown error initializing cuda driver library %s: %s. see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information", libPath, C.GoString(resp.err))
				slog.Warn(err.Error())
			default:
				msg := C.GoString(resp.err)
				if strings.Contains(msg, "wrong ELF class") {
					slog.Debug("skipping 32bit library", "library", libPath)
				} else {
					err = fmt.Errorf("Unable to load cuda driver library %s: %s", libPath, msg)
					slog.Info(err.Error())
				}
			}
			C.free(unsafe.Pointer(resp.err))
		} else {
			err = nil
			return int(resp.num_devices), &resp.ch, libPath, err
		}
	}
	return 0, nil, "", err
}

// Bootstrap the management library
// Returns: handle, libPath, error
func loadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string, error) {
	var resp C.nvml_init_resp_t
	resp.ch.verbose = getVerboseState()
	var err error
	for _, libPath := range nvmlLibPaths {
		lib := C.CString(libPath)
		defer C.free(unsafe.Pointer(lib))
		C.nvml_init(lib, &resp)
		if resp.err != nil {
			err = fmt.Errorf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err))
			slog.Info(err.Error())
			C.free(unsafe.Pointer(resp.err))
		} else {
			err = nil
			return &resp.ch, libPath, err
		}
	}
	return nil, "", err
}

// Bootstrap the Intel GPU library
// Returns: num devices, handle, libPath, error
func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, error) {
	var resp C.oneapi_init_resp_t
	num_devices := 0
	resp.oh.verbose = getVerboseState()
	var err error
	for _, libPath := range oneapiLibPaths {
		lib := C.CString(libPath)
		defer C.free(unsafe.Pointer(lib))
		C.oneapi_init(lib, &resp)
		if resp.err != nil {
			err = fmt.Errorf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err))
			slog.Debug(err.Error())
			C.free(unsafe.Pointer(resp.err))
		} else {
			err = nil
			for i := range resp.oh.num_drivers {
				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
			}
			return num_devices, &resp.oh, libPath, err
		}
	}
	return 0, nil, "", err
}

func getVerboseState() C.uint16_t {
	if envconfig.Debug() {
		return C.uint16_t(1)
	}
	return C.uint16_t(0)
}

// Given the list of GPUs this instantiation is targeted for,
// figure out the visible devices environment variable
//
// If different libraries are detected, the first one is what we use
func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
	if len(l) == 0 {
		return "", ""
	}
	switch l[0].Library {
	case "cuda":
		return cudaGetVisibleDevicesEnv(l)
	case "rocm":
		return rocmGetVisibleDevicesEnv(l)
	case "oneapi":
		return oneapiGetVisibleDevicesEnv(l)
	default:
		slog.Debug("no filter required for library " + l[0].Library)
		return "", ""
	}
}
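
// Callers typically export the returned key/value pair into a subprocess
// environment (sketch, assuming an exec.Cmd named cmd):
//
//	if key, val := gpus.GetVisibleDevicesEnv(); key != "" {
//		cmd.Env = append(cmd.Env, key+"="+val)
//	}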

func GetSystemInfo() SystemInfo {
	gpus := GetGPUInfo()
	gpuMutex.Lock()
	defer gpuMutex.Unlock()
	discoveryErrors := []string{}
	for _, err := range bootstrapErrors {
		discoveryErrors = append(discoveryErrors, err.Error())
	}
	if len(gpus) == 1 && gpus[0].Library == "cpu" {
		gpus = []GpuInfo{}
	}
	return SystemInfo{
		System:          cpus[0],
		GPUs:            gpus,
		UnsupportedGPUs: unsupportedGPUs,
		DiscoveryErrors: discoveryErrors,
	}
}
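
// Sketch of how a caller might surface discovery failures (assumed usage):
//
//	info := GetSystemInfo()
//	for _, e := range info.DiscoveryErrors {
//		slog.Warn("gpu discovery", "error", e)
//	}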