Mirror of https://github.com/ollama/ollama.git
discover: CPU supports flash attention
We already run flash attention on the CPU when a model is partially offloaded, but we were disabling it when running purely on the CPU, which is unnecessary.
@@ -171,7 +171,8 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 // For each GPU, check if it does NOT support flash attention
 func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
-		supportsFA := gpu.Library == "metal" ||
+		supportsFA := gpu.Library == "cpu" ||
+			gpu.Library == "metal" ||
 			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
 			gpu.Library == "rocm"
 
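For context, the full function after this change plausibly reads as the sketch below. Only the supportsFA assignment appears in the diff; the early return inside the loop and the final return true are assumptions about the surrounding code, not shown in the hunk.

// Sketch of GpuInfoList.FlashAttentionSupported after this change.
// Assumption: the function reports support only if every device in
// the list can run flash attention, returning false on the first
// device that cannot.
func (l GpuInfoList) FlashAttentionSupported() bool {
	for _, gpu := range l {
		supportsFA := gpu.Library == "cpu" || // new: pure-CPU inference now qualifies
			gpu.Library == "metal" ||
			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
			gpu.Library == "rocm"
		if !supportsFA {
			return false
		}
	}
	return true
}

With a "cpu" entry in the device list (the pure-CPU case), the check now passes, so flash attention is no longer disabled when nothing is offloaded to a GPU.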