cuda: enable flash attention

ggml added an option to disable flash attention, so explicitly enable it
commit b42aba40ed
parent 25885e5335
@@ -23,6 +23,7 @@ set(GGML_SCHED_MAX_COPIES 4)
 set(GGML_LLAMAFILE ON)
 set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
 set(GGML_CUDA_GRAPHS ON)
+set(GGML_CUDA_FA ON)
 
 if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
     OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))
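
For anyone configuring the bundled ggml sources by hand rather than through ollama's build scripts, the same switch can be passed on the CMake command line. A minimal sketch, assuming a CUDA-enabled build (the build directory name and the GGML_CUDA option reflect the usual ggml CMake flow and are not part of this commit):

    cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_FA=ON
    cmake --build build --config Release

Conversely, -DGGML_CUDA_FA=OFF would leave out the flash-attention CUDA kernels, which is the disable path the ggml option was introduced for.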