From b42aba40ed21862532b9d195a28915a1fca88560 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Fri, 28 Feb 2025 11:28:26 -0800
Subject: [PATCH] cuda: enable flash attention

ggml added an option to disable flash attention so explicitly enable it
---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 63b804582..92b1793b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,7 @@ set(GGML_SCHED_MAX_COPIES 4)
 set(GGML_LLAMAFILE ON)
 set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
 set(GGML_CUDA_GRAPHS ON)
+set(GGML_CUDA_FA ON)
 if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+"))