build: recursive make ggml-cuda

This commit is contained in:
Michael Yang 2024-12-09 15:38:39 -08:00
parent 9f7d3ff2a8
commit 054e31146e
11 changed files with 195 additions and 62 deletions

112
Makefile2 Normal file
View File

@ -0,0 +1,112 @@
# Top-level build: compiles per-platform "runner" Go binaries for each
# accelerator variant (metal / avx / avx2 / cuda_v11 / cuda_v12 / rocm).
# Native GPU kernels are built by the recursive make in
# ml/backend/ggml/ggml/ggml-cuda.

export GOOS?=$(shell go env GOOS)
export GOARCH?=$(shell go env GOARCH)

# Default goal: build every runner enabled for the host platform.
build: llama/build/$(GOOS)-$(GOARCH)

export GOFLAGS=-trimpath

# Target-specific Go build tags for each runner variant.
llama/build/%/runners/metal: GOFLAGS+=-tags=metal
llama/build/%/runners/cpu_avx: GOFLAGS+=-tags=avx
llama/build/%/runners/cpu_avx2: GOFLAGS+=-tags=avx2
llama/build/%/runners/cuda_v11: GOFLAGS+=-tags=cuda,cuda_v11
llama/build/%/runners/cuda_v12: GOFLAGS+=-tags=cuda,cuda_v12
llama/build/%/runners/rocm: GOFLAGS+=-tags=cuda,rocm

.PHONY: llama/build/darwin-amd64 llama/build/darwin-arm64
llama/build/darwin-amd64: llama/build/darwin-amd64/runners/cpu_avx
llama/build/darwin-arm64: llama/build/darwin-arm64/runners/metal

.PHONY: llama/build/linux-amd64 llama/build/linux-arm64
llama/build/linux-amd64: llama/build/linux-amd64/runners/cpu_avx
llama/build/linux-amd64: llama/build/linux-amd64/runners/cpu_avx2
llama/build/linux-arm64: llama/build/linux-arm64/runners/cpu_avx
llama/build/linux-arm64: llama/build/linux-arm64/runners/cpu_avx2

# BUGFIX: was ".PHONY: llama/build/windows-amd64 linux/build/windows-arm64";
# the typo'd prefix left the windows-arm64 aggregate target non-phony.
.PHONY: llama/build/windows-amd64 llama/build/windows-arm64
llama/build/windows-amd64: llama/build/windows-amd64/runners/cpu_avx
llama/build/windows-amd64: llama/build/windows-amd64/runners/cpu_avx2
llama/build/windows-arm64: llama/build/windows-arm64/runners/cpu_avx
llama/build/windows-arm64: llama/build/windows-arm64/runners/cpu_avx2

# Recursive build of the native GPU libraries.
# BUGFIX: rocm was missing from .PHONY even though it shares this rule.
.PHONY: cuda_v11 cuda_v12 rocm
cuda_v11 cuda_v12 rocm:
	$(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $@

# Toolchain discovery (:= so each shell probe runs once at parse time,
# not on every expansion).
ifeq ($(GOOS),linux)
NVCC11:=$(shell command -v /usr/local/cuda-11/bin/nvcc)
NVCC12:=$(shell command -v /usr/local/cuda-12/bin/nvcc)
HIPCC:=$(shell command -v hipcc)
else ifeq ($(GOOS),windows)
NVCC11:=$(shell ls "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.?\\bin\\nvcc.exe")
NVCC12:=$(shell ls "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.?\\bin\\nvcc.exe")
HIPCC:=$(shell command -v hipcc)
endif

ifneq ($(NVCC11),)
ifeq ($(OLLAMA_SKIP_GENERATE_CUDA_11),)
# BUGFIX: the list previously ended with a trailing backslash, which would
# splice the following rule line into the variable's value.
CUDA_V11_TARGETS= \
	llama/build/linux-amd64/runners/cuda_v11 \
	llama/build/linux-arm64/runners/cuda_v11 \
	llama/build/windows-amd64/runners/cuda_v11 \
	llama/build/windows-arm64/runners/cuda_v11

$(CUDA_V11_TARGETS): cuda_v11
cuda_v11: export NVCC=$(NVCC11)
llama/build/linux-amd64: llama/build/linux-amd64/runners/cuda_v11
llama/build/linux-arm64: llama/build/linux-arm64/runners/cuda_v11
llama/build/windows-amd64: llama/build/windows-amd64/runners/cuda_v11
llama/build/windows-arm64: llama/build/windows-arm64/runners/cuda_v11
endif
endif

ifneq ($(NVCC12),)
ifeq ($(OLLAMA_SKIP_GENERATE_CUDA_12),)
# BUGFIX: trailing backslash removed (see CUDA_V11_TARGETS above).
CUDA_V12_TARGETS= \
	llama/build/linux-amd64/runners/cuda_v12 \
	llama/build/linux-arm64/runners/cuda_v12 \
	llama/build/windows-amd64/runners/cuda_v12 \
	llama/build/windows-arm64/runners/cuda_v12

$(CUDA_V12_TARGETS): cuda_v12
cuda_v12: export NVCC=$(NVCC12)
llama/build/linux-amd64: llama/build/linux-amd64/runners/cuda_v12
llama/build/linux-arm64: llama/build/linux-arm64/runners/cuda_v12
llama/build/windows-amd64: llama/build/windows-amd64/runners/cuda_v12
llama/build/windows-arm64: llama/build/windows-arm64/runners/cuda_v12
endif
endif

ifneq ($(HIPCC),)
ifeq ($(OLLAMA_SKIP_GENERATE_ROCM),)
# BUGFIX: trailing backslash removed (see CUDA_V11_TARGETS above).
ROCM_TARGETS= \
	llama/build/linux-amd64/runners/rocm \
	llama/build/linux-arm64/runners/rocm \
	llama/build/windows-amd64/runners/rocm \
	llama/build/windows-arm64/runners/rocm

$(ROCM_TARGETS): rocm
rocm: export NVCC=$(HIPCC)
llama/build/linux-amd64: llama/build/linux-amd64/runners/rocm
llama/build/linux-arm64: llama/build/linux-arm64/runners/rocm
llama/build/windows-amd64: llama/build/windows-amd64/runners/rocm
llama/build/windows-arm64: llama/build/windows-arm64/runners/rocm
endif
endif

export CGO_ENABLED=1
# go rejects -mfma/-mf16c unless explicitly allowed (golang/go#17895).
export CGO_CPPFLAGS_ALLOW=-mfma|-mf16c

# Build one runner binary into its own directory.  Depending on the phony
# "always" forces go build to run every time; go's own build cache keeps
# the no-op case cheap.  BUGFIX: commands chained with && instead of ; so
# a failed mkdir aborts the recipe.
llama/build/%: cmd/runner always
	mkdir -p $@ && go build -o $@ ./$<

.PHONY: always
always:

.PHONY: build clean realclean
clean:
	$(RM) -r llama/build

# $< expands to the first prerequisite, "clean", so this also runs the
# sub-make's clean target.
realclean: clean
	$(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $<

View File

@ -37,8 +37,7 @@ go build -tags avx .
```shell
# go doesn't recognize `-mfma` as a valid compiler flag
# see https://github.com/golang/go/issues/17895
go env -w "CGO_CFLAGS_ALLOW=-mfma|-mf16c"
go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c"
go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c"
go build -tags=avx,avx2 .
```

View File

@ -94,4 +94,3 @@ CPPFLAGS += \
-I../ml/backend/ggml/ggml/include \
-I../ml/backend/ggml/ggml/ggml-cpu \
-I../ml/backend/ggml/ggml/ggml-cpu/amx \
>>>>>>> 22320f09 (preserve vendor directory structure):llama/make/common-defs.make

View File

@ -11,74 +11,31 @@ GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERS
# today, cuda is bundled, but rocm is split out. Should split them each out by runner
DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)
GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
GPU_RUNNER_SRCS := \
$(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \
$(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \
llama/ggml.c llama/ggml-backend.cpp llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-aarch64.c llama/ggml-threading.cpp
GPU_RUNNER_HDRS := \
$(wildcard llama/ggml-cuda/*.cuh)
# Conditional flags and components to speed up developer builds
ifneq ($(OLLAMA_FAST_BUILD),)
GPU_COMPILER_CUFLAGS += \
-DGGML_DISABLE_FLASH_ATTN
else
GPU_RUNNER_SRCS += \
$(wildcard llama/ggml-cuda/fattn*.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
$(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
endif
GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
$(GPU_RUNNER_NAME): $(BUILD_RUNNERS)
$(GPU_RUNNER_NAME): $(BUILD_RUNNERS)
dist: $(DIST_RUNNERS)
# Build targets
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $<
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $<
$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/"
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@-mkdir -p $(dir $@)
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
@-mkdir -p $(@D)
$(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $(GPU_RUNNER_NAME) CXX=$(GPU_COMPILER)
GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner
$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
@-mkdir -p $(dir $@)
$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
# Distribution targets
$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@-mkdir -p $(dir $@)
@-mkdir -p $(@D)
$(CP) $< $@
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS)
$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
@-mkdir -p $(dir $@)
$(CP) $< $@
$(GPU_DIST_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
@-mkdir -p $(@D)
$(CP) $(GPU_LIB_DIR)/$(@F) $(@D)
clean:
rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS)
clean:
$(RM) $(BUILD_RUNNERS) $(DIST_RUNNERS)
.PHONY: clean $(GPU_RUNNER_NAME)

View File

@ -4,8 +4,7 @@ package cpu
// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../include -I${SRCDIR}/amx
// #cgo CPPFLAGS: -D_GNU_SOURCE
// #cgo amd64,avx CPPFLAGS: -mavx
// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma
// #cgo amd64,f16c CPPFLAGS: -mf16c
// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -mf16c
// #cgo arm64 CPPFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
import "C"
import (

View File

@ -0,0 +1 @@
*.o

View File

@ -0,0 +1,64 @@
# Builds the ggml CUDA (or ROCm, via hipcc) kernel library for each
# supported toolkit version.  Invoked recursively from the top-level
# Makefile with NVCC pointing at the desired compiler.

NVCC?=nvcc

# Flags placed before the source file on the nvcc command line.
# BUGFIX: the list previously ended with a trailing backslash, which
# spliced the NVCC_APPEND_FLAGS assignment into this value.
NVCC_PREPEND_FLAGS= \
	-t 2 \
	-split-compile 0 \
	-std=c++17 \
	-I.. \
	-I../include \
	$(foreach ARCH,$(subst ;, ,$(CUDA_ARCHS)),--generate-code=arch=compute_$(ARCH),code=sm_$(ARCH))

# Flags placed after the source file.  BUGFIX: the commented-out flag line
# ended with a backslash, continuing the comment and swallowing the
# ALL_SOURCES assignment below.
# NVCC_APPEND_FLAGS+=-DGGML_CUDA_USE_GRAPHS=1
NVCC_APPEND_FLAGS=

ALL_SOURCES:=$(wildcard *.cu)
FATTN_SOURCES:=$(wildcard fattn*.cu)

# BUGFIX: trailing backslash removed — it spliced the ifneq line into
# the value.
SOURCES= \
	$(filter-out $(FATTN_SOURCES),$(ALL_SOURCES)) \
	$(wildcard template-instances/mmq*.cu)

# Skip the flash-attention kernels to speed up developer builds.
ifneq ($(OLLAMA_FAST_BUILD),)
NVCC_APPEND_FLAGS+=-DGGML_DISABLE_FLASH_ATTN
else
SOURCES+= \
	$(FATTN_SOURCES) \
	$(wildcard template-instances/fattn-wmma*.cu) \
	$(wildcard template-instances/fattn-vec*q4_0-q4_0.cu) \
	$(wildcard template-instances/fattn-vec*q8_0-q8_0.cu) \
	$(wildcard template-instances/fattn-vec*f16-f16.cu)
endif

.PHONY: all cuda_v11 cuda_v12 rocm clean
all: cuda_v11 cuda_v12

# Per-variant architecture lists and object-name suffixes.  OBJECTS is a
# target-specific variable: it propagates to each variant's library
# prerequisite and is picked up there via second expansion below.
cuda_v11: CUDA_ARCHS?=50;52;53;60;61;62;70;72;75;80;86
cuda_v11: OBJECTS=$(patsubst %.cu,%.v11.o,$(SOURCES))
cuda_v12: CUDA_ARCHS?=60;61;62;70;72;75;80;86;87;89;90;90a
cuda_v12: OBJECTS=$(patsubst %.cu,%.v12.o,$(SOURCES))
# NOTE(review): CPPFLAGS is not referenced by the compile rule below and is
# not exported — confirm whether -DGGML_USE_HIP actually reaches hipcc.
rocm: CPPFLAGS+=-DGGML_USE_HIP
rocm: OBJECTS=$(patsubst %.cu,%.rocm.o,$(SOURCES))

ifeq ($(OS),Windows_NT)
cuda_v11: libggml_cuda_v11.dll
cuda_v12: libggml_cuda_v12.dll
rocm: libggml_rocm.dll
else
cuda_v11: libggml_cuda_v11.a
cuda_v12: libggml_cuda_v12.a
rocm: libggml_rocm.a
endif

clean:
	$(RM) *.a *.o template-instances/*.o

# BUGFIX: a single pattern rule listing all three target patterns tells
# make that ONE invocation produces all three objects, so building foo.v11.o
# would mark foo.v12.o/foo.rocm.o up to date too.  One rule per suffix.
%.v11.o: %.cu
	$(NVCC) $(NVCC_PREPEND_FLAGS) -c $< -o $@ $(NVCC_APPEND_FLAGS)
%.v12.o: %.cu
	$(NVCC) $(NVCC_PREPEND_FLAGS) -c $< -o $@ $(NVCC_APPEND_FLAGS)
%.rocm.o: %.cu
	$(NVCC) $(NVCC_PREPEND_FLAGS) -c $< -o $@ $(NVCC_APPEND_FLAGS)

# Second expansion defers $$(OBJECTS) until the target-specific value set
# on cuda_v11/cuda_v12/rocm is in scope for the inherited prerequisite.
.SECONDEXPANSION:
%.a: $$(OBJECTS)
	$(AR) rcs $@ $^
%.dll: $$(OBJECTS)
	$(NVCC) -shared -o $@ $^

View File

@ -1,3 +1,7 @@
package cuda
// #cgo cuda_v11 LDFLAGS: -L. -lggml_cuda_v11
// #cgo cuda_v12 LDFLAGS: -L. -lggml_cuda_v12
// #cgo cuda_v11 cuda_v12 LDFLAGS: -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublasLt
// #cgo rocm LDFLAGS: -L. -lggml_rocm -L/opt/rocm/lib -lhipblas -lamdhip64 -lrocblas
import "C"

View File

@ -5,8 +5,7 @@ package ggml
// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU
// #cgo darwin LDFLAGS: -framework Foundation
// #cgo amd64,avx CPPFLAGS: -mavx
// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma
// #cgo amd64,f16c CPPFLAGS: -mf16c
// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -mf16c
// #cgo arm64 CPPFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
import "C"
import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cpu"

View File

@ -3,5 +3,6 @@
package ggml
// #cgo CPPFLAGS: -DGGML_USE_CUDA
// #cgo rocm CPPFLAGS: -DGGML_USE_HIP
import "C"
import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cuda"

View File

@ -10,9 +10,7 @@ mkdir -p dist
# If installed to an alternate location use the following to enable
# export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
# export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
export CGO_CFLAGS=-mmacosx-version-min=11.3
export CGO_CXXFLAGS=-mmacosx-version-min=11.3
export CGO_LDFLAGS=-mmacosx-version-min=11.3
export CGO_CPPFLAGS=-mmacosx-version-min=11.3
rm -rf llama/build dist/darwin-*
echo "Building darwin arm64"