diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 7d6831eed0..243476891a 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -356,23 +356,25 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	}
 
 	// Mimic llama runner logs summarizing layers and memory
-	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", max(0, params.NumGPULayers-1)))
 	gpuLayers := 0
-	switch C.ggml_backend_dev_type(output.d) {
-	case 0: // CPU
-		slog.Info("offloading output layer to CPU")
-	case 1: // GPU
-		slog.Info("offloading output layer to GPU")
-		gpuLayers++
-	case 2: // ACCEL
-		slog.Info("offloading output layer to ACCEL")
-	}
 	for _, layer := range layers {
-		if C.ggml_backend_dev_type(layer.d) == 1 {
+		if C.ggml_backend_dev_type(layer.d) == C.GGML_BACKEND_DEVICE_TYPE_GPU {
 			gpuLayers++
 		}
 	}
+	slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))
+
+	switch C.ggml_backend_dev_type(output.d) {
+	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
+		slog.Info("offloading output layer to CPU")
+	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
+		slog.Info("offloading output layer to GPU")
+		gpuLayers++
+	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
+		slog.Info("offloading output layer to ACCEL")
+	}
 	slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
+
 	for bs := range maps.Values(bbs) {
 		slog.Info("model weights", "buffer", C.GoString(C.ggml_backend_buffer_name(bs)), "size", format.HumanBytes2(uint64(C.ggml_backend_buffer_get_size(bs))))
 	}
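
For readers skimming the patch: it does two things. First, it counts the GPU-resident repeating layers by inspecting each layer's actual device, instead of deriving the count from params.NumGPULayers, so the log reflects where layers really landed. Second, it replaces the magic device-type numbers 0/1/2 with the named C.GGML_BACKEND_DEVICE_TYPE_* constants. Below is a minimal, self-contained Go sketch of the reordered logging logic with the cgo device query stubbed out; deviceType, its constant values, and summarizeOffload are hypothetical stand-ins for illustration, not the real cgo bindings.

	package main

	import (
		"fmt"
		"log/slog"
	)

	// deviceType mirrors ggml's backend device types. The values are an
	// assumption here; the real code compares against the
	// GGML_BACKEND_DEVICE_TYPE_* constants from ggml-backend.h via cgo.
	type deviceType int

	const (
		deviceCPU deviceType = iota
		deviceGPU
		deviceAccel
	)

	// summarizeOffload reproduces the patch's reordered logging: count the
	// repeating layers on the GPU first, log that total, then classify the
	// output layer and fold it into the final offload count.
	func summarizeOffload(layers []deviceType, output deviceType) {
		gpuLayers := 0
		for _, d := range layers {
			if d == deviceGPU {
				gpuLayers++
			}
		}
		slog.Info(fmt.Sprintf("offloading %d repeating layers to GPU", gpuLayers))

		switch output {
		case deviceCPU:
			slog.Info("offloading output layer to CPU")
		case deviceGPU:
			slog.Info("offloading output layer to GPU")
			gpuLayers++
		case deviceAccel:
			slog.Info("offloading output layer to ACCEL")
		}
		slog.Info(fmt.Sprintf("offloaded %d/%d layers to GPU", gpuLayers, len(layers)+1))
	}

	func main() {
		// Example: three repeating layers on GPU, one on CPU, output on GPU.
		summarizeOffload([]deviceType{deviceGPU, deviceGPU, deviceCPU, deviceGPU}, deviceGPU)
	}

Under these assumptions, the sketch prints the same three summary lines the patched code emits: "offloading 3 repeating layers to GPU", "offloading output layer to GPU", and "offloaded 4/5 layers to GPU".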