From dcfb7a105c455ae8d44a06b3380731d8b1ffcc22 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 29 Jan 2025 15:03:38 -0800 Subject: [PATCH] next build (#8539) * add build to .dockerignore * test: only build one arch * add build to .gitignore * fix ccache path * filter amdgpu targets * only filter if autodetecting * Don't clobber gpu list for default runner This ensures the GPU specific environment variables are set properly * explicitly set CXX compiler for HIP * Update build_windows.ps1 This isn't complete, but is close. Dependencies are missing, and it only builds the "default" preset. * build: add ollama subdir * add .git to .dockerignore * docs: update development.md * update build_darwin.sh * remove unused scripts * llm: add cwd and build/lib/ollama to library paths * default DYLD_LIBRARY_PATH to LD_LIBRARY_PATH in runner on macOS * add additional cmake output vars for msvc * interim edits to make server detection logic work with dll directories like lib/ollama/cuda_v12 * remove unncessary filepath.Dir, cleanup * add hardware-specific directory to path * use absolute server path * build: linux arm * cmake install targets * remove unused files * ml: visit each library path once * build: skip cpu variants on arm * build: install cpu targets * build: fix workflow * shorter names * fix rocblas install * docs: clean up development.md * consistent build dir removal in development.md * silence -Wimplicit-function-declaration build warnings in ggml-cpu * update readme * update development readme * llm: update library lookup logic now that there is one runner (#8587) * tweak development.md * update docs * add windows cuda/rocm tests --------- Co-authored-by: jmorganca Co-authored-by: Daniel Hiltgen --- .dockerignore | 4 +- .gitattributes | 9 + .github/workflows/release.yaml | 929 ++++++------------ .github/workflows/test.yaml | 358 ++----- .gitignore | 5 +- CMakeLists.txt | 112 +++ CMakePresets.json | 110 +++ Dockerfile | 281 ++---- Makefile | 103 -- Makefile.sync | 56 ++ discover/amd_common.go | 13 +- discover/amd_linux.go | 6 +- discover/amd_windows.go | 10 +- discover/gpu.go | 88 +- discover/gpu_darwin.go | 3 - discover/path.go | 53 + discover/types.go | 3 +- docs/development.md | 205 ++-- envconfig/config.go | 9 - go.mod | 3 +- go.sum | 2 + llama/README.md | 127 +-- llama/amx.h | 34 - llama/ggml-blas.h | 51 - llama/ggml-cpu-aarch64.h | 34 - llama/ggml-cpu-traits.h | 64 -- llama/ggml-cuda/acc.cuh | 31 - llama/ggml-cuda/arange.cu | 60 -- llama/ggml-cuda/arange.cuh | 31 - llama/ggml-cuda/argmax.cuh | 29 - llama/ggml-cuda/argsort.cuh | 29 - llama/ggml-cuda/binbcast.cuh | 35 - llama/ggml-cuda/clamp.cu | 60 -- llama/ggml-cuda/clamp.cuh | 31 - llama/ggml-cuda/concat.cuh | 31 - llama/ggml-cuda/conv-transpose-1d.cuh | 31 - llama/ggml-cuda/convert.cuh | 39 - llama/ggml-cuda/count-equal.cuh | 31 - llama/ggml-cuda/cpy.cuh | 35 - llama/ggml-cuda/cross-entropy-loss.cuh | 33 - llama/ggml-cuda/diagmask.cuh | 31 - llama/ggml-cuda/fattn-tile-f16.cuh | 29 - llama/ggml-cuda/fattn-tile-f32.cuh | 29 - llama/ggml-cuda/fattn.cuh | 29 - llama/ggml-cuda/getrows.cuh | 31 - llama/ggml-cuda/im2col.cuh | 31 - llama/ggml-cuda/mmv.cuh | 38 - llama/ggml-cuda/mmvq.cuh | 35 - llama/ggml-cuda/norm.cuh | 33 - llama/ggml-cuda/opt-step-adamw.cuh | 31 - llama/ggml-cuda/out-prod.cuh | 29 - llama/ggml-cuda/pad.cuh | 32 - llama/ggml-cuda/pool2d.cuh | 31 - llama/ggml-cuda/quantize.cuh | 50 - llama/ggml-cuda/rope.cuh | 31 - llama/ggml-cuda/scale.cu | 57 -- llama/ggml-cuda/scale.cuh | 31 - llama/ggml-cuda/softmax.cuh | 31 - 
llama/ggml-cuda/sum.cuh | 31 - llama/ggml-cuda/sumrows.cu | 65 -- llama/ggml-cuda/sumrows.cuh | 31 - .../fattn-vec-f16-instance-hs128-f16-f16.cu | 31 - .../fattn-vec-f16-instance-hs128-f16-q4_0.cu | 31 - .../fattn-vec-f16-instance-hs128-f16-q4_1.cu | 31 - .../fattn-vec-f16-instance-hs128-f16-q5_0.cu | 31 - .../fattn-vec-f16-instance-hs128-f16-q5_1.cu | 31 - .../fattn-vec-f16-instance-hs128-f16-q8_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_0-f16.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_0-q4_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_0-q4_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_0-q5_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_0-q5_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_0-q8_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_1-f16.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_1-q4_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_1-q4_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_1-q5_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_1-q5_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q4_1-q8_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_0-f16.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_0-q4_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_0-q4_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_0-q5_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_0-q5_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_0-q8_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_1-f16.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_1-q4_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_1-q4_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_1-q5_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_1-q5_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q5_1-q8_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q8_0-f16.cu | 31 - .../fattn-vec-f16-instance-hs128-q8_0-q4_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q8_0-q4_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q8_0-q5_0.cu | 31 - .../fattn-vec-f16-instance-hs128-q8_0-q5_1.cu | 31 - .../fattn-vec-f16-instance-hs128-q8_0-q8_0.cu | 31 - .../fattn-vec-f16-instance-hs256-f16-f16.cu | 31 - .../fattn-vec-f16-instance-hs64-f16-f16.cu | 31 - .../fattn-vec-f16-instance-hs64-f16-q4_0.cu | 31 - .../fattn-vec-f16-instance-hs64-f16-q4_1.cu | 31 - .../fattn-vec-f16-instance-hs64-f16-q5_0.cu | 31 - .../fattn-vec-f16-instance-hs64-f16-q5_1.cu | 31 - .../fattn-vec-f16-instance-hs64-f16-q8_0.cu | 31 - .../fattn-vec-f32-instance-hs128-f16-f16.cu | 31 - .../fattn-vec-f32-instance-hs128-f16-q4_0.cu | 31 - .../fattn-vec-f32-instance-hs128-f16-q4_1.cu | 31 - .../fattn-vec-f32-instance-hs128-f16-q5_0.cu | 31 - .../fattn-vec-f32-instance-hs128-f16-q5_1.cu | 31 - .../fattn-vec-f32-instance-hs128-f16-q8_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_0-f16.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_0-q4_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_0-q4_1.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_0-q5_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_0-q5_1.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_0-q8_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_1-f16.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_1-q4_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_1-q4_1.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_1-q5_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_1-q5_1.cu | 31 - .../fattn-vec-f32-instance-hs128-q4_1-q8_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_0-f16.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_0-q4_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_0-q4_1.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_0-q5_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_0-q5_1.cu 
| 31 - .../fattn-vec-f32-instance-hs128-q5_0-q8_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_1-f16.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_1-q4_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_1-q4_1.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_1-q5_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_1-q5_1.cu | 31 - .../fattn-vec-f32-instance-hs128-q5_1-q8_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q8_0-f16.cu | 31 - .../fattn-vec-f32-instance-hs128-q8_0-q4_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q8_0-q4_1.cu | 31 - .../fattn-vec-f32-instance-hs128-q8_0-q5_0.cu | 31 - .../fattn-vec-f32-instance-hs128-q8_0-q5_1.cu | 31 - .../fattn-vec-f32-instance-hs128-q8_0-q8_0.cu | 31 - .../fattn-vec-f32-instance-hs256-f16-f16.cu | 31 - .../fattn-vec-f32-instance-hs64-f16-f16.cu | 31 - .../fattn-vec-f32-instance-hs64-f16-q4_0.cu | 31 - .../fattn-vec-f32-instance-hs64-f16-q4_1.cu | 31 - .../fattn-vec-f32-instance-hs64-f16-q5_0.cu | 31 - .../fattn-vec-f32-instance-hs64-f16-q5_1.cu | 31 - .../fattn-vec-f32-instance-hs64-f16-q8_0.cu | 31 - .../fattn-wmma-f16-instance-kqfloat-cpb16.cu | 36 - .../fattn-wmma-f16-instance-kqfloat-cpb32.cu | 35 - .../fattn-wmma-f16-instance-kqhalf-cpb16.cu | 36 - .../fattn-wmma-f16-instance-kqhalf-cpb32.cu | 36 - .../fattn-wmma-f16-instance-kqhalf-cpb8.cu | 34 - .../template-instances/mmq-instance-iq1_s.cu | 31 - .../template-instances/mmq-instance-iq2_s.cu | 31 - .../template-instances/mmq-instance-iq2_xs.cu | 31 - .../mmq-instance-iq2_xxs.cu | 31 - .../template-instances/mmq-instance-iq3_s.cu | 31 - .../mmq-instance-iq3_xxs.cu | 31 - .../template-instances/mmq-instance-iq4_nl.cu | 31 - .../template-instances/mmq-instance-iq4_xs.cu | 31 - .../template-instances/mmq-instance-q2_k.cu | 31 - .../template-instances/mmq-instance-q3_k.cu | 31 - .../template-instances/mmq-instance-q4_0.cu | 31 - .../template-instances/mmq-instance-q4_1.cu | 31 - .../template-instances/mmq-instance-q4_k.cu | 31 - .../template-instances/mmq-instance-q5_0.cu | 31 - .../template-instances/mmq-instance-q5_1.cu | 31 - .../template-instances/mmq-instance-q5_k.cu | 31 - .../template-instances/mmq-instance-q6_k.cu | 31 - .../template-instances/mmq-instance-q8_0.cu | 31 - llama/ggml-cuda/tsembd.cuh | 31 - llama/ggml-cuda/upscale.cuh | 31 - llama/ggml-cuda/vendors/cuda.h | 41 - llama/ggml-cuda/wkv6.cuh | 31 - llama/ggml-threading.cpp | 38 - llama/ggml-threading.h | 40 - llama/json-schema-to-grammar.h | 34 - llama/llama-cparams.cpp | 27 - llama/llama-cparams.h | 64 -- llama/llama-cpp.h | 56 -- llama/llama-quant.h | 27 - llama/llama.cpp/.rsync-filter | 22 + llama/llama.cpp/LICENSE | 21 + llama/{ => llama.cpp/common}/base64.hpp | 0 llama/{ => llama.cpp/common}/common.cpp | 26 - llama/llama.cpp/common/common.go | 6 + llama/{ => llama.cpp/common}/common.h | 26 - .../common}/json-schema-to-grammar.cpp | 26 - .../llama.cpp/common/json-schema-to-grammar.h | 8 + llama/{ => llama.cpp/common}/json.hpp | 0 llama/{ => llama.cpp/common}/log.cpp | 26 - llama/{ => llama.cpp/common}/log.h | 26 - llama/{ => llama.cpp/common}/sampling.cpp | 26 - llama/{ => llama.cpp/common}/sampling.h | 26 - llama/{ => llama.cpp/common}/stb_image.h | 0 llama/{ => llama.cpp/examples/llava}/clip.cpp | 26 - llama/{ => llama.cpp/examples/llava}/clip.h | 26 - .../{ => llama.cpp/examples/llava}/llava.cpp | 26 - llama/llama.cpp/examples/llava/llava.go | 6 + llama/{ => llama.cpp/examples/llava}/llava.h | 26 - llama/llama.cpp/include/llama-cpp.h | 30 + llama/{ => llama.cpp/include}/llama.h | 26 - llama/{ => llama.cpp/src}/llama-adapter.cpp | 26 
- llama/{ => llama.cpp/src}/llama-adapter.h | 26 - llama/{ => llama.cpp/src}/llama-arch.cpp | 26 - llama/{ => llama.cpp/src}/llama-arch.h | 26 - llama/{ => llama.cpp/src}/llama-batch.cpp | 26 - llama/{ => llama.cpp/src}/llama-batch.h | 26 - llama/{ => llama.cpp/src}/llama-chat.cpp | 26 - llama/{ => llama.cpp/src}/llama-chat.h | 26 - llama/{ => llama.cpp/src}/llama-context.cpp | 26 - llama/{ => llama.cpp/src}/llama-context.h | 26 - llama/llama.cpp/src/llama-cparams.cpp | 1 + llama/llama.cpp/src/llama-cparams.h | 38 + llama/{ => llama.cpp/src}/llama-grammar.cpp | 26 - llama/{ => llama.cpp/src}/llama-grammar.h | 26 - llama/{ => llama.cpp/src}/llama-hparams.cpp | 26 - llama/{ => llama.cpp/src}/llama-hparams.h | 26 - llama/{ => llama.cpp/src}/llama-impl.cpp | 26 - llama/{ => llama.cpp/src}/llama-impl.h | 26 - llama/{ => llama.cpp/src}/llama-kv-cache.cpp | 26 - llama/{ => llama.cpp/src}/llama-kv-cache.h | 26 - llama/{ => llama.cpp/src}/llama-mmap.cpp | 26 - llama/{ => llama.cpp/src}/llama-mmap.h | 26 - .../src}/llama-model-loader.cpp | 26 - .../{ => llama.cpp/src}/llama-model-loader.h | 26 - llama/{ => llama.cpp/src}/llama-model.cpp | 26 - llama/{ => llama.cpp/src}/llama-model.h | 26 - llama/{ => llama.cpp/src}/llama-quant.cpp | 26 - llama/llama.cpp/src/llama-quant.h | 1 + llama/{ => llama.cpp/src}/llama-sampling.cpp | 26 - llama/{ => llama.cpp/src}/llama-sampling.h | 26 - llama/{ => llama.cpp/src}/llama-vocab.cpp | 26 - llama/{ => llama.cpp/src}/llama-vocab.h | 26 - llama/{ => llama.cpp/src}/llama.cpp | 26 - llama/llama.cpp/src/llama.go | 8 + llama/{ => llama.cpp/src}/unicode-data.cpp | 26 - llama/llama.cpp/src/unicode-data.h | 20 + llama/{ => llama.cpp/src}/unicode.cpp | 26 - llama/{ => llama.cpp/src}/unicode.h | 26 - llama/llama.go | 74 +- llama/mmq.h | 36 - llama/patches/0001-cuda.patch | 39 +- llama/patches/0006-conditional-fattn.patch | 4 +- ...rt.patch => 0007-add-mllama-support.patch} | 0 llama/patches/0007-blas.patch | 26 - ...or.patch => 0008-add-unpad-operator.patch} | 12 +- ... 
=> 0009-fix-deepseek-deseret-regex.patch} | 0 ...tain-ordering-for-rules-for-grammar.patch} | 0 ...ing-arg-in-static-assert-on-windows.patch} | 0 .../patches/0011-relative-include-paths.patch | 51 - ...sure-KV-cache-is-fully-defragmented.patch} | 0 ...atch => 0013-re-enable-gpu-for-clip.patch} | 0 .../patches/0014-sort-devices-by-score.patch | 82 ++ ...target-ggml-cpu-for-all-cpu-variants.patch | 29 + llama/sgemm.h | 14 - llama/unicode-data.h | 46 - llm/server.go | 138 +-- macapp/forge.config.ts | 6 +- make/Makefile.cpu | 40 - make/Makefile.cuda_v11 | 13 - make/Makefile.cuda_v12 | 13 - make/Makefile.ollama | 19 - make/Makefile.rocm | 119 --- make/Makefile.sync | 250 ----- make/Makefile.test | 19 - make/common-defs.make | 91 -- make/cuda-v11-defs.make | 17 - make/cuda-v12-defs.make | 17 - make/cuda.make | 56 -- make/gpu.make | 89 -- make/rocm-defs.make | 9 - ml/backend/ggml/ggml/.rsync-filter | 22 + ml/backend/ggml/ggml/LICENSE | 21 + .../backend/ggml/ggml/include}/ggml-alloc.h | 26 - .../backend/ggml/ggml/include}/ggml-backend.h | 26 - ml/backend/ggml/ggml/include/ggml-blas.h | 25 + ml/backend/ggml/ggml/include/ggml-cann.h | 123 +++ .../backend/ggml/ggml/include}/ggml-cpp.h | 26 - .../backend/ggml/ggml/include}/ggml-cpu.h | 26 - .../backend/ggml/ggml/include}/ggml-cuda.h | 26 - ml/backend/ggml/ggml/include/ggml-kompute.h | 50 + .../backend/ggml/ggml/include}/ggml-metal.h | 26 - ml/backend/ggml/ggml/include/ggml-opencl.h | 26 + ml/backend/ggml/ggml/include/ggml-opt.h | 216 ++++ ml/backend/ggml/ggml/include/ggml-rpc.h | 28 + ml/backend/ggml/ggml/include/ggml-sycl.h | 49 + ml/backend/ggml/ggml/include/ggml-vulkan.h | 31 + .../backend/ggml/ggml/include}/ggml.h | 26 - ml/backend/ggml/ggml/src/CMakeLists.txt | 340 +++++++ .../backend/ggml/ggml/src}/ggml-alloc.c | 26 - .../ggml/ggml/src}/ggml-backend-impl.h | 26 - .../ggml/ggml/src}/ggml-backend-reg.cpp | 47 +- .../backend/ggml/ggml/src}/ggml-backend.cpp | 32 - .../ggml/ggml/src/ggml-blas/CMakeLists.txt | 87 ++ ml/backend/ggml/ggml/src/ggml-blas/blas.go | 10 + .../ggml/ggml/src/ggml-blas}/ggml-blas.cpp | 30 - .../backend/ggml/ggml/src}/ggml-common.h | 26 - .../ggml/ggml/src/ggml-cpu/CMakeLists.txt | 346 +++++++ .../ggml/ggml/src/ggml-cpu/amx}/amx.cpp | 26 - ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.h | 8 + .../ggml/ggml/src/ggml-cpu/amx/common.h | 91 ++ .../ggml/ggml/src/ggml-cpu/amx}/mmq.cpp | 26 - ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.h | 10 + .../ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp | 323 ++++++ ml/backend/ggml/ggml/src/ggml-cpu/cpu.go | 11 + .../ggml/src/ggml-cpu}/ggml-cpu-aarch64.cpp | 26 - .../ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.h | 8 + .../ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp | 55 ++ .../ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.h | 8 + .../ggml/ggml/src/ggml-cpu}/ggml-cpu-impl.h | 26 - .../ggml/ggml/src/ggml-cpu}/ggml-cpu-quants.c | 26 - .../ggml/ggml/src/ggml-cpu}/ggml-cpu-quants.h | 26 - .../ggml/src/ggml-cpu}/ggml-cpu-traits.cpp | 26 - .../ggml/ggml/src/ggml-cpu/ggml-cpu-traits.h | 38 + .../ggml/ggml/src/ggml-cpu}/ggml-cpu.c | 28 +- .../ggml/ggml/src/ggml-cpu}/ggml-cpu.cpp | 29 +- .../ggml/src/ggml-cpu/llamafile/llamafile.go | 5 + .../ggml/src/ggml-cpu/llamafile}/sgemm.cpp | 0 .../ggml/ggml/src/ggml-cpu}/llamafile/sgemm.h | 0 .../ggml/ggml/src/ggml-cuda/CMakeLists.txt | 152 +++ .../backend/ggml/ggml/src}/ggml-cuda/acc.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/acc.cuh | 5 + ml/backend/ggml/ggml/src/ggml-cuda/arange.cu | 34 + ml/backend/ggml/ggml/src/ggml-cuda/arange.cuh | 5 + .../ggml/ggml/src}/ggml-cuda/argmax.cu | 26 - 
ml/backend/ggml/ggml/src/ggml-cuda/argmax.cuh | 3 + .../ggml/ggml/src}/ggml-cuda/argsort.cu | 26 - .../ggml/ggml/src/ggml-cuda/argsort.cuh | 3 + .../ggml/ggml/src}/ggml-cuda/binbcast.cu | 26 - .../ggml/ggml/src/ggml-cuda/binbcast.cuh | 9 + ml/backend/ggml/ggml/src/ggml-cuda/clamp.cu | 34 + ml/backend/ggml/ggml/src/ggml-cuda/clamp.cuh | 5 + .../ggml/ggml/src}/ggml-cuda/common.cuh | 26 - .../ggml/ggml/src}/ggml-cuda/concat.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/concat.cuh | 5 + .../ggml/src}/ggml-cuda/conv-transpose-1d.cu | 26 - .../ggml/src/ggml-cuda/conv-transpose-1d.cuh | 5 + .../ggml/ggml/src}/ggml-cuda/convert.cu | 26 - .../ggml/ggml/src/ggml-cuda/convert.cuh | 13 + .../ggml/ggml/src}/ggml-cuda/count-equal.cu | 26 - .../ggml/ggml/src/ggml-cuda/count-equal.cuh | 5 + .../backend/ggml/ggml/src}/ggml-cuda/cpy.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/cpy.cuh | 9 + .../ggml/src}/ggml-cuda/cross-entropy-loss.cu | 26 - .../ggml/src/ggml-cuda/cross-entropy-loss.cuh | 7 + .../ggml/ggml/src}/ggml-cuda/dequantize.cuh | 26 - .../ggml/ggml/src}/ggml-cuda/diagmask.cu | 26 - .../ggml/ggml/src/ggml-cuda/diagmask.cuh | 5 + .../ggml/ggml/src}/ggml-cuda/fattn-common.cuh | 26 - .../ggml/src}/ggml-cuda/fattn-tile-f16.cu | 26 - .../ggml/src/ggml-cuda/fattn-tile-f16.cuh | 3 + .../ggml/src}/ggml-cuda/fattn-tile-f32.cu | 26 - .../ggml/src/ggml-cuda/fattn-tile-f32.cuh | 3 + .../ggml/src}/ggml-cuda/fattn-vec-f16.cuh | 26 - .../ggml/src}/ggml-cuda/fattn-vec-f32.cuh | 26 - .../ggml/src}/ggml-cuda/fattn-wmma-f16.cuh | 26 - .../backend/ggml/ggml/src}/ggml-cuda/fattn.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/fattn.cuh | 3 + .../ggml/ggml/src}/ggml-cuda/getrows.cu | 26 - .../ggml/ggml/src/ggml-cuda/getrows.cuh | 5 + .../ggml/ggml/src}/ggml-cuda/ggml-cuda.cu | 31 +- .../ggml/ggml/src}/ggml-cuda/im2col.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/im2col.cuh | 5 + .../backend/ggml/ggml/src}/ggml-cuda/mma.cuh | 26 - .../backend/ggml/ggml/src}/ggml-cuda/mmq.cu | 26 - .../backend/ggml/ggml/src}/ggml-cuda/mmq.cuh | 26 - .../backend/ggml/ggml/src}/ggml-cuda/mmv.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh | 12 + .../backend/ggml/ggml/src}/ggml-cuda/mmvq.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh | 9 + .../backend/ggml/ggml/src}/ggml-cuda/norm.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/norm.cuh | 7 + .../ggml/src}/ggml-cuda/opt-step-adamw.cu | 26 - .../ggml/src/ggml-cuda/opt-step-adamw.cuh | 5 + .../ggml/ggml/src}/ggml-cuda/out-prod.cu | 26 - .../ggml/ggml/src/ggml-cuda/out-prod.cuh | 3 + .../backend/ggml/ggml/src}/ggml-cuda/pad.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh | 6 + .../ggml/ggml/src}/ggml-cuda/pool2d.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cuh | 5 + .../ggml/ggml/src}/ggml-cuda/quantize.cu | 26 - .../ggml/ggml/src/ggml-cuda/quantize.cuh | 24 + .../backend/ggml/ggml/src}/ggml-cuda/rope.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/rope.cuh | 5 + ml/backend/ggml/ggml/src/ggml-cuda/scale.cu | 31 + ml/backend/ggml/ggml/src/ggml-cuda/scale.cuh | 5 + .../ggml/ggml/src}/ggml-cuda/softmax.cu | 26 - .../ggml/ggml/src/ggml-cuda/softmax.cuh | 5 + .../backend/ggml/ggml/src}/ggml-cuda/sum.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/sum.cuh | 5 + ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu | 39 + .../ggml/ggml/src/ggml-cuda/sumrows.cuh | 5 + .../fattn-vec-f16-instance-hs128-f16-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-f16-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-f16-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-f16-q5_0.cu | 5 + 
.../fattn-vec-f16-instance-hs128-f16-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-f16-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_0-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q4_1-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_0-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q5_1-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-f16.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs128-q8_0-q8_0.cu | 5 + .../fattn-vec-f16-instance-hs256-f16-f16.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-f16.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q4_0.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q4_1.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q5_0.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q5_1.cu | 5 + .../fattn-vec-f16-instance-hs64-f16-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-f16-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_0-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q4_1-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_0-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q5_1-q8_0.cu | 5 + 
.../fattn-vec-f32-instance-hs128-q8_0-f16.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs128-q8_0-q8_0.cu | 5 + .../fattn-vec-f32-instance-hs256-f16-f16.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-f16.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q4_0.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q4_1.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q5_0.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q5_1.cu | 5 + .../fattn-vec-f32-instance-hs64-f16-q8_0.cu | 5 + .../fattn-wmma-f16-instance-kqfloat-cpb16.cu | 10 + .../fattn-wmma-f16-instance-kqfloat-cpb32.cu | 9 + .../fattn-wmma-f16-instance-kqhalf-cpb16.cu | 10 + .../fattn-wmma-f16-instance-kqhalf-cpb32.cu | 10 + .../fattn-wmma-f16-instance-kqhalf-cpb8.cu | 8 + .../template-instances/generate_cu_files.py | 77 ++ .../template-instances/mmq-instance-iq1_s.cu | 5 + .../template-instances/mmq-instance-iq2_s.cu | 5 + .../template-instances/mmq-instance-iq2_xs.cu | 5 + .../mmq-instance-iq2_xxs.cu | 5 + .../template-instances/mmq-instance-iq3_s.cu | 5 + .../mmq-instance-iq3_xxs.cu | 5 + .../template-instances/mmq-instance-iq4_nl.cu | 5 + .../template-instances/mmq-instance-iq4_xs.cu | 5 + .../template-instances/mmq-instance-q2_k.cu | 5 + .../template-instances/mmq-instance-q3_k.cu | 5 + .../template-instances/mmq-instance-q4_0.cu | 5 + .../template-instances/mmq-instance-q4_1.cu | 5 + .../template-instances/mmq-instance-q4_k.cu | 5 + .../template-instances/mmq-instance-q5_0.cu | 5 + .../template-instances/mmq-instance-q5_1.cu | 5 + .../template-instances/mmq-instance-q5_k.cu | 5 + .../template-instances/mmq-instance-q6_k.cu | 5 + .../template-instances/mmq-instance-q8_0.cu | 5 + .../ggml/ggml/src}/ggml-cuda/tsembd.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cuh | 5 + .../backend/ggml/ggml/src}/ggml-cuda/unary.cu | 26 - .../ggml/ggml/src}/ggml-cuda/unary.cuh | 26 - .../ggml/ggml/src}/ggml-cuda/upscale.cu | 26 - .../ggml/ggml/src/ggml-cuda/upscale.cuh | 5 + .../ggml/ggml/src}/ggml-cuda/vecdotq.cuh | 26 - .../ggml/ggml/src/ggml-cuda/vendors/cuda.h | 14 + .../ggml/ggml/src}/ggml-cuda/vendors/hip.h | 26 - .../ggml/ggml/src}/ggml-cuda/vendors/musa.h | 26 - .../backend/ggml/ggml/src}/ggml-cuda/wkv6.cu | 26 - ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cuh | 5 + .../ggml/ggml/src/ggml-hip/CMakeLists.txt | 104 ++ .../backend/ggml/ggml/src}/ggml-impl.h | 26 - .../ggml/ggml/src/ggml-metal/CMakeLists.txt | 121 +++ .../src/ggml-metal}/ggml-metal-embed.metal | 79 +- .../ggml/src/ggml-metal/ggml-metal-embed.s | 2 +- .../ggml/src/ggml-metal}/ggml-metal-impl.h | 26 - .../ggml/ggml/src/ggml-metal/ggml-metal.m | 27 +- .../ggml/src/ggml-metal}/ggml-metal.metal | 26 - ml/backend/ggml/ggml/src/ggml-metal/metal.go | 9 + ml/backend/ggml/ggml/src/ggml-opt.cpp | 854 ++++++++++++++++ .../backend/ggml/ggml/src}/ggml-quants.c | 28 +- .../backend/ggml/ggml/src}/ggml-quants.h | 26 - ml/backend/ggml/ggml/src/ggml-threading.cpp | 12 + ml/backend/ggml/ggml/src/ggml-threading.h | 14 + {llama => ml/backend/ggml/ggml/src}/ggml.c | 26 - ml/backend/ggml/ggml/src/ggml.go | 81 ++ ml/backend/ggml/ggml/src/ggml_darwin_arm64.go | 10 + ml/backend/ggml/ggml_debug.go | 6 + runners/common.go | 207 ---- scripts/build.sh | 21 - scripts/build_darwin.sh | 127 ++- scripts/build_linux.sh | 4 +- scripts/build_windows.ps1 | 51 +- scripts/fast.sh | 20 - scripts/publish.sh | 25 - scripts/rh_linux_deps.sh | 78 
-- server/routes.go | 9 - 542 files changed, 5796 insertions(+), 11469 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 CMakePresets.json delete mode 100644 Makefile create mode 100644 Makefile.sync create mode 100644 discover/path.go delete mode 100644 llama/amx.h delete mode 100644 llama/ggml-blas.h delete mode 100644 llama/ggml-cpu-aarch64.h delete mode 100644 llama/ggml-cpu-traits.h delete mode 100644 llama/ggml-cuda/acc.cuh delete mode 100644 llama/ggml-cuda/arange.cu delete mode 100644 llama/ggml-cuda/arange.cuh delete mode 100644 llama/ggml-cuda/argmax.cuh delete mode 100644 llama/ggml-cuda/argsort.cuh delete mode 100644 llama/ggml-cuda/binbcast.cuh delete mode 100644 llama/ggml-cuda/clamp.cu delete mode 100644 llama/ggml-cuda/clamp.cuh delete mode 100644 llama/ggml-cuda/concat.cuh delete mode 100644 llama/ggml-cuda/conv-transpose-1d.cuh delete mode 100644 llama/ggml-cuda/convert.cuh delete mode 100644 llama/ggml-cuda/count-equal.cuh delete mode 100644 llama/ggml-cuda/cpy.cuh delete mode 100644 llama/ggml-cuda/cross-entropy-loss.cuh delete mode 100644 llama/ggml-cuda/diagmask.cuh delete mode 100644 llama/ggml-cuda/fattn-tile-f16.cuh delete mode 100644 llama/ggml-cuda/fattn-tile-f32.cuh delete mode 100644 llama/ggml-cuda/fattn.cuh delete mode 100644 llama/ggml-cuda/getrows.cuh delete mode 100644 llama/ggml-cuda/im2col.cuh delete mode 100644 llama/ggml-cuda/mmv.cuh delete mode 100644 llama/ggml-cuda/mmvq.cuh delete mode 100644 llama/ggml-cuda/norm.cuh delete mode 100644 llama/ggml-cuda/opt-step-adamw.cuh delete mode 100644 llama/ggml-cuda/out-prod.cuh delete mode 100644 llama/ggml-cuda/pad.cuh delete mode 100644 llama/ggml-cuda/pool2d.cuh delete mode 100644 llama/ggml-cuda/quantize.cuh delete mode 100644 llama/ggml-cuda/rope.cuh delete mode 100644 llama/ggml-cuda/scale.cu delete mode 100644 llama/ggml-cuda/scale.cuh delete mode 100644 llama/ggml-cuda/softmax.cuh delete mode 100644 llama/ggml-cuda/sum.cuh delete mode 100644 llama/ggml-cuda/sumrows.cu delete mode 100644 llama/ggml-cuda/sumrows.cuh delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu delete mode 100644 
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu delete mode 100644 
llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu delete mode 100644 
llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu delete mode 100644 llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu delete mode 100644 llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu delete mode 100644 llama/ggml-cuda/tsembd.cuh delete mode 100644 llama/ggml-cuda/upscale.cuh delete mode 100644 llama/ggml-cuda/vendors/cuda.h delete mode 100644 llama/ggml-cuda/wkv6.cuh delete mode 100644 llama/ggml-threading.cpp delete mode 100644 llama/ggml-threading.h delete mode 100644 llama/json-schema-to-grammar.h delete mode 100644 llama/llama-cparams.cpp delete mode 100644 llama/llama-cparams.h delete mode 100644 llama/llama-cpp.h delete mode 100644 llama/llama-quant.h create mode 100644 llama/llama.cpp/.rsync-filter create mode 100644 llama/llama.cpp/LICENSE rename llama/{ => llama.cpp/common}/base64.hpp (100%) rename llama/{ => llama.cpp/common}/common.cpp (98%) create mode 100644 llama/llama.cpp/common/common.go rename llama/{ => llama.cpp/common}/common.h (95%) rename llama/{ => llama.cpp/common}/json-schema-to-grammar.cpp (97%) create mode 100644 llama/llama.cpp/common/json-schema-to-grammar.h rename llama/{ => llama.cpp/common}/json.hpp (100%) rename llama/{ => llama.cpp/common}/log.cpp (89%) rename llama/{ => llama.cpp/common}/log.h (77%) rename llama/{ => llama.cpp/common}/sampling.cpp (93%) rename llama/{ => llama.cpp/common}/sampling.h (78%) rename llama/{ => llama.cpp/common}/stb_image.h (100%) rename llama/{ => llama.cpp/examples/llava}/clip.cpp (98%) rename llama/{ => llama.cpp/examples/llava}/clip.h (74%) rename llama/{ => llama.cpp/examples/llava}/llava.cpp (95%) create mode 100644 llama/llama.cpp/examples/llava/llava.go rename llama/{ => llama.cpp/examples/llava}/llava.h (59%) create mode 100644 llama/llama.cpp/include/llama-cpp.h rename llama/{ => llama.cpp/include}/llama.h (98%) rename llama/{ => llama.cpp/src}/llama-adapter.cpp (90%) rename llama/{ => llama.cpp/src}/llama-adapter.h (55%) rename llama/{ => llama.cpp/src}/llama-arch.cpp (98%) rename llama/{ => llama.cpp/src}/llama-arch.h (89%) rename llama/{ => llama.cpp/src}/llama-batch.cpp (91%) rename llama/{ => llama.cpp/src}/llama-batch.h (67%) rename llama/{ => llama.cpp/src}/llama-chat.cpp (95%) rename 
llama/{ => llama.cpp/src}/llama-chat.h (54%) rename llama/{ => llama.cpp/src}/llama-context.cpp (98%) rename llama/{ => llama.cpp/src}/llama-context.h (80%) create mode 100644 llama/llama.cpp/src/llama-cparams.cpp create mode 100644 llama/llama.cpp/src/llama-cparams.h rename llama/{ => llama.cpp/src}/llama-grammar.cpp (97%) rename llama/{ => llama.cpp/src}/llama-grammar.h (78%) rename llama/{ => llama.cpp/src}/llama-hparams.cpp (61%) rename llama/{ => llama.cpp/src}/llama-hparams.h (78%) rename llama/{ => llama.cpp/src}/llama-impl.cpp (82%) rename llama/{ => llama.cpp/src}/llama-impl.h (58%) rename llama/{ => llama.cpp/src}/llama-kv-cache.cpp (95%) rename llama/{ => llama.cpp/src}/llama-kv-cache.h (84%) rename llama/{ => llama.cpp/src}/llama-mmap.cpp (93%) rename llama/{ => llama.cpp/src}/llama-mmap.h (52%) rename llama/{ => llama.cpp/src}/llama-model-loader.cpp (97%) rename llama/{ => llama.cpp/src}/llama-model-loader.h (81%) rename llama/{ => llama.cpp/src}/llama-model.cpp (98%) rename llama/{ => llama.cpp/src}/llama-model.h (91%) rename llama/{ => llama.cpp/src}/llama-quant.cpp (97%) create mode 100644 llama/llama.cpp/src/llama-quant.h rename llama/{ => llama.cpp/src}/llama-sampling.cpp (98%) rename llama/{ => llama.cpp/src}/llama-sampling.h (54%) rename llama/{ => llama.cpp/src}/llama-vocab.cpp (98%) rename llama/{ => llama.cpp/src}/llama-vocab.h (84%) rename llama/{ => llama.cpp/src}/llama.cpp (99%) create mode 100644 llama/llama.cpp/src/llama.go rename llama/{ => llama.cpp/src}/unicode-data.cpp (99%) create mode 100644 llama/llama.cpp/src/unicode-data.h rename llama/{ => llama.cpp/src}/unicode.cpp (96%) rename llama/{ => llama.cpp/src}/unicode.h (63%) delete mode 100644 llama/mmq.h rename llama/patches/{0008-add-mllama-support.patch => 0007-add-mllama-support.patch} (100%) delete mode 100644 llama/patches/0007-blas.patch rename llama/patches/{0009-add-unpad-operator.patch => 0008-add-unpad-operator.patch} (97%) rename llama/patches/{0010-fix-deepseek-deseret-regex.patch => 0009-fix-deepseek-deseret-regex.patch} (100%) rename llama/patches/{0012-Maintain-ordering-for-rules-for-grammar.patch => 0010-Maintain-ordering-for-rules-for-grammar.patch} (100%) rename llama/patches/{0013-fix-missing-arg-in-static-assert-on-windows.patch => 0011-fix-missing-arg-in-static-assert-on-windows.patch} (100%) delete mode 100644 llama/patches/0011-relative-include-paths.patch rename llama/patches/{0014-llama-Ensure-KV-cache-is-fully-defragmented.patch => 0012-llama-Ensure-KV-cache-is-fully-defragmented.patch} (100%) rename llama/patches/{0015-re-enable-gpu-for-clip.patch => 0013-re-enable-gpu-for-clip.patch} (100%) create mode 100644 llama/patches/0014-sort-devices-by-score.patch create mode 100644 llama/patches/0015-add-phony-target-ggml-cpu-for-all-cpu-variants.patch delete mode 100644 llama/sgemm.h delete mode 100644 llama/unicode-data.h delete mode 100644 make/Makefile.cpu delete mode 100644 make/Makefile.cuda_v11 delete mode 100644 make/Makefile.cuda_v12 delete mode 100644 make/Makefile.ollama delete mode 100644 make/Makefile.rocm delete mode 100644 make/Makefile.sync delete mode 100644 make/Makefile.test delete mode 100644 make/common-defs.make delete mode 100644 make/cuda-v11-defs.make delete mode 100644 make/cuda-v12-defs.make delete mode 100644 make/cuda.make delete mode 100644 make/gpu.make delete mode 100644 make/rocm-defs.make create mode 100644 ml/backend/ggml/ggml/.rsync-filter create mode 100644 ml/backend/ggml/ggml/LICENSE rename {llama => ml/backend/ggml/ggml/include}/ggml-alloc.h (70%) 
rename {llama => ml/backend/ggml/ggml/include}/ggml-backend.h (94%) create mode 100644 ml/backend/ggml/ggml/include/ggml-blas.h create mode 100644 ml/backend/ggml/ggml/include/ggml-cann.h rename {llama => ml/backend/ggml/ggml/include}/ggml-cpp.h (56%) rename {llama => ml/backend/ggml/ggml/include}/ggml-cpu.h (84%) rename {llama => ml/backend/ggml/ggml/include}/ggml-cuda.h (56%) create mode 100644 ml/backend/ggml/ggml/include/ggml-kompute.h rename {llama => ml/backend/ggml/ggml/include}/ggml-metal.h (66%) create mode 100644 ml/backend/ggml/ggml/include/ggml-opencl.h create mode 100644 ml/backend/ggml/ggml/include/ggml-opt.h create mode 100644 ml/backend/ggml/ggml/include/ggml-rpc.h create mode 100644 ml/backend/ggml/ggml/include/ggml-sycl.h create mode 100644 ml/backend/ggml/ggml/include/ggml-vulkan.h rename {llama => ml/backend/ggml/ggml/include}/ggml.h (98%) create mode 100644 ml/backend/ggml/ggml/src/CMakeLists.txt rename {llama => ml/backend/ggml/ggml/src}/ggml-alloc.c (96%) rename {llama => ml/backend/ggml/ggml/src}/ggml-backend-impl.h (90%) rename {llama => ml/backend/ggml/ggml/src}/ggml-backend-reg.cpp (90%) rename {llama => ml/backend/ggml/ggml/src}/ggml-backend.cpp (98%) create mode 100644 ml/backend/ggml/ggml/src/ggml-blas/CMakeLists.txt create mode 100644 ml/backend/ggml/ggml/src/ggml-blas/blas.go rename {llama => ml/backend/ggml/ggml/src/ggml-blas}/ggml-blas.cpp (92%) rename {llama => ml/backend/ggml/ggml/src}/ggml-common.h (99%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt rename {llama => ml/backend/ggml/ggml/src/ggml-cpu/amx}/amx.cpp (86%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.h create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/amx/common.h rename {llama => ml/backend/ggml/ggml/src/ggml-cpu/amx}/mmq.cpp (98%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.h create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/cpu.go rename {llama => ml/backend/ggml/ggml/src/ggml-cpu}/ggml-cpu-aarch64.cpp (99%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.h create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.h rename {llama => ml/backend/ggml/ggml/src/ggml-cpu}/ggml-cpu-impl.h (87%) rename {llama => ml/backend/ggml/ggml/src/ggml-cpu}/ggml-cpu-quants.c (99%) rename {llama => ml/backend/ggml/ggml/src/ggml-cpu}/ggml-cpu-quants.h (80%) rename {llama => ml/backend/ggml/ggml/src/ggml-cpu}/ggml-cpu-traits.cpp (50%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-traits.h rename {llama => ml/backend/ggml/ggml/src/ggml-cpu}/ggml-cpu.c (99%) rename {llama => ml/backend/ggml/ggml/src/ggml-cpu}/ggml-cpu.cpp (94%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go rename {llama => ml/backend/ggml/ggml/src/ggml-cpu/llamafile}/sgemm.cpp (100%) rename {llama => ml/backend/ggml/ggml/src/ggml-cpu}/llamafile/sgemm.h (100%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/acc.cu (61%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/acc.cuh create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/arange.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/arange.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/argmax.cu (69%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/argmax.cuh rename {llama => 
ml/backend/ggml/ggml/src}/ggml-cuda/argsort.cu (73%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/argsort.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/binbcast.cu (91%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cuh create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/clamp.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/clamp.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/common.cuh (94%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/concat.cu (85%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/concat.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/conv-transpose-1d.cu (72%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/convert.cu (95%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/count-equal.cu (62%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/cpy.cu (94%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/cpy.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/cross-entropy-loss.cu (82%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/dequantize.cuh (68%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/diagmask.cu (58%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/fattn-common.cuh (95%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/fattn-tile-f16.cu (91%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/fattn-tile-f32.cu (91%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/fattn-vec-f16.cuh (93%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/fattn-vec-f32.cuh (92%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/fattn-wmma-f16.cuh (94%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/fattn.cu (92%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/fattn.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/getrows.cu (84%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/ggml-cuda.cu (98%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/im2col.cu (78%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/im2col.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/mma.cuh (86%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/mmq.cu (80%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/mmq.cuh (98%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/mmv.cu (89%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/mmvq.cu (93%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/norm.cu (85%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/norm.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/opt-step-adamw.cu (70%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/out-prod.cu (57%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/pad.cu (74%) create 
mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/pool2d.cu (72%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/quantize.cu (81%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/rope.cu (94%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/rope.cuh create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/scale.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/scale.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/softmax.cu (86%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/softmax.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/sum.cu (54%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/sum.cuh create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu create mode 100644 
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu create mode 100644 
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu create mode 100644 
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu create mode 100755 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/tsembd.cu (59%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/unary.cu (92%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/unary.cuh (58%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/upscale.cu (63%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/upscale.cuh rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/vecdotq.cuh (96%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/vendors/cuda.h rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/vendors/hip.h (85%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/vendors/musa.h (83%) rename {llama => ml/backend/ggml/ggml/src}/ggml-cuda/wkv6.cu (71%) create mode 100644 ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cuh create mode 100644 ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt rename {llama => ml/backend/ggml/ggml/src}/ggml-impl.h (93%) create mode 100644 
ml/backend/ggml/ggml/src/ggml-metal/CMakeLists.txt rename {llama => ml/backend/ggml/ggml/src/ggml-metal}/ggml-metal-embed.metal (99%) rename llama/ggml-metal-embed_darwin_arm64.s => ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s (87%) rename {llama => ml/backend/ggml/ggml/src/ggml-metal}/ggml-metal-impl.h (81%) rename llama/ggml-metal_darwin_arm64.m => ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m (99%) rename {llama => ml/backend/ggml/ggml/src/ggml-metal}/ggml-metal.metal (99%) create mode 100644 ml/backend/ggml/ggml/src/ggml-metal/metal.go create mode 100644 ml/backend/ggml/ggml/src/ggml-opt.cpp rename {llama => ml/backend/ggml/ggml/src}/ggml-quants.c (99%) rename {llama => ml/backend/ggml/ggml/src}/ggml-quants.h (87%) create mode 100644 ml/backend/ggml/ggml/src/ggml-threading.cpp create mode 100644 ml/backend/ggml/ggml/src/ggml-threading.h rename {llama => ml/backend/ggml/ggml/src}/ggml.c (99%) create mode 100644 ml/backend/ggml/ggml/src/ggml.go create mode 100644 ml/backend/ggml/ggml/src/ggml_darwin_arm64.go create mode 100644 ml/backend/ggml/ggml_debug.go delete mode 100644 runners/common.go delete mode 100644 scripts/build.sh delete mode 100755 scripts/fast.sh delete mode 100755 scripts/publish.sh delete mode 100644 scripts/rh_linux_deps.sh diff --git a/.dockerignore b/.dockerignore index 76704c36d..02d796fe7 100644 --- a/.dockerignore +++ b/.dockerignore @@ -3,7 +3,9 @@ ollama app macapp dist +build .env .cache test_data -llama/build +.git + diff --git a/.gitattributes b/.gitattributes index 51635caa7..4bcd95b0b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -7,5 +7,14 @@ llama/**/*.cuh linguist-vendored llama/**/*.m linguist-vendored llama/**/*.metal linguist-vendored +ml/backend/**/*.c linguist-vendored +ml/backend/**/*.h linguist-vendored +ml/backend/**/*.cpp linguist-vendored +ml/backend/**/*.hpp linguist-vendored +ml/backend/**/*.cu linguist-vendored +ml/backend/**/*.cuh linguist-vendored +ml/backend/**/*.m linguist-vendored +ml/backend/**/*.metal linguist-vendored + * text=auto *.go text eol=lf diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 50177050c..f9ab533aa 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -1,31 +1,62 @@ name: release -env: - ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe - MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe - on: push: tags: - 'v*' jobs: - # Full build of the Mac assets - build-darwin: - runs-on: macos-13 + setup-environment: + runs-on: ubuntu-latest environment: release + outputs: + GOFLAGS: ${{ steps.goflags.outputs.GOFLAGS }} steps: - uses: actions/checkout@v4 - - name: Set Version - shell: bash + - name: Set environment + id: goflags run: | - echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV - - name: key + echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${GITHUB_REF_NAME#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_OUTPUT + + darwin-build: + runs-on: macos-13 + environment: release + needs: setup-environment + strategy: + matrix: + os: [darwin] + arch: [amd64, arm64] + env: + GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + - run: | + go build -o dist/ . 
env: - MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }} - MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }} + GOOS: ${{ matrix.os }} + GOARCH: ${{ matrix.arch }} + CGO_ENABLED: 1 + CGO_CPPFLAGS: '-mmacosx-version-min=11.3' + - if: matrix.arch == 'amd64' run: | + cmake --preset CPU -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 + cmake --build --parallel --preset CPU + cmake --install build --component CPU --strip --parallel 8 + - uses: actions/upload-artifact@v4 + with: + name: build-${{ matrix.os }}-${{ matrix.arch }} + path: dist/* + + darwin-sign: + runs-on: macos-13 + environment: release + needs: darwin-build + steps: + - uses: actions/checkout@v4 + - run: | echo $MACOS_SIGNING_KEY | base64 --decode > certificate.p12 security create-keychain -p password build.keychain security default-keychain -s build.keychain @@ -33,11 +64,20 @@ jobs: security import certificate.p12 -k build.keychain -P $MACOS_SIGNING_KEY_PASSWORD -T /usr/bin/codesign security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k password build.keychain security set-keychain-settings -lut 3600 build.keychain - - uses: actions/setup-go@v5 + env: + MACOS_SIGNING_KEY: ${{ secrets.MACOS_SIGNING_KEY }} + MACOS_SIGNING_KEY_PASSWORD: ${{ secrets.MACOS_SIGNING_KEY_PASSWORD }} + - uses: actions/download-artifact@v4 with: - go-version-file: go.mod - cache: true - - name: Build Darwin + name: build-darwin-amd64 + path: dist/darwin-amd64 + - uses: actions/download-artifact@v4 + with: + name: build-darwin-arm64 + path: dist/darwin-arm64 + - run: | + export VERSION=${GITHUB_REF_NAME#v} + ./scripts/build_darwin.sh macapp sign env: APPLE_IDENTITY: ${{ secrets.APPLE_IDENTITY }} APPLE_PASSWORD: ${{ secrets.APPLE_PASSWORD }} @@ -45,684 +85,269 @@ jobs: APPLE_ID: ${{ vars.APPLE_ID }} SDKROOT: /Applications/Xcode_14.1.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk DEVELOPER_DIR: /Applications/Xcode_14.1.0.app/Contents/Developer - run: | - ./scripts/build_darwin.sh - - uses: actions/upload-artifact@v4 with: name: dist-darwin path: | dist/Ollama-darwin.zip - dist/ollama-darwin + dist/ollama-darwin.tgz - # Windows builds take a long time to both install the dependencies and build, so parallelize - # CPU generation step - generate-windows-cpu: - environment: release - runs-on: windows - env: - KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} - steps: - - uses: actions/checkout@v4 - - name: Set make jobs default - run: | - echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - name: Add msys paths - run: | - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install msys2 tools - run: | - Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait - - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - - run: | - import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments 
'-arch=x64 -no_logo' - if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - make dist - name: make - - uses: actions/upload-artifact@v4 - with: - name: generate-windows-cpu - path: | - dist/windows-amd64/** - - # ROCm generation step - generate-windows-rocm: - environment: release - runs-on: windows - env: - KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} - steps: - - uses: actions/checkout@v4 - - name: Set make jobs default - run: | - echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - name: Add msys paths - run: | - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install msys2 tools - run: | - Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait - - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - # ROCM installation steps - - name: 'Cache ROCm installer' - id: cache-rocm - uses: actions/cache@v4 - with: - path: rocm-install.exe - key: ${{ env.ROCM_WINDOWS_URL }} - - name: 'Conditionally Download ROCm' - if: steps.cache-rocm.outputs.cache-hit != 'true' - run: | - $ErrorActionPreference = "Stop" - Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe" - - name: 'Install ROCm' - run: | - Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - - name: 'Verify ROCm' - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - - name: make rocm runner - run: | - import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo' - if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - make help-runners - make dist_rocm - - uses: actions/upload-artifact@v4 - with: - name: generate-windows-rocm - path: | - dist/windows-amd64/** - - # CUDA generation step - generate-windows-cuda: - environment: release - runs-on: windows + windows-depends: strategy: matrix: - cuda: - - version: "11.3" - url: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe - - version: "12.4" - url: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe + os: [windows] + arch: [amd64] + preset: ['CPU'] + include: + - os: windows + arch: amd64 + preset: 'CUDA 11' + install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe + cuda-version: '11.3' + - os: windows + arch: amd64 + preset: 'CUDA 12' + install: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe + cuda-version: '12.4' + - os: windows + arch: amd64 + preset: 'ROCm 6' + install: 
https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe + rocm-version: '6.1' + runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }} + environment: release env: - KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} + GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }} steps: - - uses: actions/checkout@v4 - - name: Set make jobs default + - name: Install system dependencies run: | - echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - name: Install msys2 - run: | - $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe" - write-host "Downloading msys2" - Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe" - write-host "Installing msys2" - Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", "C:/msys64") -NoNewWindow -Wait - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install msys2 tools - run: | - Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait - echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: verify tools - run: | - get-command gcc - gcc --version - get-command make - make --version - - uses: actions/setup-go@v5 + choco install -y --no-progress ccache ninja + ccache -o cache_dir=${{ github.workspace }}\.ccache + - if: startsWith(matrix.preset, 'CUDA ') || startsWith(matrix.preset, 'ROCm ') + id: cache-install + uses: actions/cache/restore@v4 with: - go-version-file: go.mod - cache: true - # CUDA installation steps - - name: 'Cache CUDA installer' - id: cache-cuda - uses: actions/cache@v4 - with: - path: cuda-install.exe - key: ${{ matrix.cuda.url }} - - name: 'Conditionally Download CUDA' - if: steps.cache-cuda.outputs.cache-hit != 'true' + path: | + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA + C:\Program Files\AMD\ROCm + key: ${{ matrix.install }} + - if: startsWith(matrix.preset, 'CUDA ') + name: Install CUDA ${{ matrix.cuda-version }} run: | $ErrorActionPreference = "Stop" - Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "cuda-install.exe" - - name: 'Install CUDA' - run: | - $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ matrix.cuda.version }}"} - Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait - - name: 'Verify CUDA' - run: | - & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version - $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) - $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' + if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { + Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" + $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | Foreach-Object {"${_}_${{ matrix.cuda-version }}"} + Start-Process -FilePath .\install.exe -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait + } + + $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path echo 
"$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - - - name: make cuda runner + - if: startsWith(matrix.preset, 'ROCm') + name: Install ROCm ${{ matrix.rocm-version }} run: | - import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo' - if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - make dist_cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1') + $ErrorActionPreference = "Stop" + if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { + Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" + Start-Process -FilePath .\install.exe -ArgumentList '-install' -NoNewWindow -Wait + } + + $hipPath = (Resolve-Path "C:\Program Files\AMD\ROCm\*").path + echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append + - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }} + uses: actions/cache/save@v4 + with: + path: | + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA + C:\Program Files\AMD\ROCm + key: ${{ matrix.install }} + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: ${{ github.workspace }}\.ccache + key: ccache-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }} + - name: Build target "${{ matrix.preset }}" + run: | + Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' + Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' + cmake --preset "${{ matrix.preset }}" + cmake --build --parallel --preset "${{ matrix.preset }}" + cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8 + env: + CMAKE_GENERATOR: Ninja - uses: actions/upload-artifact@v4 with: - name: generate-windows-cuda-${{ matrix.cuda.version }} - path: | - dist/windows-amd64/** + name: depends-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.preset }} + path: dist\* - # windows arm64 generate, go build, and zip file (no installer) - # Output of this build is aggregated into the final x86 build - # for a unified windows installer - windows-arm64: - runs-on: windows-arm64 + windows-build: + strategy: + matrix: + os: [windows] + arch: [amd64, arm64] + runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }} environment: release + needs: [setup-environment] env: - KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} + GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }} steps: - # The current Windows arm64 beta image has effectively zero dev tools installed... 
- - name: Install git and gzip + - name: Install system dependencies run: | - Set-ExecutionPolicy Bypass -Scope Process -Force - [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072 - iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) - choco install -y --no-progress git gzip - echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - # pacman is buggy on win arm64, so we avoid using it, but rely on the binary artifacts - # we download the sfx (7zip bundle) which isn't fully set up, but the binaries we need to build work - - name: Install msys2 x64 - run: | - $url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-base-x86_64-20240727.sfx.exe" - write-host "Downloading MSYS2" - Invoke-WebRequest -Uri "$url" -outfile "${env:RUNNER_TEMP}\msys2.exe" - write-host "Installing msys2" - Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @( - '-y', '-oC:\' - ) -NoNewWindow -Wait - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - # since pacman isn't reliable, we just download the tar file and extract directly - - name: Downloading and extracting msys2 make tar file - run: | - $url="https://mirror.msys2.org/msys/x86_64/make-4.4.1-2-x86_64.pkg.tar.zst" - write-host "Downloading make" - Invoke-WebRequest -Uri "$url" -outfile c:\msys64\make.tar.zst - cd c:\msys64; tar -xf make.tar.zst - rm c:\msys64\make.tar.zst - - name: Verify Make works properly - run: | - echo $env:PATH - make --version - - name: Install Visual Studio 2022 - run: | - $components = @( - "Microsoft.VisualStudio.Component.CoreEditor", - "Microsoft.VisualStudio.Workload.CoreEditor", - "Microsoft.VisualStudio.Component.Roslyn.Compiler", - "Microsoft.Component.MSBuild", - "Microsoft.VisualStudio.Component.TextTemplating", - "Microsoft.VisualStudio.Component.Debugger.JustInTime", - "Microsoft.VisualStudio.Component.VC.CoreIde", - "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", - "Microsoft.VisualStudio.Component.Windows11SDK.22621", - "Microsoft.VisualStudio.Component.VC.Tools.ARM64EC", - "Microsoft.VisualStudio.Component.VC.Tools.ARM64", - "Microsoft.VisualStudio.Component.VC.ATL", - "Microsoft.VisualStudio.Component.VC.ATL.ARM64", - "Microsoft.VisualStudio.Component.Graphics", - "Microsoft.VisualStudio.Component.VC.Redist.14.Latest", - "Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core", - "Microsoft.VisualStudio.Component.Windows11Sdk.WindowsPerformanceToolkit", - "Microsoft.VisualStudio.Component.CppBuildInsights", - "Microsoft.VisualStudio.Component.VC.DiagnosticTools", - "Microsoft.VisualStudio.ComponentGroup.WebToolsExtensions.CMake", - "Microsoft.VisualStudio.Component.VC.CMake.Project", - "Microsoft.VisualStudio.Component.VC.ASAN", - "Microsoft.VisualStudio.Component.Vcpkg", - "Microsoft.VisualStudio.Workload.NativeDesktop" - ) - $config = @{ - "version" = "1.0" - "components" = $components - "extensions" = @() - } - $configPath = "${env:RUNNER_TEMP}\vsconfig" - $config | ConvertTo-Json | Out-File -FilePath $configPath - $bootstrapperFilePath = "${env:RUNNER_TEMP}\vs_community.exe" - write-host "Downloading Visual Studio 2022" - Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_community.exe" -outfile $bootstrapperFilePath - $bootstrapperArgumentList = ('/c', $bootstrapperFilePath, '--config', 
$configPath, '--quiet', '--wait' ) - write-host "Installing Visual Studio 2022" - $process = Start-Process -FilePath cmd.exe -ArgumentList $bootstrapperArgumentList -Wait -PassThru - $exitCode = $process.ExitCode - write-host $exitCode - # pacman in mingw/msys2 is ~broken on windows arm right now - hangs consistently during attempts to install - # so we'll use this alternative GCC binary - - name: Install llvm-mingw GCC - run: | - $gcc_url="https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip" - write-host "Downloading llvm-mingw" - Invoke-WebRequest -Uri "${gcc_url}" -OutFile "${env:RUNNER_TEMP}\gcc.zip" - write-host "Unpacking llvm-mingw" - expand-archive -path "${env:RUNNER_TEMP}\gcc.zip" -destinationpath "c:\" - mv c:\llvm-mingw-* c:\llvm-mingw - echo "c:\llvm-mingw\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Verify GCC - run: | - echo $env:PATH - gcc --version + $ErrorActionPreference = "Stop" + if ("${{ matrix.arch }}" -eq 'amd64') { + Start-Process "C:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait + echo "C:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + } elseif ("${{ matrix.arch }}" -eq 'arm64') { + Set-ExecutionPolicy Bypass -Scope Process -Force + [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072 + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) + echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + + choco install -y --no-progress git gzip + echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + + Invoke-WebRequest -Uri "https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip" -OutFile "${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip" + Expand-Archive -Path ${{ runner.temp }}\llvm-mingw-ucrt-aarch64.zip -DestinationPath "C:\Program Files\" + $installPath=(Resolve-Path -Path "C:\Program Files\llvm-mingw-*-ucrt-aarch64").path + echo $installPath\bin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + } - uses: actions/checkout@v4 - - name: Set Version - run: | - $ver=${env:GITHUB_REF_NAME}.trim("v") - echo VERSION=$ver | Out-File -FilePath ${env:GITHUB_ENV} -Encoding utf8 -Append - - uses: 'google-github-actions/auth@v2' - with: - project_id: 'ollama' - credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' - - run: echo "${{ vars.OLLAMA_CERT }}" | Out-File -FilePath ollama_inc.crt -Encoding utf8 - - name: install Windows SDK 8.1 to get signtool - run: | - $ErrorActionPreference = "Stop" - write-host "downloading SDK" - Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" - Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait - write-host "Win SDK 8.1 installed" - gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' - - name: install signing plugin - run: | - $ErrorActionPreference = "Stop" - write-host "downloading plugin" - Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile 
"${env:RUNNER_TEMP}\plugin.zip" - Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ - write-host "Installing plugin" - & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet - write-host "plugin installed" - uses: actions/setup-go@v5 with: go-version-file: go.mod - cache: true - - run: go get ./... - run: | - $gopath=(get-command go).source | split-path -parent - $gccpath=(get-command gcc).source | split-path -parent - import-module 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\Program Files\Microsoft Visual Studio\2022\Community' -skipautomaticlocation - $env:PATH="$gopath;$gccpath;$env:PATH" - echo $env:PATH - $env:ARCH="arm64" - .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies sign distZip - name: 'Windows Build' + go build -o dist/${{ matrix.os }}-${{ matrix.arch }}/ . + - run: | + $env:VERSION='${{ github.ref_name }}' -Replace "v(.*)", '$1' + & .\scripts\build_windows.ps1 buildApp + env: + VCToolsRedistDir: stub - uses: actions/upload-artifact@v4 with: - name: windows-arm64 + name: build-${{ matrix.os }}-${{ matrix.arch }} path: | - dist/windows-arm64/** - dist/windows-arm64-app.exe - dist/ollama-windows-arm64.zip + dist\${{ matrix.os }}-${{ matrix.arch }}\*.exe + dist\${{ matrix.os }}-${{ matrix.arch }}-app.exe - # Import the prior generation steps plus the full arm64 build, and build the final windows assets - build-windows: - environment: release + windows-sign: runs-on: windows - needs: - - generate-windows-cuda - - generate-windows-rocm - - generate-windows-cpu - - windows-arm64 - env: - KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} + environment: release + needs: [windows-depends, windows-build] steps: - uses: actions/checkout@v4 + - uses: google-github-actions/auth@v2 with: - submodules: recursive - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - uses: 'google-github-actions/auth@v2' - with: - project_id: 'ollama' - credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' - - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt - - name: install Windows SDK 8.1 to get signtool - run: | - $ErrorActionPreference = "Stop" - write-host "downloading SDK" - Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" - Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait - write-host "Win SDK 8.1 installed" - gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' - - name: install signing plugin - run: | - $ErrorActionPreference = "Stop" - write-host "downloading plugin" - Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" - Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ - write-host "Installing plugin" - & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet - write-host "plugin installed" - - name: Install msys2 - run: | - $msys2_url="https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe" - write-host "Downloading msys2" - Invoke-WebRequest -Uri "${msys2_url}" -OutFile "${env:RUNNER_TEMP}\msys2.exe" - write-host "Installing msys2" - Start-Process "${env:RUNNER_TEMP}\msys2.exe" -ArgumentList @("in", "--confirm-command", "--accept-messages", "--root", 
"C:/msys64") -NoNewWindow -Wait - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install msys2 tools - run: | - Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang", "make") -NoNewWindow -Wait - echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: verify tools - run: | - get-command gcc - gcc --version - get-command make - make --version - - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - - run: go get - - uses: actions/download-artifact@v4 - with: - name: generate-windows-cpu - path: dist/windows-amd64/ - - uses: actions/download-artifact@v4 - with: - name: generate-windows-cuda-11.3 - path: dist/windows-amd64/ - - uses: actions/download-artifact@v4 - with: - name: generate-windows-cuda-12.4 - path: dist/windows-amd64/ - - uses: actions/download-artifact@v4 - with: - name: generate-windows-rocm - path: dist/windows-amd64/ - - uses: actions/download-artifact@v4 - with: - name: windows-arm64 - path: dist + project_id: ollama + credentials_json: ${{ secrets.GOOGLE_SIGNING_CREDENTIALS }} - run: | - import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo' - $env:OLLAMA_SKIP_GENERATE="1" - $env:ARCH="amd64" - if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - & .\scripts\build_windows.ps1 + $ErrorActionPreference = "Stop" + Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${{ runner.temp }}\sdksetup.exe" + Start-Process "${{ runner.temp }}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait + + Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${{ runner.temp }}\plugin.zip" + Expand-Archive -Path "${{ runner.temp }}\plugin.zip" -DestinationPath "${{ runner.temp }}\plugin\" + & "${{ runner.temp }}\plugin\*\kmscng.msi" /quiet + + echo "${{ vars.OLLAMA_CERT }}" >ollama_inc.crt + - uses: actions/download-artifact@v4 + with: + name: build-windows-* + path: dist\ + merge-multiple: true + - uses: actions/download-artifact@v4 + with: + name: depends-windows-amd64-* + path: dist\windows-amd64\ + merge-multiple: true + - run: | + & .\scripts\build_windows.ps1 gatherDependencies sign buildInstaller distZip - uses: actions/upload-artifact@v4 with: name: dist-windows path: | - dist/OllamaSetup.exe - dist/ollama-windows-*.zip + dist\OllamaSetup.exe + dist\ollama-windows-*.zip - # Linux x86 assets built using the container based build - build-linux-amd64: - environment: release - runs-on: linux - env: - PLATFORM: linux/amd64 - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - run: | - ./scripts/build_linux.sh - - uses: actions/upload-artifact@v4 - with: - name: dist-linux-amd64 - path: | - dist/*linux* - !dist/*-cov - - # Linux ARM assets built using the container based build - # (at present, docker isn't pre-installed on arm ubunutu images) - build-linux-arm64: - environment: release - runs-on: linux-arm64 - env: - PLATFORM: linux/arm64 - steps: - - uses: 
actions/checkout@v4 - with: - submodules: recursive - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - name: 'Install Docker' - run: | - # Add Docker's official GPG key: - env - uname -a - sudo apt-get update - sudo apt-get install -y ca-certificates curl - sudo install -m 0755 -d /etc/apt/keyrings - sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc - sudo chmod a+r /etc/apt/keyrings/docker.asc - - # Add the repository to Apt sources: - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io - sudo usermod -aG docker $USER - sudo apt-get install acl - sudo setfacl --modify user:$USER:rw /var/run/docker.sock - - run: | - ./scripts/build_linux.sh - - uses: actions/upload-artifact@v4 - with: - name: dist-linux-arm64 - path: | - dist/*linux* - !dist/*-cov - - # Container image build - build-container-image: - environment: release + linux-build: strategy: matrix: - runner: - - linux - - linux-arm64 - runs-on: ${{ matrix.runner }} - env: - FINAL_IMAGE_REPO: ollama/ollama - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - name: 'Install Docker' - if: ${{ startsWith(matrix.runner, 'linux-arm64') }} - run: | - sudo apt-get update - sudo apt-get install -y ca-certificates curl - sudo install -m 0755 -d /etc/apt/keyrings - sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc - sudo chmod a+r /etc/apt/keyrings/docker.asc - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - sudo tee /etc/apt/sources.list.d/docker.list > /dev/null - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io - sudo usermod -aG docker $USER - sudo apt-get install acl - sudo setfacl --modify user:$USER:rw /var/run/docker.sock - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.FINAL_IMAGE_REPO }} - flavor: | - latest=false - tags: | - type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr - type=semver,pattern={{version}} - - name: Set Version - shell: bash - run: | - machine=$(uname -m) - case ${machine} in - x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;; - aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;; - esac >>$GITHUB_ENV - echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ vars.DOCKER_USER }} - password: ${{ secrets.DOCKER_ACCESS_TOKEN }} - - name: Build and push by digest - id: build - uses: docker/build-push-action@v6 - with: - context: "." 
- platforms: linux/${{ env.ARCH }} - build-args: | - GOFLAGS - outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true - - name: Export digest - run: | - mkdir -p /tmp/digests - digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - - name: Upload digest - uses: actions/upload-artifact@v4 - with: - name: digests-${{ env.PLATFORM_PAIR }} - path: /tmp/digests/* - if-no-files-found: error - retention-days: 1 - merge: + include: + - os: linux + arch: amd64 + targets: 'archive rocm' + - os: linux + arch: arm64 + targets: archive + runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }} environment: release - runs-on: linux - needs: - - build-container-image + needs: setup-environment env: - FINAL_IMAGE_REPO: ollama/ollama + GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }} steps: - uses: actions/checkout@v4 + - uses: docker/setup-buildx-action@v3 + - run: | + apt-get update && apt-get install pigz + for TARGET in ${{ matrix.targets }}; do docker buildx build --platform $PLATFORM --target $TARGET --output type=local,dest=dist/$PLATFORM .; done + tar c -C dist/$PLATFORM . | pigz -9cv >dist/ollama-${PLATFORM//\//-}.tgz + env: + PLATFORM: ${{ matrix.os }}/${{ matrix.arch }} + - uses: actions/upload-artifact@v4 with: - submodules: recursive - - name: Download digests - uses: actions/download-artifact@v4 - with: - path: /tmp/digests - pattern: digests-* - merge-multiple: true - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.FINAL_IMAGE_REPO }} - flavor: | - latest=false - tags: | - type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr - type=semver,pattern={{version}} - - name: Set Version - shell: bash - run: | - machine=$(uname -m) - case ${machine} in - x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;; - aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;; - esac >>$GITHUB_ENV - echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ vars.DOCKER_USER }} - password: ${{ secrets.DOCKER_ACCESS_TOKEN }} - - name: Create manifest list and push - working-directory: /tmp/digests - run: | - docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) 
| join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - $(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *) - - name: Inspect image - run: | - docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }} - build-container-image-rocm: + name: dist-${{ matrix.os }}-${{ matrix.arch }} + path: | + dist/ollama-${{ matrix.os }}-${{ matrix.arch }}.tgz + + docker-build: + strategy: + matrix: + include: + - flavor: 'latest=false' + platforms: linux/amd64,linux/arm64 + build-args: | + GOFLAGS=${{ needs.setup-environment.outputs.GOFLAGS }} + - flavor: 'latest=false,suffix=rocm' + platforms: linux/amd64 + build-args: | + GOFLAGS=${{ needs.setup-environment.outputs.GOFLAGS }} + FLAVOR=rocm + runs-on: linux environment: release - runs-on: linux - env: - FINAL_IMAGE_REPO: ollama/ollama - ARCH: amd64 - PLATFORM_PAIR: linux-amd64 + needs: setup-environment steps: - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.FINAL_IMAGE_REPO }} - flavor: | - latest=false - tags: | - type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr - type=semver,pattern={{version}} - - name: Set Version - shell: bash - run: | - echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Login to Docker Hub - uses: docker/login-action@v3 + - uses: docker/setup-qemu-action@v2 + - uses: docker/setup-buildx-action@v2 + - uses: docker/login-action@v3 with: username: ${{ vars.DOCKER_USER }} password: ${{ secrets.DOCKER_ACCESS_TOKEN }} - - name: Build and push by digest - id: build - uses: docker/build-push-action@v6 + - id: metadata + uses: docker/metadata-action@v4 with: - context: "." - target: runtime-rocm - build-args: | - GOFLAGS - tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm + flavor: ${{ matrix.flavor }} + images: | + ollama/ollama + tags: | + type=semver,pattern={{version}} + - uses: docker/build-push-action@v6 + with: + context: . push: true + platforms: ${{ matrix.platforms }} + build-args: ${{ matrix.build-args }} + tags: ${{ steps.metadata.outputs.tags }} + labels: ${{ steps.metadata.outputs.labels }} + cache-from: type=registry,ref=ollama/ollama:latest + cache-to: type=inline + provenance: false # Aggregate all the assets and ship a release release: - needs: - - build-darwin - - build-windows - - build-linux-amd64 - - build-linux-arm64 + needs: [darwin-sign, windows-sign, linux-build] runs-on: linux environment: release permissions: @@ -734,14 +359,22 @@ jobs: - name: Set Version shell: bash run: | - echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV - - name: Retrieve built artifact - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v4 with: path: dist - pattern: dist-* - merge-multiple: true + pattern: dist-darwin + - uses: actions/download-artifact@v4 + with: + path: dist + pattern: dist-windows + - uses: actions/download-artifact@v4 + with: + path: dist + pattern: dist-linux-* + - uses: actions/download-artifact@v4 + with: + path: dist + pattern: dist-windows - run: | ls -lh dist/ (cd dist; find . 
-type f | xargs sha256sum > ../sha256sum.txt) @@ -749,15 +382,17 @@ cat dist/sha256sum.txt - name: Create or update Release run: | - echo "Looking for existing release for ${{ env.RELEASE_VERSION }}" - OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${{ env.RELEASE_VERSION }}\") | .tagName") + RELEASE_VERSION="$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" + + echo "Looking for existing release for ${RELEASE_VERSION}" + OLD_TAG=$(gh release ls --json name,tagName | jq -r ".[] | select(.name == \"${RELEASE_VERSION}\") | .tagName") if [ -n "$OLD_TAG" ]; then - echo "Updating release ${{ env.RELEASE_VERSION }} to point to new tag ${GITHUB_REF_NAME}" + echo "Updating release ${RELEASE_VERSION} to point to new tag ${GITHUB_REF_NAME}" gh release edit ${OLD_TAG} --tag ${GITHUB_REF_NAME} else - echo "Creating new release ${{ env.RELEASE_VERSION }} pointing to tag ${GITHUB_REF_NAME}" + echo "Creating new release ${RELEASE_VERSION} pointing to tag ${GITHUB_REF_NAME}" gh release create ${GITHUB_REF_NAME} \ - --title ${{ env.RELEASE_VERSION }} \ + --title ${RELEASE_VERSION} \ --draft \ --generate-notes \ --prerelease diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 8dcc506ba..f8e1cadf4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,11 +1,5 @@ name: test -env: - ROCM_WINDOWS_URL: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe - MSYS2_URL: https://github.com/msys2/msys2-installer/releases/download/2024-07-27/msys2-x86_64-20240727.exe - CUDA_12_WINDOWS_URL: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe - CUDA_12_WINDOWS_VER: 12.4 - concurrency: # For PRs, later CI runs preempt previous ones. e.g. a force push on a PR # cancels running CI jobs and starts all new ones.
@@ -27,7 +21,7 @@ jobs: changes: runs-on: ubuntu-latest outputs: - RUNNERS: ${{ steps.changes.outputs.RUNNERS }} + changed: ${{ steps.changes.outputs.changed }} steps: - uses: actions/checkout@v4 with: @@ -35,309 +29,139 @@ jobs: - id: changes run: | changed() { - git diff-tree -r --no-commit-id --name-only \ - $(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) \ - ${{ github.event.pull_request.head.sha }} \ + local BASE=${{ github.event.pull_request.base.sha }} + local HEAD=${{ github.event.pull_request.head.sha }} + local MERGE_BASE=$(git merge-base $BASE $HEAD) + git diff-tree -r --no-commit-id --name-only "$MERGE_BASE" "$HEAD" \ | xargs python3 -c "import sys; from pathlib import Path; print(any(Path(x).match(glob) for x in sys.argv[1:] for glob in '$*'.split(' ')))" } - { - echo RUNNERS=$(changed 'llama/**') - } >>$GITHUB_OUTPUT + echo changed=$(changed 'llama/llama.cpp/**' 'ml/backend/ggml/ggml/**') | tee -a $GITHUB_OUTPUT - runners-linux-cuda: + linux: needs: [changes] - if: ${{ needs.changes.outputs.RUNNERS == 'True' }} + if: needs.changes.outputs.changed == 'True' strategy: matrix: - cuda-version: - - '11.8.0' + include: + - preset: CPU + - preset: CUDA + container: nvidia/cuda:11.8.0-devel-ubuntu22.04 + flags: '-DCMAKE_CUDA_ARCHITECTURES=87' + - preset: ROCm + container: rocm/dev-ubuntu-22.04:6.1.2 + extra-packages: rocm-libs + flags: '-DAMDGPU_TARGETS=gfx1010 -DCMAKE_PREFIX_PATH=/opt/rocm' runs-on: linux - container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04 + container: ${{ matrix.container }} steps: + - uses: actions/checkout@v4 - run: | - apt-get update && apt-get install -y git build-essential curl + [ -n "${{ matrix.container }}" ] || sudo=sudo + $sudo apt-get update + $sudo apt-get install -y cmake ccache ${{ matrix.extra-packages }} env: DEBIAN_FRONTEND: noninteractive - - uses: actions/checkout@v4 - - uses: actions/setup-go@v4 + - uses: actions/cache@v4 with: - go-version-file: go.mod - cache: true - - run: go get ./... + path: /github/home/.cache/ccache + key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }} - run: | - git config --global --add safe.directory /__w/ollama/ollama - cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l) - make -j $cores cuda_v11 - runners-linux-rocm: + cmake --preset ${{ matrix.preset }} ${{ matrix.flags }} + cmake --build --preset ${{ matrix.preset }} --parallel + + windows: needs: [changes] - if: ${{ needs.changes.outputs.RUNNERS == 'True' }} + if: needs.changes.outputs.changed == 'True' strategy: matrix: - rocm-version: - - '6.1.2' - runs-on: linux - container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }} - steps: - - run: | - apt-get update && apt-get install -y git build-essential curl rocm-libs - env: - DEBIAN_FRONTEND: noninteractive - - uses: actions/checkout@v4 - - uses: actions/setup-go@v4 - with: - go-version-file: go.mod - cache: true - - run: go get ./... 
- - run: | - git config --global --add safe.directory /__w/ollama/ollama - cores=$(grep '^core id' /proc/cpuinfo |sort -u|wc -l) - make -j $cores rocm - - # ROCm generation step - runners-windows-rocm: - needs: [changes] - if: ${{ needs.changes.outputs.RUNNERS == 'True' }} + include: + - preset: CPU + - preset: CUDA + install: https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe + flags: '-DCMAKE_CUDA_ARCHITECTURES=87' + - preset: ROCm + install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe + flags: '-DAMDGPU_TARGETS=gfx1010' runs-on: windows steps: - - uses: actions/checkout@v4 - - uses: actions/setup-go@v5 + - run: | + choco install -y --no-progress ccache ninja + ccache -o cache_dir=${{ github.workspace }}\.ccache + - if: matrix.preset == 'CUDA' || matrix.preset == 'ROCm' + id: cache-install + uses: actions/cache/restore@v4 with: - go-version-file: go.mod - cache: true - - name: Set make jobs default - run: | - echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - - # ROCM installation steps - - name: 'Cache ROCm installer' - id: cache-rocm - uses: actions/cache@v4 - with: - path: rocm-install.exe - key: ${{ env.ROCM_WINDOWS_URL }} - - name: 'Conditionally Download ROCm' - if: steps.cache-rocm.outputs.cache-hit != 'true' + path: | + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA + C:\Program Files\AMD\ROCm + key: ${{ matrix.install }} + - if: matrix.preset == 'CUDA' + name: Install CUDA ${{ matrix.cuda-version }} run: | $ErrorActionPreference = "Stop" - Invoke-WebRequest -Uri "${env:ROCM_WINDOWS_URL}" -OutFile "rocm-install.exe" - - name: 'Install ROCm' - run: | - Start-Process "rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - - name: 'Verify ROCm' - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path | select -first 1)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { + Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" + Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.8", "nvcc_11.8", "cublas_11.8", "cublas_dev_11.8")) -NoNewWindow -Wait + } - - name: Add msys paths - run: | - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install msys2 tools - run: | - Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait - - - name: make rocm runner - run: | - import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo' - if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - make -C llama print-HIP_PATH print-HIP_LIB_DIR - make rocm - - # CUDA generation step - runners-windows-cuda: - needs: [changes] - if: ${{ needs.changes.outputs.RUNNERS == 'True' }} - runs-on: windows - steps: - - uses: 
actions/checkout@v4 - - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - - name: Set make jobs default - run: | - echo "MAKEFLAGS=--jobs=$((Get-ComputerInfo -Property CsProcessors).CsProcessors.NumberOfCores)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - - # CUDA installation steps - - name: 'Cache CUDA installer' - id: cache-cuda - uses: actions/cache@v4 - with: - path: cuda-install.exe - key: ${{ env.CUDA_12_WINDOWS_URL }} - - name: 'Conditionally Download CUDA' - if: steps.cache-cuda.outputs.cache-hit != 'true' - run: | - $ErrorActionPreference = "Stop" - Invoke-WebRequest -Uri "${env:CUDA_12_WINDOWS_URL}" -OutFile "cuda-install.exe" - - name: 'Install CUDA' - run: | - $subpackages = @("cudart", "nvcc", "cublas", "cublas_dev") | foreach-object {"${_}_${{ env.CUDA_12_WINDOWS_VER }}"} - Start-Process "cuda-install.exe" -ArgumentList (@("-s") + $subpackages) -NoNewWindow -Wait - - name: 'Verify CUDA' - run: | - & (resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0] --version - $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) - $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' + $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path echo "$cudaPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - echo "CUDA_PATH_V${cudaVer}=$cudaPath" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append - echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + - if: matrix.preset == 'ROCm' + name: Install ROCm ${{ matrix.rocm-version }} + run: | + $ErrorActionPreference = "Stop" + if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { + Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" + Start-Process -FilePath .\install.exe -ArgumentList '-install' -NoNewWindow -Wait + } - - name: Add msys paths - run: | - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install msys2 tools - run: | - Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait - - name: make cuda runner - run: | - import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo' - if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1') - - runners-cpu: - needs: [changes] - if: ${{ needs.changes.outputs.RUNNERS == 'True' }} - strategy: - matrix: - os: [ubuntu-latest, macos-latest, windows-2019] - arch: [amd64, arm64] - exclude: - - os: ubuntu-latest - arch: arm64 - - os: windows-2019 - arch: arm64 - runs-on: ${{ matrix.os }} - env: - GOARCH: ${{ matrix.arch }} - ARCH: ${{ matrix.arch }} - CGO_ENABLED: '1' - steps: + $hipPath = (Resolve-Path "C:\Program Files\AMD\ROCm\*").path + echo "$hipPath\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo 
"CC=$hipPath\bin\clang.exe" | Out-File -FilePath $env:GITHUB_ENV -Append + echo "CXX=$hipPath\bin\clang++.exe" | Out-File -FilePath $env:GITHUB_ENV -Append + - if: ${{ !cancelled() && steps.cache-install.outputs.cache-hit != 'true' }} + uses: actions/cache/save@v4 + with: + path: | + C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA + C:\Program Files\AMD\ROCm + key: ${{ matrix.install }} - uses: actions/checkout@v4 - - uses: actions/setup-go@v5 + - uses: actions/cache@v4 with: - go-version-file: go.mod - cache: true - - name: Add msys paths - if: ${{ startsWith(matrix.os, 'windows-') }} - run: | - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install msys2 tools - if: ${{ startsWith(matrix.os, 'windows-') }} - run: | - Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait - - name: 'Build Windows Go Runners' - if: ${{ startsWith(matrix.os, 'windows-') }} - run: | - $gopath=(get-command go).source | split-path -parent - $gccpath=(get-command gcc).source | split-path -parent - import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' - Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo' - $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" - $env:PATH="$gopath;$gccpath;$env:PATH" - echo $env:PATH - if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" } - make -j 4 - - name: 'Build Unix Go Runners' - if: ${{ ! startsWith(matrix.os, 'windows-') }} - run: make -j 4 - - run: go build . 
- - lint: - strategy: - matrix: - os: [ubuntu-latest, macos-latest, windows-2019] - arch: [amd64, arm64] - exclude: - - os: ubuntu-latest - arch: arm64 - - os: windows-2019 - arch: arm64 - - os: macos-latest - arch: amd64 - runs-on: ${{ matrix.os }} - env: - GOARCH: ${{ matrix.arch }} - CGO_ENABLED: '1' - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Add msys paths - if: ${{ startsWith(matrix.os, 'windows-') }} - run: | - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install msys2 tools - if: ${{ startsWith(matrix.os, 'windows-') }} - run: | - Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait - - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: false + path: ${{ github.workspace }}\.ccache + key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }} - run: | - case ${{ matrix.arch }} in - amd64) echo ARCH=x86_64 ;; - arm64) echo ARCH=arm64 ;; - esac >>$GITHUB_ENV - shell: bash - - uses: golangci/golangci-lint-action@v6 - with: - args: --timeout 10m0s -v + Import-Module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' + Enter-VsDevShell -VsInstallPath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' + cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} + cmake --build --parallel --preset "${{ matrix.preset }}" + env: + CMAKE_GENERATOR: Ninja + test: strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-2019] - arch: [amd64] - exclude: - - os: ubuntu-latest - arch: arm64 - - os: windows-2019 - arch: arm64 + os: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.os }} env: - GOARCH: ${{ matrix.arch }} CGO_ENABLED: '1' steps: - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Add msys paths - if: ${{ startsWith(matrix.os, 'windows-') }} - run: | - echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - - name: Install msys2 tools - if: ${{ startsWith(matrix.os, 'windows-') }} - run: | - Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait - uses: actions/setup-go@v5 with: go-version-file: go.mod - cache: true - - run: | - case ${{ matrix.arch }} in - amd64) echo ARCH=amd64 ;; - arm64) echo ARCH=arm64 ;; - esac >>$GITHUB_ENV - shell: bash + - uses: golangci/golangci-lint-action@v6 + with: + args: --timeout 10m0s -v - run: go test ./... 
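The consolidated `test` job runs the linter and the unit tests on all three platforms with CGO enabled. A local equivalent is the following sketch, assuming `golangci-lint` is installed:

```
export CGO_ENABLED=1
golangci-lint run --timeout 10m0s -v
go test ./...
```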
patches: - needs: [changes] - if: ${{ needs.changes.outputs.RUNNERS == 'True' }} runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Verify patches carry all the changes + - name: Verify patches apply cleanly and do not change files run: | - make apply-patches sync && git diff --compact-summary --exit-code llama + make -f Makefile.sync clean checkout sync + git diff --compact-summary --exit-code diff --git a/.gitignore b/.gitignore index caa62a524..551abec87 100644 --- a/.gitignore +++ b/.gitignore @@ -4,12 +4,13 @@ .venv .swp dist +build ollama .cache *.exe .idea test_data *.crt -llama/build __debug_bin* -llama/vendor \ No newline at end of file +llama/build +llama/vendor diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..19d9bd8f9 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,112 @@ +cmake_minimum_required(VERSION 3.21) + +project(Ollama C CXX) + +include(CheckLanguage) + +find_package(Threads REQUIRED) + +set(CMAKE_BUILD_TYPE Release) +set(BUILD_SHARED_LIBS ON) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +set(GGML_BUILD ON) +set(GGML_SHARED ON) +set(GGML_CCACHE ON) +set(GGML_BACKEND_DL ON) +set(GGML_BACKEND_SHARED ON) +set(GGML_SCHED_MAX_COPIES 4) + +set(GGML_LLAMAFILE ON) +set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128) +set(GGML_CUDA_GRAPHS ON) + +if((NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64") + OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+")) + set(GGML_CPU_ALL_VARIANTS ON) +endif() + +set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama) +set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${OLLAMA_BUILD_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${OLLAMA_BUILD_DIR}) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx) + +set(GGML_CPU ON) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src) +set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE) + +get_target_property(CPU_VARIANTS ggml-cpu MANUALLY_ADDED_DEPENDENCIES) +if(NOT CPU_VARIANTS) + set(CPU_VARIANTS "ggml-cpu") +endif() + +install(TARGETS ggml-base ${CPU_VARIANTS} + RUNTIME_DEPENDENCIES + PRE_EXCLUDE_REGEXES ".*" + RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU + LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU + FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CPU +) + +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24" AND NOT CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES "native") + endif() + + find_package(CUDAToolkit) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda) + set(OLLAMA_CUDA_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/cuda_v${CUDAToolkit_VERSION_MAJOR}) + install(TARGETS ggml-cuda + RUNTIME_DEPENDENCIES + DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR} + PRE_INCLUDE_REGEXES cublas cublasLt cudart + PRE_EXCLUDE_REGEXES ".*" + 
RUNTIME DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA + LIBRARY DESTINATION ${OLLAMA_CUDA_INSTALL_DIR} COMPONENT CUDA + ) +endif() + +check_language(HIP) +if(CMAKE_HIP_COMPILER) + set(HIP_PLATFORM "amd") + + find_package(hip REQUIRED) + if(NOT AMDGPU_TARGETS) + list(FILTER AMDGPU_TARGETS INCLUDE REGEX "^gfx(900|94[012]|101[02]|1030|110[012])$") + endif() + + if(AMDGPU_TARGETS) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip) + set(OLLAMA_HIP_INSTALL_DIR ${OLLAMA_INSTALL_DIR}/rocm) + install(TARGETS ggml-hip + RUNTIME_DEPENDENCIES + DIRECTORIES ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR} + PRE_INCLUDE_REGEXES amdhip64 hipblas rocblas amd_comgr hsa_runtime64 rocprofiler-register drm_amdgpu drm numa + PRE_EXCLUDE_REGEXES ".*" + POST_EXCLUDE_REGEXES "system32" + RUNTIME DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP + LIBRARY DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP + ) + + foreach(HIP_LIB_BIN_INSTALL_DIR IN ITEMS ${HIP_BIN_INSTALL_DIR} ${HIP_LIB_INSTALL_DIR}) + if(EXISTS ${HIP_LIB_BIN_INSTALL_DIR}/rocblas) + install(DIRECTORY ${HIP_LIB_BIN_INSTALL_DIR}/rocblas DESTINATION ${OLLAMA_HIP_INSTALL_DIR} COMPONENT HIP) + break() + endif() + endforeach() + endif() +endif() diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 000000000..e6d3f6e7e --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,110 @@ +{ + "version": 3, + "configurePresets": [ + { + "name": "Default", + "binaryDir": "${sourceDir}/build", + "installDir": "${sourceDir}/dist", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release" + } + }, + { + "name": "CPU", + "inherits": [ "Default" ] + }, + { + "name": "CUDA", + "inherits": [ "Default" ] + }, + { + "name": "CUDA 11", + "inherits": [ "CUDA" ], + "cacheVariables": { + "CMAKE_CUDA_ARCHITECTURES": "50;52;53;60;61;62;70;72;75;80;86" + } + }, + { + "name": "CUDA 12", + "inherits": [ "CUDA" ], + "cacheVariables": { + "CMAKE_CUDA_ARCHITECTURES": "60;61;62;70;72;75;80;86;87;89;90;90a" + } + }, + { + "name": "JetPack 5", + "inherits": [ "CUDA" ], + "cacheVariables": { + "CMAKE_CUDA_ARCHITECTURES": "72;87" + } + }, + { + "name": "JetPack 6", + "inherits": [ "CUDA" ], + "cacheVariables": { + "CMAKE_CUDA_ARCHITECTURES": "87" + } + }, + { + "name": "ROCm", + "inherits": [ "Default" ], + "cacheVariables": { + "CMAKE_HIP_PLATFORM": "amd" + } + }, + { + "name": "ROCm 6", + "inherits": [ "ROCm" ], + "cacheVariables": { + "AMDGPU_TARGETS": "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" + } + } + ], + "buildPresets": [ + { + "name": "Default", + "configurePreset": "Default", + "configuration": "Release" + }, + { + "name": "CPU", + "configurePreset": "Default", + "targets": [ "ggml-cpu" ] + }, + { + "name": "CUDA", + "configurePreset": "CUDA", + "targets": [ "ggml-cuda" ] + }, + { + "name": "CUDA 11", + "inherits": [ "CUDA" ], + "configurePreset": "CUDA 11" + }, + { + "name": "CUDA 12", + "inherits": [ "CUDA" ], + "configurePreset": "CUDA 12" + }, + { + "name": "JetPack 5", + "inherits": [ "CUDA" ], + "configurePreset": "JetPack 5" + }, + { + "name": "JetPack 6", + "inherits": [ "CUDA" ], + "configurePreset": "JetPack 6" + }, + { + "name": "ROCm", + "configurePreset": "ROCm", + "targets": [ "ggml-hip" ] + }, + { + "name": "ROCm 6", + "inherits": [ "ROCm" ], + "configurePreset": "ROCm 6" + } + ] +} diff --git a/Dockerfile b/Dockerfile index 47228df61..0a8cb99fd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,201 +1,128 @@ -ARG GOLANG_VERSION=1.22.8 -ARG CUDA_VERSION_11=11.3.1 -ARG 
CUDA_VERSION_12=12.4.0 -ARG ROCM_VERSION=6.1.2 -ARG JETPACK_6=r36.2.0 -ARG JETPACK_5=r35.4.1 +# vim: filetype=dockerfile -### To create a local image for building linux binaries on mac or windows with efficient incremental builds -# -# docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile --target unified-builder-amd64 . -# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64 -# -### Then incremental builds will be much faster in this container -# -# make -j 10 dist -# -FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64 -ARG GOLANG_VERSION -ARG CUDA_VERSION_11 -ARG CUDA_VERSION_12 -COPY ./scripts/rh_linux_deps.sh / -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64 -RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh -RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \ - dnf clean all && \ - dnf install -y \ - zsh \ - cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \ - cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g") -# TODO intel oneapi goes here... -ENV GOARCH amd64 -ENV CGO_ENABLED 1 -WORKDIR /go/src/github.com/ollama/ollama/ -ENTRYPOINT [ "zsh" ] +ARG FLAVOR=${TARGETARCH} -### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds -# Note: this does not contain jetson variants -# -# docker build --platform linux/arm64 -t builder-arm64 -f Dockerfile --target unified-builder-arm64 . -# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64 -# -FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64 -ARG GOLANG_VERSION -ARG CUDA_VERSION_11 -ARG CUDA_VERSION_12 -COPY ./scripts/rh_linux_deps.sh / -RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh -RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \ - dnf config-manager --set-enabled appstream && \ - dnf clean all && \ - dnf install -y \ - zsh \ - cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \ - cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g") -ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64 -ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64 -ENV GOARCH arm64 -ENV CGO_ENABLED 1 -WORKDIR /go/src/github.com/ollama/ollama/ -ENTRYPOINT [ "zsh" ] +ARG ROCMVERSION=6.1.2 +ARG JETPACK5VERSION=r35.4.1 +ARG JETPACK6VERSION=r36.2.0 +ARG CMAKEVERSION=3.31.2 -FROM --platform=linux/amd64 unified-builder-amd64 AS build-amd64 -COPY . . 
-ARG OLLAMA_SKIP_CUDA_GENERATE -ARG OLLAMA_SKIP_ROCM_GENERATE -ARG OLLAMA_FAST_BUILD -ARG VERSION -ARG CUSTOM_CPU_FLAGS +FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCMVERSION}-complete AS base-amd64 +RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \ + && yum install -y yum-utils devtoolset-10-gcc devtoolset-10-gcc-c++ \ + && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo \ + && curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /usr/local/bin --strip-components 1 +ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:/opt/rh/devtoolset-11/root/usr/bin:$PATH + +FROM --platform=linux/arm64 rockylinux:8 AS base-arm64 +# install epel-release for ccache +RUN yum install -y yum-utils epel-release \ + && yum install -y clang ccache \ + && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo +ENV CC=clang CXX=clang++ + +FROM base-${TARGETARCH} AS base +ARG CMAKEVERSION +RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1 +COPY CMakeLists.txt CMakePresets.json . +COPY ml/backend/ggml/ggml ml/backend/ggml/ggml +ENV LDFLAGS=-s + +FROM base AS cpu +# amd64 uses gcc which requires devtoolset-11 for AVX extensions while arm64 uses clang +RUN if [ "$(uname -m)" = "x86_64" ]; then yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++; fi +ENV PATH=/opt/rh/devtoolset-11/root/usr/bin:$PATH RUN --mount=type=cache,target=/root/.ccache \ - if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \ - make -j $(nproc) dist ; \ - else \ - make -j 5 dist ; \ - fi -RUN cd dist/linux-$GOARCH && \ - tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz -RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \ - cd dist/linux-$GOARCH-rocm && \ - tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\ - fi + cmake --preset 'CPU' \ + && cmake --build --parallel --preset 'CPU' \ + && cmake --install build --component CPU --strip --parallel 8 -# Jetsons need to be built in discrete stages -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64 -ARG GOLANG_VERSION -RUN apt-get update && apt-get install -y git curl ccache && \ - curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \ - ln -s /usr/local/go/bin/go /usr/local/bin/go && \ - ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \ - apt-get clean && rm -rf /var/lib/apt/lists/* -WORKDIR /go/src/github.com/ollama/ollama/ -COPY . . 
-ARG CGO_CFLAGS -ENV GOARCH arm64 -ARG VERSION +FROM base AS cuda-11 +ARG CUDA11VERSION=11.3 +RUN yum install -y cuda-toolkit-${CUDA11VERSION//./-} +ENV PATH=/usr/local/cuda-11/bin:$PATH RUN --mount=type=cache,target=/root/.ccache \ - make -j 5 dist_cuda_v11 \ - CUDA_ARCHITECTURES="72;87" \ - GPU_RUNNER_VARIANT=_jetpack5 \ - DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \ - DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5 + cmake --preset 'CUDA 11' \ + && cmake --build --parallel --preset 'CUDA 11' \ + && cmake --install build --component CUDA --strip --parallel 8 -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS runners-jetpack6-arm64 -ARG GOLANG_VERSION -RUN apt-get update && apt-get install -y git curl ccache && \ - curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz | tar xz -C /usr/local && \ - ln -s /usr/local/go/bin/go /usr/local/bin/go && \ - ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt && \ - apt-get clean && rm -rf /var/lib/apt/lists/* -WORKDIR /go/src/github.com/ollama/ollama/ -COPY . . -ARG CGO_CFLAGS -ENV GOARCH arm64 -ARG VERSION +FROM base AS cuda-12 +ARG CUDA12VERSION=12.4 +RUN yum install -y cuda-toolkit-${CUDA12VERSION//./-} +ENV PATH=/usr/local/cuda-12/bin:$PATH RUN --mount=type=cache,target=/root/.ccache \ - make -j 5 dist_cuda_v12 \ - CUDA_ARCHITECTURES="87" \ - GPU_RUNNER_VARIANT=_jetpack6 \ - DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \ - DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6 + cmake --preset 'CUDA 12' \ + && cmake --build --parallel --preset 'CUDA 12' \ + && cmake --install build --component CUDA --strip --parallel 8 -FROM --platform=linux/arm64 unified-builder-arm64 AS build-arm64 -COPY . . -ARG OLLAMA_SKIP_CUDA_GENERATE -ARG OLLAMA_FAST_BUILD -ARG VERSION +FROM base AS rocm-6 RUN --mount=type=cache,target=/root/.ccache \ - make -j 5 dist -COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -RUN cd dist/linux-$GOARCH && \ - tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz -RUN cd dist/linux-$GOARCH-jetpack5 && \ - tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz -RUN cd dist/linux-$GOARCH-jetpack6 && \ - tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz + cmake --preset 'ROCm 6' \ + && cmake --build --parallel --preset 'ROCm 6' \ + && cmake --install build --component HIP --strip --parallel 8 -FROM --platform=linux/amd64 scratch AS dist-amd64 -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz / -FROM --platform=linux/arm64 scratch AS dist-arm64 -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz / -FROM dist-$TARGETARCH AS dist +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK5VERSION} AS jetpack-5 +ARG CMAKEVERSION +RUN apt-get update && apt-get install -y curl ccache \ + && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1 +COPY CMakeLists.txt CMakePresets.json . 
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+RUN --mount=type=cache,target=/root/.ccache \
+ cmake --preset 'JetPack 5' \
+ && cmake --build --parallel --preset 'JetPack 5' \
+ && cmake --install build --component CUDA --strip --parallel 8
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK6VERSION} AS jetpack-6
+ARG CMAKEVERSION
+RUN apt-get update && apt-get install -y curl ccache \
+ && curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKEVERSION}/cmake-${CMAKEVERSION}-linux-$(uname -m).tar.gz | tar xz -C /usr/local --strip-components 1
+COPY CMakeLists.txt CMakePresets.json .
+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
+RUN --mount=type=cache,target=/root/.ccache \
+ cmake --preset 'JetPack 6' \
+ && cmake --build --parallel --preset 'JetPack 6' \
+ && cmake --install build --component CUDA --strip --parallel 8
-# For amd64 container images, filter out cuda/rocm to minimize size
-FROM build-amd64 AS runners-cuda-amd64
-RUN rm -rf \
- ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
- ./dist/linux-amd64/lib/ollama/runners/rocm*
+FROM base AS build
+ARG GOVERSION=1.23.4
+RUN curl -fsSL https://golang.org/dl/go${GOVERSION}.linux-$(case $(uname -m) in x86_64) echo amd64 ;; aarch64) echo arm64 ;; esac).tar.gz | tar xz -C /usr/local
+ENV PATH=/usr/local/go/bin:$PATH
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS="'-ldflags=-w -s'"
+ENV CGO_ENABLED=1
+RUN --mount=type=cache,target=/root/.cache/go-build \
+ go build -trimpath -buildmode=pie -o /bin/ollama .
-FROM build-amd64 AS runners-rocm-amd64
-RUN rm -rf \
- ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
- ./dist/linux-amd64/lib/ollama/libcu*.so* \
- ./dist/linux-amd64/lib/ollama/runners/cuda*
+FROM --platform=linux/amd64 scratch AS amd64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-RUN apt-get update && \
- apt-get install -y ca-certificates && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+FROM --platform=linux/arm64 scratch AS arm64
+COPY --from=cuda-11 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_v11
+COPY --from=cuda-12 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_v12
+COPY --from=jetpack-5 dist/lib/ollama/cuda_v11 /lib/ollama/cuda_jetpack5
+COPY --from=jetpack-6 dist/lib/ollama/cuda_v12 /lib/ollama/cuda_jetpack6
-FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-RUN apt-get update && \
- apt-get install -y ca-certificates && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
-COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
+FROM --platform=linux/amd64 scratch AS rocm
+COPY --from=rocm-6 dist/lib/ollama/rocm /lib/ollama/rocm
+FROM ${FLAVOR} AS archive
+COPY --from=cpu dist/lib/ollama /lib/ollama
+COPY --from=build /bin/ollama /bin/ollama
-# ROCm libraries larger so we keep it distinct from the CPU/CUDA image
-FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
-# Frontload the rocm libraries which are large, and 
rarely change to increase chance of a common layer -# across releases -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/ -RUN apt-get update && \ - apt-get install -y ca-certificates && \ - apt-get clean && rm -rf /var/lib/apt/lists/* -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ -COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ - -EXPOSE 11434 -ENV OLLAMA_HOST 0.0.0.0 - -ENTRYPOINT ["/bin/ollama"] -CMD ["serve"] - -FROM runtime-$TARGETARCH -EXPOSE 11434 -ENV OLLAMA_HOST 0.0.0.0 +FROM ubuntu:20.04 +RUN apt-get update \ + && apt-get install -y ca-certificates \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* +COPY --from=archive /bin /usr/bin ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +COPY --from=archive /lib/ollama /usr/lib/ollama ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility ENV NVIDIA_VISIBLE_DEVICES=all - +ENV OLLAMA_HOST=0.0.0.0:11434 +EXPOSE 11434 ENTRYPOINT ["/bin/ollama"] CMD ["serve"] diff --git a/Makefile b/Makefile deleted file mode 100644 index 383354ee9..000000000 --- a/Makefile +++ /dev/null @@ -1,103 +0,0 @@ -# top level makefile for Ollama -include make/common-defs.make - - -# Determine which if any GPU runners we should build -include make/cuda-v11-defs.make -include make/cuda-v12-defs.make -include make/rocm-defs.make - -ifeq ($(CUSTOM_CPU_FLAGS),) -ifeq ($(ARCH),amd64) - RUNNER_TARGETS=cpu -endif -# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present -ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),) -ifneq ($(CUDA_11_COMPILER),) - RUNNER_TARGETS += cuda_v11 -endif -ifneq ($(CUDA_12_COMPILER),) - RUNNER_TARGETS += cuda_v12 -endif -endif -else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected -ifneq ($(CUDA_12_COMPILER),) - RUNNER_TARGETS += cuda_v12 -else ifneq ($(CUDA_11_COMPILER),) - RUNNER_TARGETS += cuda_v11 -endif -endif - -ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),) -ifneq ($(HIP_COMPILER),) - RUNNER_TARGETS += rocm -endif -endif - - -all: runners exe - -dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_exe - -dist_%: - @$(MAKE) --no-print-directory -f make/Makefile.$* dist - -runners: $(RUNNER_TARGETS) - -$(RUNNER_TARGETS): - @$(MAKE) --no-print-directory -f make/Makefile.$@ - -exe dist_exe: - @$(MAKE) --no-print-directory -f make/Makefile.ollama $@ - -help-sync apply-patches create-patches sync sync-clean: - @$(MAKE) --no-print-directory -f make/Makefile.sync $@ - -test integration lint: - @$(MAKE) --no-print-directory -f make/Makefile.test $@ - -clean: - rm -rf $(BUILD_DIR) $(DIST_LIB_DIR) $(OLLAMA_EXE) $(DIST_OLLAMA_EXE) - go clean -cache - -help: - @echo "The following make targets will help you build Ollama" - @echo "" - @echo " make all # (default target) Build Ollama llm subprocess runners, and the primary ollama executable" - @echo " make runners # Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama exectuable" - @echo " make # Build specific runners. 
Enabled: '$(RUNNER_TARGETS)'"
- @echo " make dist # Build the runners and primary ollama executable for distribution"
- @echo " make help-sync # Help information on vendor update targets"
- @echo " make help-runners # Help information on runner targets"
- @echo ""
- @echo "The following make targets will help you test Ollama"
- @echo ""
- @echo " make test # Run unit tests"
- @echo " make integration # Run integration tests. You must 'make all' first"
- @echo " make lint # Run lint and style tests"
- @echo ""
- @echo "For more information see 'docs/development.md'"
- @echo ""
-
-
-help-runners:
- @echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'"
- @echo ""
- @echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)' (Override with CUSTOM_CPU_FLAGS)"
- @echo ""
- @echo "# CUDA_PATH sets the location where CUDA toolkits are present"
- @echo "CUDA_PATH=$(CUDA_PATH)"
- @echo " CUDA_11_PATH=$(CUDA_11_PATH)"
- @echo " CUDA_11_COMPILER=$(CUDA_11_COMPILER)"
- @echo " CUDA_12_PATH=$(CUDA_12_PATH)"
- @echo " CUDA_12_COMPILER=$(CUDA_12_COMPILER)"
- @echo ""
- @echo "# HIP_PATH sets the location where the ROCm toolkit is present"
- @echo "HIP_PATH=$(HIP_PATH)"
- @echo " HIP_COMPILER=$(HIP_COMPILER)"
-
-.PHONY: all exe dist help help-sync help-runners test integration lint runners clean $(RUNNER_TARGETS)
-
-# Handy debugging for make variables
-print-%:
- @echo '$*=$($*)' diff --git a/Makefile.sync b/Makefile.sync new file mode 100644 index 000000000..3001487de --- /dev/null +++ b/Makefile.sync @@ -0,0 +1,56 @@
+UPSTREAM=https://github.com/ggerganov/llama.cpp.git
+WORKDIR=llama/vendor
+FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c
+
+.PHONY: help
+help:
+ @echo "Available targets:"
+ @echo " sync Sync with upstream repositories"
+ @echo " checkout Checkout upstream repository"
+ @echo " apply-patches Apply patches to local repository"
+ @echo " format-patches Format patches from local repository"
+ @echo " clean Clean local repository"
+ @echo
+ @echo "Example:"
+ @echo " make -f $(lastword $(MAKEFILE_LIST)) clean sync"
+
+.PHONY: sync
+sync: llama/llama.cpp ml/backend/ggml/ggml apply-patches
+
+.PHONY: llama/llama.cpp
+llama/llama.cpp: llama/vendor/ apply-patches
+ rsync -arvzc -f "merge $@/.rsync-filter" $< $@
+
+.PHONY: ml/backend/ggml/ggml
+ml/backend/ggml/ggml: llama/vendor/ggml/ apply-patches
+ rsync -arvzc -f "merge $@/.rsync-filter" $< $@
+
+PATCHES=$(wildcard llama/patches/*.patch)
+
+.PHONY: apply-patches
+.NOTPARALLEL:
+apply-patches: $(addsuffix ed, $(PATCHES))
+
+%.patched: %.patch
+ @if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
+
+.PHONY: checkout
+checkout: $(WORKDIR)
+ git -C $(WORKDIR) fetch
+ git -C $(WORKDIR) checkout -f $(FETCH_HEAD)
+
+$(WORKDIR):
+ git clone $(UPSTREAM) $(WORKDIR)
+
+.PHONY: format-patches
+format-patches: llama/patches
+ git -C $(WORKDIR) format-patch \
+ --no-signature \
+ --no-numbered \
+ --zero-commit \
+ -o $(realpath $<) \
+ $(FETCH_HEAD)
+
+.PHONY: clean
+clean: checkout
+ $(RM) $(addsuffix ed, $(PATCHES)) diff --git a/discover/amd_common.go b/discover/amd_common.go index 3c6308610..08834b22d 100644 --- a/discover/amd_common.go +++ b/discover/amd_common.go @@ -9,8 +9,6 @@ import ( "path/filepath" "runtime" "strings" - - "github.com/ollama/ollama/envconfig" ) // Determine if the given ROCm lib directory is usable by checking for existence of some glob patterns @@ -41,13 +39,10 @@ func 
commonAMDValidateLibDir() (string, error) { // Favor our bundled version // Installer payload location if we're running the installed binary - exe, err := os.Executable() - if err == nil { - rocmTargetDir := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama") - if rocmLibUsable(rocmTargetDir) { - slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) - return rocmTargetDir, nil - } + rocmTargetDir := filepath.Join(LibOllamaPath, "rocm") + if rocmLibUsable(rocmTargetDir) { + slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) + return rocmTargetDir, nil } // Prefer explicit HIP env var diff --git a/discover/amd_linux.go b/discover/amd_linux.go index ecf91056d..830fa1df6 100644 --- a/discover/amd_linux.go +++ b/discover/amd_linux.go @@ -77,8 +77,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { gfxOverride := envconfig.HsaOverrideGfxVersion() var supported []string - depPaths := LibraryDirs() - libDir := "" + var libDir string // The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract // from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU) @@ -353,9 +352,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { }) return nil, err } - depPaths = append(depPaths, libDir) } - gpuInfo.DependencyPath = depPaths + gpuInfo.DependencyPath = []string{libDir} if gfxOverride == "" { // Only load supported list once diff --git a/discover/amd_windows.go b/discover/amd_windows.go index 9477bedc7..0659d12f8 100644 --- a/discover/amd_windows.go +++ b/discover/amd_windows.go @@ -5,7 +5,6 @@ import ( "errors" "fmt" "log/slog" - "os" "path/filepath" "slices" "strconv" @@ -50,14 +49,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { slog.Info(err.Error()) return nil, err } - depPaths := LibraryDirs() + libDir, err := AMDValidateLibDir() if err != nil { err = fmt.Errorf("unable to verify rocm library: %w", err) slog.Warn(err.Error()) return nil, err } - depPaths = append(depPaths, libDir) var supported []string gfxOverride := envconfig.HsaOverrideGfxVersion() @@ -113,7 +111,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) { UnreliableFreeMemory: true, ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices - DependencyPath: depPaths, + DependencyPath: []string{libDir}, MinimumMemory: rocmMinimumMemory, Name: name, Compute: gfx, @@ -164,9 +162,7 @@ func AMDValidateLibDir() (string, error) { } // Installer payload (if we're running from some other location) - localAppData := os.Getenv("LOCALAPPDATA") - appDir := filepath.Join(localAppData, "Programs", "Ollama") - rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama") + rocmTargetDir := filepath.Join(LibOllamaPath, "rocm") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) return rocmTargetDir, nil diff --git a/discover/gpu.go b/discover/gpu.go index 2e83b74f5..ba906a184 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -23,7 +23,6 @@ import ( "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" - "github.com/ollama/ollama/runners" ) type cudaHandles struct { @@ -101,15 +100,7 @@ func initCudaHandles() *cudaHandles { // Aligned with driver, we can't carry as payloads nvcudaMgmtPatterns := NvcudaGlobs - - if runtime.GOOS == "windows" { - localAppData := os.Getenv("LOCALAPPDATA") - cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)} - } - libDirs := 
LibraryDirs() - for _, d := range libDirs { - cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(d, CudartMgmtName)) - } + cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(LibOllamaPath, "cuda_v*", CudartMgmtName)) cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...) if len(NvmlGlobs) > 0 { @@ -240,7 +231,7 @@ func GetGPUInfo() GpuInfoList { if err != nil { slog.Warn("error looking up system memory", "error", err) } - depPaths := LibraryDirs() + details, err := GetCPUDetails() if err != nil { slog.Warn("failed to lookup CPU details", "error", err) @@ -248,11 +239,9 @@ func GetGPUInfo() GpuInfoList { cpus = []CPUInfo{ { GpuInfo: GpuInfo{ - memInfo: mem, - Library: "cpu", - Variant: runners.GetCPUCapability().String(), - ID: "0", - DependencyPath: depPaths, + memInfo: mem, + Library: "cpu", + ID: "0", }, CPUs: details, }, @@ -294,17 +283,13 @@ func GetGPUInfo() GpuInfoList { gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor variant := cudaVariant(gpuInfo) - if depPaths != nil { - gpuInfo.DependencyPath = depPaths - // Check for variant specific directory - if variant != "" { - for _, d := range depPaths { - if _, err := os.Stat(filepath.Join(d, "cuda_"+variant)); err == nil { - // Put the variant directory first in the search path to avoid runtime linking to the wrong library - gpuInfo.DependencyPath = append([]string{filepath.Join(d, "cuda_"+variant)}, gpuInfo.DependencyPath...) - break - } - } + + // Start with our bundled libraries + if variant != "" { + variantPath := filepath.Join(LibOllamaPath, "cuda_"+variant) + if _, err := os.Stat(variantPath); err == nil { + // Put the variant directory first in the search path to avoid runtime linking to the wrong library + gpuInfo.DependencyPath = append([]string{variantPath}, gpuInfo.DependencyPath...) 
} } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) @@ -376,7 +361,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - gpuInfo.DependencyPath = depPaths + gpuInfo.DependencyPath = []string{LibOllamaPath} oneapiGPUs = append(oneapiGPUs, gpuInfo) } } @@ -512,33 +497,30 @@ func GetGPUInfo() GpuInfoList { func FindGPULibs(baseLibName string, defaultPatterns []string) []string { // Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them - var ldPaths []string gpuLibPaths := []string{} slog.Debug("Searching for GPU library", "name", baseLibName) - // Start with our bundled libraries - patterns := []string{} - for _, d := range LibraryDirs() { - patterns = append(patterns, filepath.Join(d, baseLibName)) - } + // search our bundled libraries first + patterns := []string{filepath.Join(LibOllamaPath, baseLibName)} + var ldPaths []string switch runtime.GOOS { case "windows": - ldPaths = strings.Split(os.Getenv("PATH"), ";") + ldPaths = strings.Split(os.Getenv("PATH"), string(os.PathListSeparator)) case "linux": - ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":") - default: - return gpuLibPaths + ldPaths = strings.Split(os.Getenv("LD_LIBRARY_PATH"), string(os.PathListSeparator)) } - // Then with whatever we find in the PATH/LD_LIBRARY_PATH - for _, ldPath := range ldPaths { - d, err := filepath.Abs(ldPath) + // then search the system's LD_LIBRARY_PATH + for _, p := range ldPaths { + p, err := filepath.Abs(p) if err != nil { continue } - patterns = append(patterns, filepath.Join(d, baseLibName)) + patterns = append(patterns, filepath.Join(p, baseLibName)) } + + // finally, search the default patterns provided by the caller patterns = append(patterns, defaultPatterns...) slog.Debug("gpu library search", "globs", patterns) for _, pattern := range patterns { @@ -715,28 +697,6 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { } } -func LibraryDirs() []string { - // dependencies can exist wherever we found the runners (e.g. 
build tree for developers) and relative to the executable
- // This can be simplified once we no longer carry runners as payloads
- paths := []string{}
- appExe, err := os.Executable()
- if err != nil {
- slog.Warn("failed to lookup executable path", "error", err)
- } else {
- appRelative := filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe(), "lib", "ollama")
- if _, err := os.Stat(appRelative); err == nil {
- paths = append(paths, appRelative)
- }
- }
- rDir := runners.Locate()
- if err != nil {
- slog.Warn("unable to locate gpu dependency libraries", "error", err)
- } else {
- paths = append(paths, filepath.Dir(rDir))
- }
- return paths
-}
- func GetSystemInfo() SystemInfo { gpus := GetGPUInfo() gpuMutex.Lock() diff --git a/discover/gpu_darwin.go b/discover/gpu_darwin.go index 15f8f7996..dd5bf6e27 100644 --- a/discover/gpu_darwin.go +++ b/discover/gpu_darwin.go @@ -15,7 +15,6 @@ import ( "syscall" "github.com/ollama/ollama/format" - "github.com/ollama/ollama/runners" ) const ( @@ -28,7 +27,6 @@ func GetGPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: runners.GetCPUCapability().String(), memInfo: mem, }, } @@ -51,7 +49,6 @@ func GetCPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: runners.GetCPUCapability().String(), memInfo: mem, }, } diff --git a/discover/path.go b/discover/path.go new file mode 100644 index 000000000..a9a6518df --- /dev/null +++ b/discover/path.go @@ -0,0 +1,53 @@
+package discover
+
+import (
+ "os"
+ "path/filepath"
+ "runtime"
+)
+
+// LibOllamaPath is the path to look up dynamic libraries
+// in development it's usually 'build/lib/ollama'
+// in distribution builds it's 'lib/ollama' on Windows,
+// '../lib/ollama' on Linux and the executable's directory on macOS
+// note: in distribution builds, additional GPU-specific libraries are
+// found in subdirectories of the returned path, such as
+// 'cuda_v11', 'cuda_v12', 'rocm', etc. 
+var LibOllamaPath string = func() string {
+ exe, err := os.Executable()
+ if err != nil {
+ return ""
+ }
+
+ exe, err = filepath.EvalSymlinks(exe)
+ if err != nil {
+ return ""
+ }
+
+ libPath := filepath.Dir(exe)
+ switch runtime.GOOS {
+ case "windows":
+ libPath = filepath.Join(filepath.Dir(exe), "lib", "ollama")
+ case "linux":
+ libPath = filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
+ }
+
+ cwd, err := os.Getwd()
+ if err != nil {
+ return ""
+ }
+
+ // build paths for development
+ buildPaths := []string{
+ filepath.Join(filepath.Dir(exe), "build", "lib", "ollama"),
+ filepath.Join(cwd, "build", "lib", "ollama"),
+ }
+
+ for _, p := range buildPaths {
+ if _, err := os.Stat(p); err == nil {
+ return p
+ }
+ }
+
+ return libPath
+}() diff --git a/discover/types.go b/discover/types.go index 4568e3b85..c5212d94e 100644 --- a/discover/types.go +++ b/discover/types.go @@ -5,7 +5,6 @@ import ( "log/slog" "github.com/ollama/ollama/format" - "github.com/ollama/ollama/runners" ) type memInfo struct { @@ -107,7 +106,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { for _, info := range l { found := false requested := info.Library - if info.Variant != runners.CPUCapabilityNone.String() { + if info.Variant != "" { requested += "_" + info.Variant } for i, lib := range libs { diff --git a/docs/development.md b/docs/development.md index e194dca0b..3e2ed49ba 100644 --- a/docs/development.md +++ b/docs/development.md @@ -1,165 +1,120 @@ # Development
-Install required tools:
+Install prerequisites:
-
-- go version 1.22 or higher
-- OS specific C/C++ compiler (see below)
-- GNU Make
+- [Go](https://go.dev/doc/install)
+- A C/C++ compiler, e.g. Clang on macOS, [TDM-GCC](https://jmeubank.github.io/tdm-gcc/download/) (Windows amd64) or [llvm-mingw](https://github.com/mstorsjo/llvm-mingw) (Windows arm64), or GCC/Clang on Linux.
-
-## Overview
-
-Ollama uses a mix of Go and C/C++ code to interface with GPUs. The C/C++ code is compiled with both CGO and GPU library specific compilers. A set of GNU Makefiles are used to compile the project. GPU Libraries are auto-detected based on the typical environment variables used by the respective libraries, but can be overridden if necessary. The default make target will build the runners and primary Go Ollama application that will run within the repo directory. Throughout the examples below `-j 5` is suggested for 5 parallel jobs to speed up the build. You can adjust the job count based on your CPU Core count to reduce build times. If you want to relocate the built binaries, use the `dist` target and recursively copy the files in `./dist/$OS-$ARCH/` to your desired location. To learn more about the other make targets use `make help`
-
-Once you have built the GPU/CPU runners, you can compile the main application with `go build .`
-
-### MacOS
-
-[Download Go](https://go.dev/dl/)
-
-```bash
-make -j 5
-```
-
-Now you can run `ollama`:
-
-```bash
-./ollama
-```
-
-#### Xcode 15 warnings
-
-If you are using Xcode newer than version 14, you may see a warning during `go build` about `ld: warning: ignoring duplicate libraries: '-lobjc'` due to Golang issue https://github.com/golang/go/issues/67799 which can be safely ignored. You can suppress the warning with `export CGO_LDFLAGS="-Wl,-no_warn_duplicate_libraries"`
-
-### Linux
-
-#### Linux CUDA (NVIDIA)
-
-_Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. 
Please consult distro-specific docs for dependencies if available!_
-
-Install `make`, `gcc` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
-development and runtime packages.
-
-Typically the makefile will auto-detect CUDA, however, if your Linux distro
-or installation approach uses alternative paths, you can specify the location by
-overriding `CUDA_PATH` to the location of the CUDA toolkit. You can customize
-a set of target CUDA architectures by setting `CUDA_ARCHITECTURES` (e.g. `CUDA_ARCHITECTURES=50;60;70`)
+Then build and run Ollama from the root directory of the repository: ``` -make -j 5 +go run . serve ```
-If both v11 and v12 tookkits are detected, runners for both major versions will be built by default. You can build just v12 with `make cuda_v12`
+## macOS (Apple Silicon)
-#### Older Linux CUDA (NVIDIA)
+macOS on Apple Silicon supports Metal, which is built into the Ollama binary. No additional steps are required.
-To support older GPUs with Compute Capability 3.5 or 3.7, you will need to use an older version of the Driver from [Unix Driver Archive](https://www.nvidia.com/en-us/drivers/unix/) (tested with 470) and [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (tested with cuda V11). When you build Ollama, you will need to set two make variable to adjust the minimum compute capability Ollama supports via `make -j 5 CUDA_ARCHITECTURES="35;37;50;52" EXTRA_GOLDFLAGS="\"-X=github.com/ollama/ollama/discover.CudaComputeMajorMin=3\" \"-X=github.com/ollama/ollama/discover.CudaComputeMinorMin=5\""`. To find the Compute Capability of your older GPU, refer to [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
+## macOS (Intel)
-#### Linux ROCm (AMD)
+Install prerequisites:
-_Your operating system distribution may already have packages for AMD ROCm. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
+- [CMake](https://cmake.org/download/) or `brew install cmake`
-Install [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
-
-Typically the build scripts will auto-detect ROCm, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `HIP_PATH` to the location of the ROCm
-install (typically `/opt/rocm`). You can also customize
-the AMD GPU targets by setting HIP_ARCHS (e.g. `HIP_ARCHS=gfx1101;gfx1102`)
+Then, configure and build the project: ``` -make -j 5 +cmake -B build +cmake --build build ```
-ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
-
-#### Containerized Linux Build
-
-If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist` and by default the script builds both arm64 and amd64 binaries. If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh`
-
-### Windows
-
-The following tools are required as a minimal development environment to build CPU inference support.
-
-- Go version 1.22 or higher
- - https://go.dev/dl/
-- Git
- - https://git-scm.com/download/win
-- clang with gcc compat and Make. There are multiple options on how to go about installing these tools on Windows. 
We have verified the following, but others may work as well: - - [MSYS2](https://www.msys2.org/) - - After installing, from an MSYS2 terminal, run `pacman -S mingw-w64-clang-x86_64-gcc-compat mingw-w64-clang-x86_64-clang make` to install the required tools - - Assuming you used the default install prefix for msys2 above, add `C:\msys64\clang64\bin` and `c:\msys64\usr\bin` to your environment variable `PATH` where you will perform the build steps below (e.g. system-wide, account-level, powershell, cmd, etc.) - -> [!NOTE] -> Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows. +Lastly, run Ollama: ``` -make -j 5 +go run . serve ``` -#### GPU Support +## Windows -The GPU tools require the Microsoft native build tools. To build either CUDA or ROCm, you must first install MSVC via Visual Studio: +Install prerequisites: -- Make sure to select `Desktop development with C++` as a Workload during the Visual Studio install -- You must complete the Visual Studio install and run it once **BEFORE** installing CUDA or ROCm for the tools to properly register -- Add the location of the **64 bit (x64)** compiler (`cl.exe`) to your `PATH` -- Note: the default Developer Shell may configure the 32 bit (x86) compiler which will lead to build failures. Ollama requires a 64 bit toolchain. +- [CMake](https://cmake.org/download/) +- [Visual Studio 2022](https://visualstudio.microsoft.com/downloads/) including the Native Desktop Workload +- (Optional) AMD GPU support + - [ROCm](https://rocm.github.io/install.html) + - [Ninja](https://github.com/ninja-build/ninja/releases) +- (Optional) NVIDIA GPU support + - [CUDA SDK](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_network) -#### Windows CUDA (NVIDIA) +> [!IMPORTANT] +> Ensure prerequisites are in `PATH` before running CMake. -In addition to the common Windows development tools and MSVC described above: +> [!IMPORTANT] +> ROCm is not compatible with Visual Studio CMake generators. Use `-GNinja` when configuring the project. -- [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) +> [!IMPORTANT] +> CUDA is only compatible with Visual Studio CMake generators. -#### Windows ROCm (AMD Radeon) - -In addition to the common Windows development tools and MSVC described above: - -- [AMD HIP](https://www.amd.com/en/developer/resources/rocm-hub/hip-sdk.html) - -#### Windows arm64 - -The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want. To ensure you get an arm64 development environment, start a plain PowerShell terminal and run: - -```powershell -import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll' -Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation -``` - -You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH` - -Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment. Ollama requires gcc and mingw32-make to compile, which is not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. 
At a minimum you will need to install the following: +Then, configure and build the project: ``` -pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make +cmake -B build +cmake --build build --config Release ``` -You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`) - - -## Advanced CPU Vector Settings - -On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load. If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled. This provides a good performance balance when loading large models that split across GPU and CPU with broad compatibility. Some users may prefer no vector extensions (e.g. older Xeon/Celeron processors, or hypervisors that mask the vector features) while other users may prefer turning on many more vector extensions to further improve performance for split model loads. - -To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, use CUSTOM_CPU_FLAGS during the build. - -To build without any vector flags: +Lastly, run Ollama: ``` -make CUSTOM_CPU_FLAGS="" +go run . serve ``` -To build with both AVX and AVX2: -``` -make CUSTOM_CPU_FLAGS=avx,avx2 -``` +## Windows (ARM) -To build with AVX512 features turned on: +Windows ARM does not support additional acceleration libraries at this time. + +## Linux + +Install prerequisites: + +- [CMake](https://cmake.org/download/) or `sudo apt install cmake` or `sudo dnf install cmake` +- (Optional) AMD GPU support + - [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html) +- (Optional) NVIDIA GPU support + - [CUDA SDK](https://developer.nvidia.com/cuda-downloads) + +> [!IMPORTANT] +> Ensure prerequisites are in `PATH` before running CMake. + + +Then, configure and build the project: ``` -make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16 +cmake -B build +cmake --build build ``` -> [!NOTE] -> If you are experimenting with different flags, make sure to do a `make clean` between each change to ensure everything is rebuilt with the new compiler flags +Lastly, run Ollama: + +``` +go run . serve +``` + +## Docker + +``` +docker build . +``` + +### ROCm + +``` +docker build --build-arg FLAVOR=rocm . +``` + +## Running tests + +To run tests, use `go test`: + +``` +go test ./... +``` diff --git a/envconfig/config.go b/envconfig/config.go index c10095a64..0ca3b64cd 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -288,12 +288,3 @@ func Values() map[string]string { func Var(key string) string { return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'") } - -// On windows, we keep the binary at the top directory, but -// other platforms use a "bin" directory, so this returns ".." -func LibRelativeToExe() string { - if runtime.GOOS == "windows" { - return "." - } - return ".." 
-} diff --git a/go.mod b/go.mod index 1a1fdb408..1c99c0946 100644 --- a/go.mod +++ b/go.mod @@ -17,12 +17,14 @@ require ( require ( github.com/agnivade/levenshtein v1.1.1 github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 + github.com/dlclark/regexp2 v1.11.4 github.com/emirpasic/gods/v2 v2.0.0-alpha github.com/google/go-cmp v0.6.0 github.com/mattn/go-runewidth v0.0.14 github.com/nlpodyssey/gopickle v0.3.0 github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c golang.org/x/image v0.22.0 + gonum.org/v1/gonum v0.15.0 ) require ( @@ -42,7 +44,6 @@ require ( github.com/xtgo/set v1.0.0 // indirect go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect - gonum.org/v1/gonum v0.15.0 // indirect gorgonia.org/vecf32 v0.9.0 // indirect gorgonia.org/vecf64 v0.9.0 // indirect ) diff --git a/go.sum b/go.sum index 6a2c91896..8eb8d84ab 100644 --- a/go.sum +++ b/go.sum @@ -42,6 +42,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g= github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= +github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo= +github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU= github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= diff --git a/llama/README.md b/llama/README.md index 3b6b20672..f6a397258 100644 --- a/llama/README.md +++ b/llama/README.md @@ -1,158 +1,53 @@ # `llama` -This package integrates the [llama.cpp](https://github.com/ggerganov/llama.cpp) library as a Go package and makes it easy to build it with tags for different CPU and GPU processors. - -Supported: - -- [x] CPU -- [x] avx, avx2 -- [x] macOS Metal -- [x] Windows CUDA -- [x] Windows ROCm -- [x] Linux CUDA -- [x] Linux ROCm -- [x] Llava - -Extra build steps are required for CUDA and ROCm on Windows since `nvcc` and `hipcc` both require using msvc as the host compiler. For these shared libraries are created: - -- `ggml_cuda.dll` on Windows or `ggml_cuda.so` on Linux -- `ggml_hipblas.dll` on Windows or `ggml_hipblas.so` on Linux - -> Note: it's important that memory is allocated and freed by the same compiler (e.g. entirely by code compiled with msvc or mingw). Issues from this should be rare, but there are some places where pointers are returned by the CUDA or HIP runtimes and freed elsewhere, causing a a crash. In a future change the same runtime should be used in both cases to avoid crashes. - -## Building - -``` -go build . -``` - -### AVX - -```shell -go build -tags avx . -``` - -### AVX2 - -```shell -# go doesn't recognize `-mfma` as a valid compiler flag -# see https://github.com/golang/go/issues/17895 -go env -w "CGO_CFLAGS_ALLOW=-mfma|-mf16c" -go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c" -go build -tags=avx,avx2 . -``` - -## Linux - -### CUDA - -Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive): - -```shell -make ggml_cuda.so -go build -tags avx,cuda . 
-```
-
-### ROCm
-
-Install [ROCm](https://rocm.docs.amd.com/en/latest/).
-
-```shell
-make ggml_hipblas.so
-go build -tags avx,rocm .
-```
-
-## Windows
-
-Download [w64devkit](https://github.com/skeeto/w64devkit/releases/latest) for a simple MinGW development environment.
-
-### CUDA
-
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build the cuda code:
-
-```shell
-make ggml_cuda.dll
-go build -tags avx,cuda .
-```
-
-### ROCm
-
-Install [ROCm](https://rocm.docs.amd.com/en/latest/).
-
-```shell
-make ggml_hipblas.dll
-go build -tags avx,rocm .
-```
-
-## Building runners
-
-```shell
-# build all runners for this platform
-make -j
-```
+This package provides Go bindings to [llama.cpp](https://github.com/ggerganov/llama.cpp).

## Vendoring

-Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
+Ollama vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/llama.cpp/tree/master/ggml/src). While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit.

If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.

```
-make apply-patches
+make -f Makefile.sync apply-patches
```

### Updating Base Commit

**Pin to new base commit**

-To update to a newer base commit, select the upstream git tag or commit and update `llama/vendoring`
-
-#### Applying patches
+To change the base commit, update `FETCH_HEAD` in Makefile.sync.

When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.

Start by applying the patches. If any of the patches have conflicts, `git am` will stop at the first failure.

```
-make apply-patches
+make -f Makefile.sync apply-patches
```

-If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
+If there are conflicts, you will see an error message. Resolve the conflicts in `./vendor/`, continue the patch series with `git am --continue`, then rerun `make -f Makefile.sync apply-patches`. Repeat until all patches are successfully applied.
+
+Once all patches are applied, commit the changes to the tracking repository.

```
-make create-patches sync
+make -f Makefile.sync format-patches sync
```

-Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
-
### Generating Patches

When working on new fixes or features that impact vendored code, use the following model.
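+For reference, one complete iteration of this workflow looks roughly like the sketch below; the build and test commands are the ones from docs/development.md, and the commit message is illustrative. Each step is described in more detail below.
+
+```
+# start from a clean tracking repo with all current patches applied
+make -f Makefile.sync clean apply-patches
+
+# edit the vendored sources under ./vendor/, then rebuild and test
+cmake -B build && cmake --build build
+go test ./...
+
+# when the change is ready, commit it in ./vendor/ and regenerate the patch series
+git -C vendor add -A && git -C vendor commit -m "describe your change"
+make -f Makefile.sync format-patches
+```
+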
First get a clean tracking repo with all current patches applied: ``` -make apply-patches +make -f Makefile.sync clean apply-patches ``` -Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama: - -``` -make sync -make -j 8 -go build . -``` - -> [!IMPORTANT] -> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s). - Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with ``` -make create-patches +make -f Makefile.sync format-patches ``` -> [!IMPORTANT] -> Once you have completed this step, it is safe to run `apply-patches` since your change is preserved in the patches. - In your `./vendor/` directory, create a branch, and cherry-pick the new commit to that branch, then submit a PR upstream to llama.cpp. Commit the changes in the ollama repo and submit a PR to Ollama, which will include the vendored code update with your change, along with the patches. diff --git a/llama/amx.h b/llama/amx.h deleted file mode 100644 index 5b64b8bd8..000000000 --- a/llama/amx.h +++ /dev/null @@ -1,34 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "ggml-backend.h" -#include "ggml-cpu-impl.h" - -// GGML internal header - -#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) -ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); -#endif diff --git a/llama/ggml-blas.h b/llama/ggml-blas.h deleted file mode 100644 index f5fb9de21..000000000 --- a/llama/ggml-blas.h +++ /dev/null @@ -1,51 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - - -#ifdef __cplusplus -extern "C" { -#endif - -// backend API -GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void); - -GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend); - -// number of threads used for conversion to float -// for openblas and blis, this will also set the number of threads used for blas operations -GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void); - - -#ifdef __cplusplus -} -#endif diff --git a/llama/ggml-cpu-aarch64.h b/llama/ggml-cpu-aarch64.h deleted file mode 100644 index 14320735c..000000000 --- a/llama/ggml-cpu-aarch64.h +++ /dev/null @@ -1,34 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "ggml-cpu-traits.h"
-#include "ggml.h"
-
-// GGML internal header
-
-ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
diff --git a/llama/ggml-cpu-traits.h b/llama/ggml-cpu-traits.h
deleted file mode 100644
index dcd7855fd..000000000
--- a/llama/ggml-cpu-traits.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "ggml-backend-impl.h"
-#include "ggml-cpu-impl.h"
-#include "ggml.h"
-
-#ifdef __cplusplus
-# include <vector>
-extern "C" {
-#endif
-
-// return true if op part of extra "accelerator"
-bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
-bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
-
-#ifdef __cplusplus
-}
-
-namespace ggml::cpu {
-// register in tensor->extra
-class tensor_traits {
-  public:
-    virtual ~tensor_traits();
-    virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
-    virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
-};
-
-class extra_buffer_type {
-  public:
-    virtual ~extra_buffer_type();
-    virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
-    virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
-};
-} // namespace ggml::cpu
-
-// implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
-
-#endif
diff --git a/llama/ggml-cuda/acc.cuh b/llama/ggml-cuda/acc.cuh
deleted file mode 100644
index 5c12d9066..000000000
--- a/llama/ggml-cuda/acc.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_ACC_BLOCK_SIZE 256
-
-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/llama/ggml-cuda/arange.cu b/llama/ggml-cuda/arange.cu
deleted file mode 100644
index 3b67b3b5f..000000000
--- a/llama/ggml-cuda/arange.cu
+++ /dev/null
@@ -1,60 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arange.cuh"
-
-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    dst[nidx] = start + step * nidx;
-}
-
-static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
-    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step);
-}
-
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    float start;
-    float stop;
-    float step;
-    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
-    memcpy(&stop, (float *)dst->op_params + 1, sizeof(float));
-    memcpy(&step, (float *)dst->op_params + 2, sizeof(float));
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
-}
diff --git a/llama/ggml-cuda/arange.cuh b/llama/ggml-cuda/arange.cuh
deleted file mode 100644
index 16201546b..000000000
--- a/llama/ggml-cuda/arange.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ - -#include "common.cuh" - -#define CUDA_ARANGE_BLOCK_SIZE 256 - -void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/argmax.cuh b/llama/ggml-cuda/argmax.cuh deleted file mode 100644 index 805a90d8c..000000000 --- a/llama/ggml-cuda/argmax.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/argsort.cuh b/llama/ggml-cuda/argsort.cuh deleted file mode 100644 index 0d8427bb1..000000000 --- a/llama/ggml-cuda/argsort.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "common.cuh" - -void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/binbcast.cuh b/llama/ggml-cuda/binbcast.cuh deleted file mode 100644 index 3acee0d07..000000000 --- a/llama/ggml-cuda/binbcast.cuh +++ /dev/null @@ -1,35 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); - -void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/clamp.cu b/llama/ggml-cuda/clamp.cu deleted file mode 100644 index 2df1076ca..000000000 --- a/llama/ggml-cuda/clamp.cu +++ /dev/null @@ -1,60 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-
-#include "clamp.cuh"
-
-static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-}
-
-static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
-}
-
-
-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
-}
diff --git a/llama/ggml-cuda/clamp.cuh b/llama/ggml-cuda/clamp.cuh
deleted file mode 100644
index 3f74a8807..000000000
--- a/llama/ggml-cuda/clamp.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ - -#include "common.cuh" - -#define CUDA_CLAMP_BLOCK_SIZE 256 - -void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/concat.cuh b/llama/ggml-cuda/concat.cuh deleted file mode 100644 index ba2b67ec1..000000000 --- a/llama/ggml-cuda/concat.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_CONCAT_BLOCK_SIZE 256 - -void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/conv-transpose-1d.cuh b/llama/ggml-cuda/conv-transpose-1d.cuh deleted file mode 100644 index 53c3beefd..000000000 --- a/llama/ggml-cuda/conv-transpose-1d.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-
-#include "common.cuh"
-
-#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
-
-void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/llama/ggml-cuda/convert.cuh b/llama/ggml-cuda/convert.cuh
deleted file mode 100644
index 27f949e2a..000000000
--- a/llama/ggml-cuda/convert.cuh
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "common.cuh"
-
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-
-template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
-
-typedef to_t_cuda_t<float> to_fp32_cuda_t;
-typedef to_t_cuda_t<half> to_fp16_cuda_t;
-
-to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
-
-to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
diff --git a/llama/ggml-cuda/count-equal.cuh b/llama/ggml-cuda/count-equal.cuh
deleted file mode 100644
index 922c6288c..000000000
--- a/llama/ggml-cuda/count-equal.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ - -#include "common.cuh" - -#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128 - -void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/cpy.cuh b/llama/ggml-cuda/cpy.cuh deleted file mode 100644 index 79496c4cc..000000000 --- a/llama/ggml-cuda/cpy.cuh +++ /dev/null @@ -1,35 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_CPY_BLOCK_SIZE 64 - -void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1); - -void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); - -void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); diff --git a/llama/ggml-cuda/cross-entropy-loss.cuh b/llama/ggml-cuda/cross-entropy-loss.cuh deleted file mode 100644 index e816b8df6..000000000 --- a/llama/ggml-cuda/cross-entropy-loss.cuh +++ /dev/null @@ -1,33 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "common.cuh" - -#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256 - -void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst); - -void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/diagmask.cuh b/llama/ggml-cuda/diagmask.cuh deleted file mode 100644 index 76162837f..000000000 --- a/llama/ggml-cuda/diagmask.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 - -void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/fattn-tile-f16.cuh b/llama/ggml-cuda/fattn-tile-f16.cuh deleted file mode 100644 index 4a3965ed3..000000000 --- a/llama/ggml-cuda/fattn-tile-f16.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "common.cuh" - -void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/fattn-tile-f32.cuh b/llama/ggml-cuda/fattn-tile-f32.cuh deleted file mode 100644 index 8a5eef471..000000000 --- a/llama/ggml-cuda/fattn-tile-f32.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/fattn.cuh b/llama/ggml-cuda/fattn.cuh deleted file mode 100644 index 6947118e1..000000000 --- a/llama/ggml-cuda/fattn.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "common.cuh" - -void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/getrows.cuh b/llama/ggml-cuda/getrows.cuh deleted file mode 100644 index bbbf482d3..000000000 --- a/llama/ggml-cuda/getrows.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_GET_ROWS_BLOCK_SIZE 256 - -void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/im2col.cuh b/llama/ggml-cuda/im2col.cuh deleted file mode 100644 index 2c64c16b1..000000000 --- a/llama/ggml-cuda/im2col.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "common.cuh" - -#define CUDA_IM2COL_BLOCK_SIZE 256 - -void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/mmv.cuh b/llama/ggml-cuda/mmv.cuh deleted file mode 100644 index fcfc8ea46..000000000 --- a/llama/ggml-cuda/mmv.cuh +++ /dev/null @@ -1,38 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available -#define MMV_MAX_ROWS 512 - -void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -void ggml_cuda_op_mul_mat_vec( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream); diff --git a/llama/ggml-cuda/mmvq.cuh b/llama/ggml-cuda/mmvq.cuh deleted file mode 100644 index ae18ae315..000000000 --- a/llama/ggml-cuda/mmvq.cuh +++ /dev/null @@ -1,35 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. - -void ggml_cuda_op_mul_mat_vec_q( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream); diff --git a/llama/ggml-cuda/norm.cuh b/llama/ggml-cuda/norm.cuh deleted file mode 100644 index 0902f23ac..000000000 --- a/llama/ggml-cuda/norm.cuh +++ /dev/null @@ -1,33 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); - -void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); - -void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/opt-step-adamw.cuh b/llama/ggml-cuda/opt-step-adamw.cuh deleted file mode 100644 index b956bf93a..000000000 --- a/llama/ggml-cuda/opt-step-adamw.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256 - -void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/out-prod.cuh b/llama/ggml-cuda/out-prod.cuh deleted file mode 100644 index 4631cd65a..000000000 --- a/llama/ggml-cuda/out-prod.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/pad.cuh b/llama/ggml-cuda/pad.cuh deleted file mode 100644 index 9c23680dc..000000000 --- a/llama/ggml-cuda/pad.cuh +++ /dev/null @@ -1,32 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_PAD_BLOCK_SIZE 256 - -void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); -void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/pool2d.cuh b/llama/ggml-cuda/pool2d.cuh deleted file mode 100644 index 9c0045f87..000000000 --- a/llama/ggml-cuda/pool2d.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_POOL2D_BLOCK_SIZE 256 - -void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/quantize.cuh b/llama/ggml-cuda/quantize.cuh deleted file mode 100644 index ee8e2a52b..000000000 --- a/llama/ggml-cuda/quantize.cuh +++ /dev/null @@ -1,50 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#pragma once - -#include "common.cuh" -#include "mmq.cuh" - -#include <cstdint> - -#define CUDA_QUANTIZE_BLOCK_SIZE 256 -#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128 - -static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access."); -static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); - -typedef void (*quantize_cuda_t)( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); - -void quantize_row_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); - -void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); diff --git a/llama/ggml-cuda/rope.cuh b/llama/ggml-cuda/rope.cuh deleted file mode 100644 index cd5140ce0..000000000 --- a/llama/ggml-cuda/rope.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_ROPE_BLOCK_SIZE 256 - -void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/scale.cu b/llama/ggml-cuda/scale.cu deleted file mode 100644 index b3b38cdf3..000000000 --- a/llama/ggml-cuda/scale.cu +++ /dev/null @@ -1,57 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
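On the two static_asserts in the removed quantize.cuh: rows handed to the quantize kernels are padded out to MATRIX_ROW_PADDING elements, so a grid sized by ceil-division over the block size divides the padded length exactly and never reads past the allocation; the asserts turn that invariant into a compile-time check. A standalone sketch of the arithmetic (local names; the value 512 for the padding constant is an assumption, not taken from this patch):

    #include <cstdint>
    #include <cstdio>

    constexpr int64_t kBlock = 256; // mirrors CUDA_QUANTIZE_BLOCK_SIZE
    constexpr int64_t kPad   = 512; // stands in for MATRIX_ROW_PADDING
    static_assert(kPad % kBlock == 0, "Risk of out-of-bounds access.");

    int main() {
        const int64_t kx0        = 4097;                           // raw row length
        const int64_t kx0_padded = (kx0 + kPad - 1) / kPad * kPad; // rounds up to 4608
        const int64_t num_blocks = kx0_padded / kBlock;            // divides exactly: 18
        std::printf("padded=%lld blocks=%lld\n", (long long) kx0_padded, (long long) num_blocks);
        return 0;
    }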
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "scale.cuh" - -static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - - dst[i] = scale * x[i]; -} - -static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; - scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k); -} - -void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - float scale; - memcpy(&scale, dst->op_params, sizeof(float)); - - scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream); -} diff --git a/llama/ggml-cuda/scale.cuh b/llama/ggml-cuda/scale.cuh deleted file mode 100644 index ae2ec5af5..000000000 --- a/llama/ggml-cuda/scale.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ - -#include "common.cuh" - -#define CUDA_SCALE_BLOCK_SIZE 256 - -void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/softmax.cuh b/llama/ggml-cuda/softmax.cuh deleted file mode 100644 index 85459e24e..000000000 --- a/llama/ggml-cuda/softmax.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 - -void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/sum.cuh b/llama/ggml-cuda/sum.cuh deleted file mode 100644 index 6883be872..000000000 --- a/llama/ggml-cuda/sum.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
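The scale.cu removed just above is the template for most elementwise ops in this directory: compute a global index, guard against running off the end of the buffer, write one element per thread, and launch a ceil-divided 1D grid. The same shape as a self-contained program (the main() harness and managed memory are illustrative, not from the patch):

    #include <cstdio>
    #include <cuda_runtime.h>

    #define BLOCK_SIZE 256

    __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
        const int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i >= k) return;            // tail guard for the last, partial block
        dst[i] = scale * x[i];
    }

    int main() {
        const int k = 1000;            // deliberately not a multiple of BLOCK_SIZE
        float *x, *dst;
        cudaMallocManaged(&x,   k * sizeof(float));
        cudaMallocManaged(&dst, k * sizeof(float));
        for (int i = 0; i < k; ++i) x[i] = 1.0f;

        const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
        scale_f32<<<num_blocks, BLOCK_SIZE>>>(x, dst, 2.0f, k);
        cudaDeviceSynchronize();

        std::printf("%.1f %.1f\n", dst[0], dst[k - 1]); // 2.0 2.0
        cudaFree(x);
        cudaFree(dst);
        return 0;
    }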
- */ - -#include "common.cuh" - -void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream); - -void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/sumrows.cu b/llama/ggml-cuda/sumrows.cu deleted file mode 100644 index fbd3cd874..000000000 --- a/llama/ggml-cuda/sumrows.cu +++ /dev/null @@ -1,65 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "sumrows.cuh" - -static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { - const int row = blockIdx.x; - const int col = threadIdx.x; - - float sum = 0.0f; - for (int i = col; i < ncols; i += blockDim.x) { - sum += x[row * ncols + i]; - } - - sum = warp_reduce_sum(sum); - - if (col == 0) { - dst[row] = sum; - } -} - -void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - const dim3 block_dims(WARP_SIZE, 1, 1); - const dim3 block_nums(nrows, 1, 1); - k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); -} - -void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; - cudaStream_t stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_is_contiguous(src0)); - - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); -} diff --git a/llama/ggml-cuda/sumrows.cuh b/llama/ggml-cuda/sumrows.cuh deleted file mode 100644 index 204384f51..000000000 --- a/llama/ggml-cuda/sumrows.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to
do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream); - -void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu deleted file mode 100644 index 48cdc8f40..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
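The k_sum_rows_f32 kernel removed above relies on warp_reduce_sum from common.cuh: each of the 32 lanes accumulates a strided slice of its row, then a butterfly shuffle folds the 32 partial sums together. A standalone sketch of that scheme (the shuffle-based reduction body follows the usual ggml pattern and is reconstructed here, not quoted from the patch):

    #include <cstdio>
    #include <cuda_runtime.h>

    static __device__ float warp_reduce_sum(float x) {
        // Butterfly reduction: after 5 xor-shuffles every lane holds the total.
        for (int offset = 16; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, 32);
        }
        return x;
    }

    __global__ void k_sum_rows(const float * x, float * dst, const int ncols) {
        const int row = blockIdx.x;
        float sum = 0.0f;
        for (int i = threadIdx.x; i < ncols; i += blockDim.x) {
            sum += x[row * ncols + i]; // each lane sums a strided slice
        }
        sum = warp_reduce_sum(sum);
        if (threadIdx.x == 0) dst[row] = sum;
    }

    int main() {
        const int nrows = 4, ncols = 100;
        float *x, *dst;
        cudaMallocManaged(&x,   nrows * ncols * sizeof(float));
        cudaMallocManaged(&dst, nrows * sizeof(float));
        for (int i = 0; i < nrows * ncols; ++i) x[i] = 1.0f;

        k_sum_rows<<<nrows, 32>>>(x, dst, ncols); // one warp per row
        cudaDeviceSynchronize();
        std::printf("%.1f\n", dst[0]); // 100.0
        return 0;
    }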
- -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu deleted file mode 100644 index 6aeab0bac..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu deleted file mode 100644 index 2d98ef1a9..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
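Everything from here to the end of the template-instances directory is the same five-line file repeated with different type arguments: generate_cu_files.py writes one .cu per (head size, K type, V type) combination, so each explicit instantiation of the flash-attention template lands in its own translation unit and the combinations can build in parallel instead of piling into one object file. A toy C++ model of that layout (the real DECL_FATTN_VEC_F16_CASE macro lives in fattn-vec-f16.cuh; its body is paraphrased here, not quoted):

    #include <cstdio>

    // Stand-in for the templated flash-attention case dispatcher.
    template <int D, int TYPE_K, int TYPE_V>
    void fattn_vec_case() {
        std::printf("compiled case D=%d K=%d V=%d\n", D, TYPE_K, TYPE_V);
    }

    // One such line per generated file: an explicit instantiation pins
    // a single combination into this translation unit at compile time.
    #define DECL_FATTN_VEC_CASE(D, K, V) template void fattn_vec_case<D, K, V>()

    DECL_FATTN_VEC_CASE(128, 1, 1); // plays the role of (hs128, F16, F16)
    DECL_FATTN_VEC_CASE(128, 1, 2); // plays the role of (hs128, F16, Q4_0)

    int main() {
        fattn_vec_case<128, 1, 1>();
        return 0;
    }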
- -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu deleted file mode 100644 index 7fe280e05..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu deleted file mode 100644 index 9835cbfaa..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu deleted file mode 100644 index 45ffa2a8f..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu deleted file mode 100644 index 592287a86..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu deleted file mode 100644 index fe080a734..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu deleted file mode 100644 index 0580444e6..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu deleted file mode 100644 index 5b2650d8a..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu deleted file mode 100644 index 886ba3956..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu deleted file mode 100644 index 789757a80..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu deleted file mode 100644 index a4bfe23ff..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu deleted file mode 100644 index eab22f0d9..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu deleted file mode 100644 index 3301160fb..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu deleted file mode 100644 index aa37c412e..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu deleted file mode 100644 index a2dd8d866..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu deleted file mode 100644 index 709c2de08..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu deleted file mode 100644 index 3279dad9c..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f16.cuh" - -DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu deleted file mode 100644 index 4e112e133..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
deleted file mode 100644
index 8662359bd..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
deleted file mode 100644
index bc3c70614..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
deleted file mode 100644
index 027c6d944..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
deleted file mode 100644
index 543346290..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
deleted file mode 100644
index 9cdcd1b39..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
deleted file mode 100644
index 258e08b29..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
deleted file mode 100644
index 7c41007a3..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
deleted file mode 100644
index 0296737fe..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
deleted file mode 100644
index f9fdc1976..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
deleted file mode 100644
index 518c67255..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
deleted file mode 100644
index dfb36938d..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
deleted file mode 100644
index 4ae015114..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
deleted file mode 100644
index a69a7acb5..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
deleted file mode 100644
index a46aab8a8..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
deleted file mode 100644
index 3fe4f970a..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
deleted file mode 100644
index 933a5dd7a..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
deleted file mode 100644
index b051c7d1a..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
deleted file mode 100644
index 3a90aba7e..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
deleted file mode 100644
index 3ddad858a..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
deleted file mode 100644
index df3ce0a3c..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
deleted file mode 100644
index 49d2666a4..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
deleted file mode 100644
index 531c87c22..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
deleted file mode 100644
index e747f6e7d..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f16.cuh"
-
-DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
deleted file mode 100644
index d6097d1cb..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
deleted file mode 100644
index a6bda11fd..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
deleted file mode 100644
index 800ea14f6..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
deleted file mode 100644
index b3bad6b01..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
deleted file mode 100644
index 6a7127ddf..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
deleted file mode 100644
index 62351c23b..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
deleted file mode 100644
index 1b35f1688..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
deleted file mode 100644
index 5c6256810..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
deleted file mode 100644
index 6f70b7404..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
deleted file mode 100644
index d91c6f920..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
deleted file mode 100644
index d206889db..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
deleted file mode 100644
index ae104a61e..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
deleted file mode 100644
index ab2c66bec..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
deleted file mode 100644
index 4b55d39f3..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
deleted file mode 100644
index 1c1065ff3..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
deleted file mode 100644
index b973d1617..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
+++ /dev/null
@@ -1,31 +0,0 @@
[...]
-#include "../fattn-vec-f32.cuh"
-
-DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
deleted file mode 100644
index 9b3999e89..000000000
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu deleted file mode 100644 index fc7fde303..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu deleted file mode 100644 index b1f482722..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu deleted file mode 100644 index b854659a3..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu deleted file mode 100644 index 35db0d6d6..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu deleted file mode 100644 index cc76b0fb9..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu deleted file mode 100644 index ff9e76dd6..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu deleted file mode 100644 index 4b031d981..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu deleted file mode 100644 index b99bab1e7..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu deleted file mode 100644 index 22e2e6db2..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu deleted file mode 100644 index 95c1984e5..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu deleted file mode 100644 index 65307d393..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu deleted file mode 100644 index ae0ec146b..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu deleted file mode 100644 index 1f420c1d9..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu deleted file mode 100644 index 1d445af3d..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu deleted file mode 100644 index b3a951dc7..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu deleted file mode 100644 index 804c30b20..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu deleted file mode 100644 index 432928a20..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu deleted file mode 100644 index 409f81b06..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu deleted file mode 100644 index 032dab7ff..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu deleted file mode 100644 index 00014a4f3..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu deleted file mode 100644 index 324572636..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu deleted file mode 100644 index e7d49c270..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu deleted file mode 100644 index 8d732548b..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu deleted file mode 100644 index a8e257641..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu deleted file mode 100644 index dabbcd237..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu deleted file mode 100644 index cfbae911d..000000000 --- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-vec-f32.cuh" - -DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu deleted file mode 100644 index b1bdc1e95..000000000 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +++ /dev/null @@ -1,36 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-wmma-f16.cuh" - -DECL_FATTN_WMMA_F16_CASE(64, 16, float); -DECL_FATTN_WMMA_F16_CASE(80, 16, float); -DECL_FATTN_WMMA_F16_CASE(96, 16, float); -DECL_FATTN_WMMA_F16_CASE(112, 16, float); -DECL_FATTN_WMMA_F16_CASE(128, 16, float); -DECL_FATTN_WMMA_F16_CASE(256, 16, float); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu deleted file mode 100644 index 3151d9d67..000000000 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +++ /dev/null @@ -1,35 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../fattn-wmma-f16.cuh" - -DECL_FATTN_WMMA_F16_CASE(64, 32, float); -DECL_FATTN_WMMA_F16_CASE(80, 32, float); -DECL_FATTN_WMMA_F16_CASE(96, 32, float); -DECL_FATTN_WMMA_F16_CASE(112, 32, float); -DECL_FATTN_WMMA_F16_CASE(128, 32, float); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu deleted file mode 100644 index eea23df92..000000000 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +++ /dev/null @@ -1,36 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-wmma-f16.cuh" - -DECL_FATTN_WMMA_F16_CASE(64, 16, half); -DECL_FATTN_WMMA_F16_CASE(80, 16, half); -DECL_FATTN_WMMA_F16_CASE(96, 16, half); -DECL_FATTN_WMMA_F16_CASE(112, 16, half); -DECL_FATTN_WMMA_F16_CASE(128, 16, half); -DECL_FATTN_WMMA_F16_CASE(256, 16, half); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu deleted file mode 100644 index 70ba3a53b..000000000 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +++ /dev/null @@ -1,36 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../fattn-wmma-f16.cuh" - -DECL_FATTN_WMMA_F16_CASE(64, 32, half); -DECL_FATTN_WMMA_F16_CASE(80, 32, half); -DECL_FATTN_WMMA_F16_CASE(96, 32, half); -DECL_FATTN_WMMA_F16_CASE(112, 32, half); -DECL_FATTN_WMMA_F16_CASE(128, 32, half); -DECL_FATTN_WMMA_F16_CASE(256, 32, half); diff --git a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu deleted file mode 100644 index 3a8261ab0..000000000 --- a/llama/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +++ /dev/null @@ -1,34 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../fattn-wmma-f16.cuh" - -DECL_FATTN_WMMA_F16_CASE(64, 8, half); -DECL_FATTN_WMMA_F16_CASE(96, 8, half); -DECL_FATTN_WMMA_F16_CASE(128, 8, half); -DECL_FATTN_WMMA_F16_CASE(256, 8, half); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu deleted file mode 100644 index f39436687..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_IQ1_S); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu deleted file mode 100644 index 086ab539f..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_IQ2_S); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu deleted file mode 100644 index 6af7aa320..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_IQ2_XS); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu deleted file mode 100644 index fc771442a..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu deleted file mode 100644 index 5ba22c06d..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_IQ3_S); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu deleted file mode 100644 index 647be438b..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu deleted file mode 100644 index b8263fa36..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_IQ4_NL); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu b/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu deleted file mode 100644 index 41986b9d6..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_IQ4_XS); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu deleted file mode 100644 index 023aec760..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q2_k.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q2_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu deleted file mode 100644 index f8bba904d..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q3_k.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q3_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu deleted file mode 100644 index 425d7a613..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q4_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q4_0); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu deleted file mode 100644 index 91bafb73f..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q4_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q4_1); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu deleted file mode 100644 index a0ad396c1..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q4_k.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q4_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu deleted file mode 100644 index dc1cbd434..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q5_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q5_0); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu deleted file mode 100644 index cc70a445c..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q5_1.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q5_1); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu deleted file mode 100644 index 3ff67b9f1..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q5_k.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q5_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu deleted file mode 100644 index 1d1ffee9f..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q6_k.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. - -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q6_K); diff --git a/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu deleted file mode 100644 index 1a7e0865d..000000000 --- a/llama/ggml-cuda/template-instances/mmq-instance-q8_0.cu +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
- -#include "../mmq.cuh" - -DECL_MMQ_CASE(GGML_TYPE_Q8_0); diff --git a/llama/ggml-cuda/tsembd.cuh b/llama/ggml-cuda/tsembd.cuh deleted file mode 100644 index 629586504..000000000 --- a/llama/ggml-cuda/tsembd.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "common.cuh" - -#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 - -void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/upscale.cuh b/llama/ggml-cuda/upscale.cuh deleted file mode 100644 index d8bb2ec8d..000000000 --- a/llama/ggml-cuda/upscale.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "common.cuh" - -#define CUDA_UPSCALE_BLOCK_SIZE 256 - -void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/vendors/cuda.h b/llama/ggml-cuda/vendors/cuda.h deleted file mode 100644 index e309dd3f1..000000000 --- a/llama/ggml-cuda/vendors/cuda.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#if CUDART_VERSION < 11020 -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t -#endif // CUDART_VERSION < 11020 diff --git a/llama/ggml-cuda/wkv6.cuh b/llama/ggml-cuda/wkv6.cuh deleted file mode 100644 index 270272878..000000000 --- a/llama/ggml-cuda/wkv6.cuh +++ /dev/null @@ -1,31 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "common.cuh" - -#define CUDA_WKV_BLOCK_SIZE 64 - -void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-threading.cpp b/llama/ggml-threading.cpp deleted file mode 100644 index 7559b3366..000000000 --- a/llama/ggml-threading.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "ggml-threading.h" -#include - -std::mutex ggml_critical_section_mutex; - -void ggml_critical_section_start() { - ggml_critical_section_mutex.lock(); -} - -void ggml_critical_section_end(void) { - ggml_critical_section_mutex.unlock(); -} diff --git a/llama/ggml-threading.h b/llama/ggml-threading.h deleted file mode 100644 index fe2ce3679..000000000 --- a/llama/ggml-threading.h +++ /dev/null @@ -1,40 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#pragma once - -#include "ggml.h" - -#ifdef __cplusplus -extern "C" { -#endif - -GGML_API void ggml_critical_section_start(void); -GGML_API void ggml_critical_section_end(void); - -#ifdef __cplusplus -} -#endif diff --git a/llama/json-schema-to-grammar.h b/llama/json-schema-to-grammar.h deleted file mode 100644 index 39b451cab..000000000 --- a/llama/json-schema-to-grammar.h +++ /dev/null @@ -1,34 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "ggml.h" -// Change JSON_ASSERT from assert() to GGML_ASSERT: -#define JSON_ASSERT GGML_ASSERT -#include "json.hpp" - -std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/llama/llama-cparams.cpp b/llama/llama-cparams.cpp deleted file mode 100644 index 5a5d14cb0..000000000 --- a/llama/llama-cparams.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "llama-cparams.h" diff --git a/llama/llama-cparams.h b/llama/llama-cparams.h deleted file mode 100644 index 74fdb5c54..000000000 --- a/llama/llama-cparams.h +++ /dev/null @@ -1,64 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "llama.h" - -#include - -struct llama_cparams { - uint32_t n_ctx; // context size used during inference - uint32_t n_batch; - uint32_t n_ubatch; - uint32_t n_seq_max; - int n_threads; // number of threads to use for generation - int n_threads_batch; // number of threads to use for batch processing - - float rope_freq_base; - float rope_freq_scale; - - uint32_t n_ctx_orig_yarn; - // These hyperparameters are not exposed in GGUF, because all - // existing YaRN models use the same values for them. - float yarn_ext_factor; - float yarn_attn_factor; - float yarn_beta_fast; - float yarn_beta_slow; - float defrag_thold; - - bool embeddings; - bool causal_attn; - bool offload_kqv; - bool flash_attn; - bool no_perf; - bool cross_attn; - - enum llama_pooling_type pooling_type; - - ggml_backend_sched_eval_callback cb_eval; - void * cb_eval_user_data; -}; diff --git a/llama/llama-cpp.h b/llama/llama-cpp.h deleted file mode 100644 index a0b7beb43..000000000 --- a/llama/llama-cpp.h +++ /dev/null @@ -1,56 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#ifndef __cplusplus -#error "This header is for C++ only" -#endif - -#include <memory> - -#include "llama.h" - -struct llama_model_deleter { - void operator()(llama_model * model) { llama_free_model(model); } -}; - -struct llama_context_deleter { - void operator()(llama_context * context) { llama_free(context); } -}; - -struct llama_sampler_deleter { - void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); } -}; - -struct llama_lora_adapter_deleter { - void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); } -}; - -typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr; -typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr; -typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr; -typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr; diff --git a/llama/llama-quant.h b/llama/llama-quant.h deleted file mode 100644 index e60fc6278..000000000 --- a/llama/llama-quant.h +++ /dev/null @@ -1,27 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ - -#pragma once diff --git a/llama/llama.cpp/.rsync-filter b/llama/llama.cpp/.rsync-filter new file mode 100644 index 000000000..186e1c12e --- /dev/null +++ b/llama/llama.cpp/.rsync-filter @@ -0,0 +1,22 @@ +protect **/*.go +include common/ +include common/base64.* +include common/common.* +include common/json-schema-to-grammar.* +include common/json.* +include common/log.* +include common/sampling.* +include common/stb_image.* +include include/ +include include/llama.* +include include/llama-*.* +include examples/ +include examples/llava/ +include examples/llava/clip.* +include examples/llava/llava.* +include src/ +include src/llama.* +include src/llama-*.* +include src/unicode-data.* +include src/unicode.* +exclude * diff --git a/llama/llama.cpp/LICENSE b/llama/llama.cpp/LICENSE new file mode 100644 index 000000000..acb96ce78 --- /dev/null +++ b/llama/llama.cpp/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023-2024 The ggml authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/llama/base64.hpp b/llama/llama.cpp/common/base64.hpp similarity index 100% rename from llama/base64.hpp rename to llama/llama.cpp/common/base64.hpp diff --git a/llama/common.cpp b/llama/llama.cpp/common/common.cpp similarity index 98% rename from llama/common.cpp rename to llama/llama.cpp/common/common.cpp index 132de88aa..4bb140ee2 100644 --- a/llama/common.cpp +++ b/llama/llama.cpp/common/common.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #if defined(_MSC_VER) #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif diff --git a/llama/llama.cpp/common/common.go b/llama/llama.cpp/common/common.go new file mode 100644 index 000000000..ebbb738f2 --- /dev/null +++ b/llama/llama.cpp/common/common.go @@ -0,0 +1,6 @@ +package common + +// #cgo CXXFLAGS: -std=c++11 +// #cgo CPPFLAGS: -I${SRCDIR}/../include +// #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include +import "C" diff --git a/llama/common.h b/llama/llama.cpp/common/common.h similarity index 95% rename from llama/common.h rename to llama/llama.cpp/common/common.h index db931490c..0d452cf0f 100644 --- a/llama/common.h +++ b/llama/llama.cpp/common/common.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - // Various helper functions and utilities #pragma once diff --git a/llama/json-schema-to-grammar.cpp b/llama/llama.cpp/common/json-schema-to-grammar.cpp similarity index 97% rename from llama/json-schema-to-grammar.cpp rename to llama/llama.cpp/common/json-schema-to-grammar.cpp index cc870f9f3..2a8dbd22d 100644 --- a/llama/json-schema-to-grammar.cpp +++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "json-schema-to-grammar.h" #include #include diff --git a/llama/llama.cpp/common/json-schema-to-grammar.h b/llama/llama.cpp/common/json-schema-to-grammar.h new file mode 100644 index 000000000..41623b346 --- /dev/null +++ b/llama/llama.cpp/common/json-schema-to-grammar.h @@ -0,0 +1,8 @@ +#pragma once + +#include "ggml.h" +// Change JSON_ASSERT from assert() to GGML_ASSERT: +#define JSON_ASSERT GGML_ASSERT +#include "json.hpp" + +std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/llama/json.hpp b/llama/llama.cpp/common/json.hpp similarity index 100% rename from llama/json.hpp rename to llama/llama.cpp/common/json.hpp diff --git a/llama/log.cpp b/llama/llama.cpp/common/log.cpp similarity index 89% rename from llama/log.cpp rename to llama/llama.cpp/common/log.cpp index 959f353ad..04c7c0ed1 100644 --- a/llama/log.cpp +++ b/llama/llama.cpp/common/log.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "log.h" #include diff --git a/llama/log.h b/llama/llama.cpp/common/log.h similarity index 77% rename from llama/log.h rename to llama/llama.cpp/common/log.h index 14deeb15c..66605cc69 100644 --- a/llama/log.h +++ b/llama/llama.cpp/common/log.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "ggml.h" // for ggml_log_level diff --git a/llama/sampling.cpp b/llama/llama.cpp/common/sampling.cpp similarity index 93% rename from llama/sampling.cpp rename to llama/llama.cpp/common/sampling.cpp index b4b72e281..e83a971c7 100644 --- a/llama/sampling.cpp +++ b/llama/llama.cpp/common/sampling.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "sampling.h" #include "common.h" diff --git a/llama/sampling.h b/llama/llama.cpp/common/sampling.h similarity index 78% rename from llama/sampling.h rename to llama/llama.cpp/common/sampling.h index 58f409036..348911b18 100644 --- a/llama/sampling.h +++ b/llama/llama.cpp/common/sampling.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "llama.h" diff --git a/llama/stb_image.h b/llama/llama.cpp/common/stb_image.h similarity index 100% rename from llama/stb_image.h rename to llama/llama.cpp/common/stb_image.h diff --git a/llama/clip.cpp b/llama/llama.cpp/examples/llava/clip.cpp similarity index 98% rename from llama/clip.cpp rename to llama/llama.cpp/examples/llava/clip.cpp index d8cb50938..718052e16 100644 --- a/llama/clip.cpp +++ b/llama/llama.cpp/examples/llava/clip.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - // NOTE: This is modified from clip.cpp only for LLaVA, // so there might be still unnecessary artifacts hanging around // I'll gradually clean and extend it diff --git a/llama/clip.h b/llama/llama.cpp/examples/llava/clip.h similarity index 74% rename from llama/clip.h rename to llama/llama.cpp/examples/llava/clip.h index 42f24bd6c..1603edd26 100644 --- a/llama/clip.h +++ b/llama/llama.cpp/examples/llava/clip.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #ifndef CLIP_H #define CLIP_H diff --git a/llama/llava.cpp b/llama/llama.cpp/examples/llava/llava.cpp similarity index 95% rename from llama/llava.cpp rename to llama/llama.cpp/examples/llava/llava.cpp index 15393e2d9..0f0f3f623 100644 --- a/llama/llava.cpp +++ b/llama/llama.cpp/examples/llava/llava.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "clip.h" #include "llava.h" diff --git a/llama/llama.cpp/examples/llava/llava.go b/llama/llama.cpp/examples/llava/llava.go new file mode 100644 index 000000000..37b031cb7 --- /dev/null +++ b/llama/llama.cpp/examples/llava/llava.go @@ -0,0 +1,6 @@ +package llava + +// #cgo CXXFLAGS: -std=c++11 +// #cgo CPPFLAGS: -I${SRCDIR}/../../include -I${SRCDIR}/../../common +// #cgo CPPFLAGS: -I${SRCDIR}/../../../../ml/backend/ggml/ggml/include +import "C" diff --git a/llama/llava.h b/llama/llama.cpp/examples/llava/llava.h similarity index 59% rename from llama/llava.h rename to llama/llama.cpp/examples/llava/llava.h index 7e8e501f0..b6feb3027 100644 --- a/llama/llava.h +++ b/llama/llama.cpp/examples/llava/llava.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef LLAVA_H -#define LLAVA_H diff --git a/llama/llama.cpp/include/llama-cpp.h b/llama/llama.cpp/include/llama-cpp.h new file mode 100644 index 000000000..1500cb2fc --- /dev/null +++ b/llama/llama.cpp/include/llama-cpp.h @@ -0,0 +1,30 @@ +#pragma once + +#ifndef __cplusplus +#error "This header is for C++ only" +#endif + +#include <memory> + +#include "llama.h" + +struct llama_model_deleter { + void operator()(llama_model * model) { llama_free_model(model); } +}; + +struct llama_context_deleter { + void operator()(llama_context * context) { llama_free(context); } +}; + +struct llama_sampler_deleter { + void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); } +}; + +struct llama_lora_adapter_deleter { + void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); } +}; + +typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr; +typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr; +typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr; +typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr; diff --git a/llama/llama.h b/llama/llama.cpp/include/llama.h similarity index 98% rename from llama/llama.h rename to llama/llama.cpp/include/llama.h index 164d3b6fe..9f411960e 100644 --- a/llama/llama.h +++ b/llama/llama.cpp/include/llama.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ - #ifndef LLAMA_H #define LLAMA_H diff --git a/llama/llama-adapter.cpp b/llama/llama.cpp/src/llama-adapter.cpp similarity index 90% rename from llama/llama-adapter.cpp rename to llama/llama.cpp/src/llama-adapter.cpp index 02a48f3fc..9fd7edea3 100644 --- a/llama/llama-adapter.cpp +++ b/llama/llama.cpp/src/llama-adapter.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-adapter.h" #include "llama-model.h" diff --git a/llama/llama-adapter.h b/llama/llama.cpp/src/llama-adapter.h similarity index 55% rename from llama/llama-adapter.h rename to llama/llama.cpp/src/llama-adapter.h index 1bf860d7f..5f1870cc8 100644 --- a/llama/llama-adapter.h +++ b/llama/llama.cpp/src/llama-adapter.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #pragma once #include "llama-impl.h" diff --git a/llama/llama-arch.cpp b/llama/llama.cpp/src/llama-arch.cpp similarity index 98% rename from llama/llama-arch.cpp rename to llama/llama.cpp/src/llama-arch.cpp index a6cc790e4..b35aeb315 100644 --- a/llama/llama-arch.cpp +++ b/llama/llama.cpp/src/llama-arch.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-arch.h" #include "llama-impl.h" diff --git a/llama/llama-arch.h b/llama/llama.cpp/src/llama-arch.h similarity index 89% rename from llama/llama-arch.h rename to llama/llama.cpp/src/llama-arch.h index fa8422a89..e8235ae00 100644 --- a/llama/llama-arch.h +++ b/llama/llama.cpp/src/llama-arch.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #pragma once #include "ggml.h" // ggml_op diff --git a/llama/llama-batch.cpp b/llama/llama.cpp/src/llama-batch.cpp similarity index 91% rename from llama/llama-batch.cpp rename to llama/llama.cpp/src/llama-batch.cpp index 0e0488c3c..8682b0e68 100644 --- a/llama/llama-batch.cpp +++ b/llama/llama.cpp/src/llama-batch.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-batch.h" #include diff --git a/llama/llama-batch.h b/llama/llama.cpp/src/llama-batch.h similarity index 67% rename from llama/llama-batch.h rename to llama/llama.cpp/src/llama-batch.h index eb439c3d0..773c3808b 100644 --- a/llama/llama-batch.h +++ b/llama/llama.cpp/src/llama-batch.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #pragma once #include "llama.h" diff --git a/llama/llama-chat.cpp b/llama/llama.cpp/src/llama-chat.cpp similarity index 95% rename from llama/llama-chat.cpp rename to llama/llama.cpp/src/llama-chat.cpp index 099b33422..44670d3d8 100644 --- a/llama/llama-chat.cpp +++ b/llama/llama.cpp/src/llama-chat.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-chat.h" #include "llama.h" diff --git a/llama/llama-chat.h b/llama/llama.cpp/src/llama-chat.h similarity index 54% rename from llama/llama-chat.h rename to llama/llama.cpp/src/llama-chat.h index deabed71d..b8e94d9ef 100644 --- a/llama/llama-chat.h +++ b/llama/llama.cpp/src/llama-chat.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #pragma once #include diff --git a/llama/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp similarity index 98% rename from llama/llama-context.cpp rename to llama/llama.cpp/src/llama-context.cpp index 91bfd13f7..9d0e7ca36 100644 --- a/llama/llama-context.cpp +++ b/llama/llama.cpp/src/llama-context.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-context.h" #include diff --git a/llama/llama-context.h b/llama/llama.cpp/src/llama-context.h similarity index 80% rename from llama/llama-context.h rename to llama/llama.cpp/src/llama-context.h index 643033946..4980a60e8 100644 --- a/llama/llama-context.h +++ b/llama/llama.cpp/src/llama-context.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#pragma once #include "llama.h" diff --git a/llama/llama.cpp/src/llama-cparams.cpp b/llama/llama.cpp/src/llama-cparams.cpp new file mode 100644 index 000000000..28369be36 --- /dev/null +++ b/llama/llama.cpp/src/llama-cparams.cpp @@ -0,0 +1 @@ +#include "llama-cparams.h" diff --git a/llama/llama.cpp/src/llama-cparams.h b/llama/llama.cpp/src/llama-cparams.h new file mode 100644 index 000000000..9681e5a08 --- /dev/null +++ b/llama/llama.cpp/src/llama-cparams.h @@ -0,0 +1,38 @@ +#pragma once + +#include "llama.h" + +#include <cstdint> + +struct llama_cparams { + uint32_t n_ctx; // context size used during inference + uint32_t n_batch; + uint32_t n_ubatch; + uint32_t n_seq_max; + int n_threads; // number of threads to use for generation + int n_threads_batch; // number of threads to use for batch processing + + float rope_freq_base; + float rope_freq_scale; + + uint32_t n_ctx_orig_yarn; + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them. + float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; + float defrag_thold; + + bool embeddings; + bool causal_attn; + bool offload_kqv; + bool flash_attn; + bool no_perf; + bool cross_attn; + + enum llama_pooling_type pooling_type; + + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; +}; diff --git a/llama/llama-grammar.cpp b/llama/llama.cpp/src/llama-grammar.cpp similarity index 97% rename from llama/llama-grammar.cpp rename to llama/llama.cpp/src/llama-grammar.cpp index 243cb452c..186dc9a25 100644 --- a/llama/llama-grammar.cpp +++ b/llama/llama.cpp/src/llama-grammar.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ - #include "llama-grammar.h" #include "llama-impl.h" diff --git a/llama/llama-grammar.h b/llama/llama.cpp/src/llama-grammar.h similarity index 78% rename from llama/llama-grammar.h rename to llama/llama.cpp/src/llama-grammar.h index 41811c742..f8b40c651 100644 --- a/llama/llama-grammar.h +++ b/llama/llama.cpp/src/llama-grammar.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "llama.h" diff --git a/llama/llama-hparams.cpp b/llama/llama.cpp/src/llama-hparams.cpp similarity index 61% rename from llama/llama-hparams.cpp rename to llama/llama.cpp/src/llama-hparams.cpp index d47225e76..42f8a58ff 100644 --- a/llama/llama-hparams.cpp +++ b/llama/llama.cpp/src/llama-hparams.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "llama-hparams.h" #include "ggml.h" diff --git a/llama/llama-hparams.h b/llama/llama.cpp/src/llama-hparams.h similarity index 78% rename from llama/llama-hparams.h rename to llama/llama.cpp/src/llama-hparams.h index b2d4bd614..f826cd9ac 100644 --- a/llama/llama-hparams.h +++ b/llama/llama.cpp/src/llama-hparams.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "llama.h" diff --git a/llama/llama-impl.cpp b/llama/llama.cpp/src/llama-impl.cpp similarity index 82% rename from llama/llama-impl.cpp rename to llama/llama.cpp/src/llama-impl.cpp index de726cb21..a05ba4f63 100644 --- a/llama/llama-impl.cpp +++ b/llama/llama.cpp/src/llama-impl.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "llama-impl.h" #include "llama.h" diff --git a/llama/llama-impl.h b/llama/llama.cpp/src/llama-impl.h similarity index 58% rename from llama/llama-impl.h rename to llama/llama.cpp/src/llama-impl.h index c9ae33f4a..12d1fb082 100644 --- a/llama/llama-impl.h +++ b/llama/llama.cpp/src/llama-impl.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "ggml.h" // for ggml_log_level diff --git a/llama/llama-kv-cache.cpp b/llama/llama.cpp/src/llama-kv-cache.cpp similarity index 95% rename from llama/llama-kv-cache.cpp rename to llama/llama.cpp/src/llama-kv-cache.cpp index aa555e652..cf814dbe5 100644 --- a/llama/llama-kv-cache.cpp +++ b/llama/llama.cpp/src/llama-kv-cache.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "llama-kv-cache.h" #include "llama-impl.h" diff --git a/llama/llama-kv-cache.h b/llama/llama.cpp/src/llama-kv-cache.h similarity index 84% rename from llama/llama-kv-cache.h rename to llama/llama.cpp/src/llama-kv-cache.h index a4d65611a..dca6f3998 100644 --- a/llama/llama-kv-cache.h +++ b/llama/llama.cpp/src/llama-kv-cache.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "llama.h" diff --git a/llama/llama-mmap.cpp b/llama/llama.cpp/src/llama-mmap.cpp similarity index 93% rename from llama/llama-mmap.cpp rename to llama/llama.cpp/src/llama-mmap.cpp index 3868e9dd8..a99326335 100644 --- a/llama/llama-mmap.cpp +++ b/llama/llama.cpp/src/llama-mmap.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "llama-mmap.h" #include "llama-impl.h" diff --git a/llama/llama-mmap.h b/llama/llama.cpp/src/llama-mmap.h similarity index 52% rename from llama/llama-mmap.h rename to llama/llama.cpp/src/llama-mmap.h index ebd7dc16e..6bcddee8c 100644 --- a/llama/llama-mmap.h +++ b/llama/llama.cpp/src/llama-mmap.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include diff --git a/llama/llama-model-loader.cpp b/llama/llama.cpp/src/llama-model-loader.cpp similarity index 97% rename from llama/llama-model-loader.cpp rename to llama/llama.cpp/src/llama-model-loader.cpp index ebb369e49..b12d65666 100644 --- a/llama/llama-model-loader.cpp +++ b/llama/llama.cpp/src/llama-model-loader.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "llama-model-loader.h" #include "ggml.h" diff --git a/llama/llama-model-loader.h b/llama/llama.cpp/src/llama-model-loader.h similarity index 81% rename from llama/llama-model-loader.h rename to llama/llama.cpp/src/llama-model-loader.h index 873d4c0c5..1ec478195 100644 --- a/llama/llama-model-loader.h +++ b/llama/llama.cpp/src/llama-model-loader.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "llama.h" diff --git a/llama/llama-model.cpp b/llama/llama.cpp/src/llama-model.cpp similarity index 98% rename from llama/llama-model.cpp rename to llama/llama.cpp/src/llama-model.cpp index 2482f98a3..4f9bbf90e 100644 --- a/llama/llama-model.cpp +++ b/llama/llama.cpp/src/llama-model.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "llama-model.h" #include "llama-impl.h" diff --git a/llama/llama-model.h b/llama/llama.cpp/src/llama-model.h similarity index 91% rename from llama/llama-model.h rename to llama/llama.cpp/src/llama-model.h index 756b09f42..5b23e2baf 100644 --- a/llama/llama-model.h +++ b/llama/llama.cpp/src/llama-model.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "llama.h" diff --git a/llama/llama-quant.cpp b/llama/llama.cpp/src/llama-quant.cpp similarity index 97% rename from llama/llama-quant.cpp rename to llama/llama.cpp/src/llama-quant.cpp index 6b4d288b0..27def6fd9 100644 --- a/llama/llama-quant.cpp +++ b/llama/llama.cpp/src/llama-quant.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "llama-quant.h" #include "llama-impl.h" diff --git a/llama/llama.cpp/src/llama-quant.h b/llama/llama.cpp/src/llama-quant.h new file mode 100644 index 000000000..6f70f09be --- /dev/null +++ b/llama/llama.cpp/src/llama-quant.h @@ -0,0 +1 @@ +#pragma once diff --git a/llama/llama-sampling.cpp b/llama/llama.cpp/src/llama-sampling.cpp similarity index 98% rename from llama/llama-sampling.cpp rename to llama/llama.cpp/src/llama-sampling.cpp index 1071efdca..69cea2f14 100644 --- a/llama/llama-sampling.cpp +++ b/llama/llama.cpp/src/llama-sampling.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-sampling.h" #include "llama-impl.h" diff --git a/llama/llama-sampling.h b/llama/llama.cpp/src/llama-sampling.h similarity index 54% rename from llama/llama-sampling.h rename to llama/llama.cpp/src/llama-sampling.h index 10a7878f3..919f6fdfc 100644 --- a/llama/llama-sampling.h +++ b/llama/llama.cpp/src/llama-sampling.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once // TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ? 
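The renames above and below all follow one pattern: each vendored llama.cpp source returns to its upstream path under llama/llama.cpp/, and the injected "do not edit" MIT preamble is dropped, so the vendored tree carries upstream's license once instead of stamping it into every file. With the make-driven runner build removed, each vendored directory is compiled by a small cgo bridge stub instead (see the new llama/llama.cpp/src/llama.go added below). A minimal sketch of such a stub, with an illustrative package name and include path, looks like:

    package vendoredexample

    // cgo compiles any C/C++ sources that sit next to this file and links
    // them into whichever Go binary imports this package; the Go side of
    // the package is otherwise empty.

    // #cgo CXXFLAGS: -std=c++17
    // #cgo CPPFLAGS: -I${SRCDIR}/../include
    import "C"

A blank import (import _ "path/to/package") is then enough to pull the compiled objects into the final link, which is how the rewritten llama.go later in this patch picks up the vendored llama.cpp, common, and llava sources without make.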
diff --git a/llama/llama-vocab.cpp b/llama/llama.cpp/src/llama-vocab.cpp similarity index 98% rename from llama/llama-vocab.cpp rename to llama/llama.cpp/src/llama-vocab.cpp index 7f9f699ac..8f44705ab 100644 --- a/llama/llama-vocab.cpp +++ b/llama/llama.cpp/src/llama-vocab.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-vocab.h" #include "llama-impl.h" diff --git a/llama/llama-vocab.h b/llama/llama.cpp/src/llama-vocab.h similarity index 84% rename from llama/llama-vocab.h rename to llama/llama.cpp/src/llama-vocab.h index 81b14fff4..0d00086da 100644 --- a/llama/llama-vocab.h +++ b/llama/llama.cpp/src/llama-vocab.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #pragma once #include "llama.h" diff --git a/llama/llama.cpp b/llama/llama.cpp/src/llama.cpp similarity index 99% rename from llama/llama.cpp rename to llama/llama.cpp/src/llama.cpp index 9b123fce8..c95da45d3 100644 --- a/llama/llama.cpp +++ b/llama/llama.cpp/src/llama.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-impl.h" #include "llama-chat.h" diff --git a/llama/llama.cpp/src/llama.go b/llama/llama.cpp/src/llama.go new file mode 100644 index 000000000..ddbd53785 --- /dev/null +++ b/llama/llama.cpp/src/llama.go @@ -0,0 +1,8 @@ +package llama + +// #cgo CXXFLAGS: -std=c++17 +// #cgo CPPFLAGS: -I${SRCDIR}/../include +// #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include +// #cgo windows CPPFLAGS: -D_WIN32_WINNT=0x0602 +import "C" +import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src" diff --git a/llama/unicode-data.cpp b/llama/llama.cpp/src/unicode-data.cpp similarity index 99% rename from llama/unicode-data.cpp rename to llama/llama.cpp/src/unicode-data.cpp index 393cd273b..04dcd7fcf 100644 --- a/llama/unicode-data.cpp +++ b/llama/llama.cpp/src/unicode-data.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 // generated with scripts/gen-unicode-data.py
 
 #include "unicode-data.h"
diff --git a/llama/llama.cpp/src/unicode-data.h b/llama/llama.cpp/src/unicode-data.h
new file mode 100644
index 000000000..f6973ebd2
--- /dev/null
+++ b/llama/llama.cpp/src/unicode-data.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+
+struct range_nfd {
+    uint32_t first;
+    uint32_t last;
+    uint32_t nfd;
+};
+
+static const uint32_t MAX_CODEPOINTS = 0x110000;
+
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::unordered_set<uint32_t> unicode_set_whitespace;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
diff --git a/llama/unicode.cpp b/llama/llama.cpp/src/unicode.cpp
similarity index 96%
rename from llama/unicode.cpp
rename to llama/llama.cpp/src/unicode.cpp
index 5dcb2e985..6155da800 100644
--- a/llama/unicode.cpp
+++ b/llama/llama.cpp/src/unicode.cpp
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ - #if defined(_MSC_VER) #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif diff --git a/llama/unicode.h b/llama/llama.cpp/src/unicode.h similarity index 63% rename from llama/unicode.h rename to llama/llama.cpp/src/unicode.h index b6a99568b..c27098df7 100644 --- a/llama/unicode.h +++ b/llama/llama.cpp/src/unicode.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include diff --git a/llama/llama.go b/llama/llama.go index 18790a95d..1d4513e31 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -1,70 +1,20 @@ package llama -//go:generate make -j 8 - /* -#cgo CFLAGS: -O3 -std=c17 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64 -#cgo CXXFLAGS: -O3 -std=c++17 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64 -#cgo amd64,avx CFLAGS: -mavx -#cgo amd64,avx CXXFLAGS: -mavx -#cgo amd64,avx2 CFLAGS: -mavx2 -mfma -mf16c -#cgo amd64,avx2 CXXFLAGS: -mavx2 -mfma -mf16c -#cgo amd64,avx512 CFLAGS: -mavx512f -mavx512dq -mavx512bw -#cgo amd64,avx512 CXXFLAGS: -mavx512f -mavx512dq -mavx512bw -#cgo amd64,avx512bf16 CFLAGS: -mavx512bf16 -D__AVX512BF16__ -#cgo amd64,avx512bf16 CXXFLAGS: -mavx512bf16 -D__AVX512BF16__ -#cgo amd64,avx512vbmi CFLAGS: -mavx512vbmi -D__AVX512VBMI__ -#cgo amd64,avx512vbmi CXXFLAGS: -mavx512vbmi -D__AVX512VBMI__ -#cgo amd64,avx512vnni CFLAGS: -mavx512vnni -D__AVX512VNNI__ -#cgo amd64,avx512vnni CXXFLAGS: -mavx512vnni -D__AVX512VNNI__ -#cgo amd64,f16c CFLAGS: -mf16c -#cgo amd64,f16c CXXFLAGS: -mf16c -#cgo amd64,fma CFLAGS: -mfma -#cgo amd64,fma CXXFLAGS: -mfma -#cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 -#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 -#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -#cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -#cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -#cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers -#cgo darwin,amd64 CXXFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers -#cgo darwin,amd64 LDFLAGS: -framework Foundation -#cgo darwin,amd64,avx2 CFLAGS: 
-DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -#cgo darwin,amd64,avx2 CXXFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -#cgo darwin,amd64,avx2 LDFLAGS: -framework Accelerate -#cgo darwin,arm64 CFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE -#cgo darwin,arm64 CXXFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE -#cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate -#cgo linux CFLAGS: -D_GNU_SOURCE -#cgo linux CXXFLAGS: -D_GNU_SOURCE -#cgo linux LDFLAGS: -ldl -#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux-amd64 -#cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -#cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux-arm64 -#cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve -#cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve -#cgo linux,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt -lpthread -lrt -lresolv -#cgo linux,rocm LDFLAGS: -lpthread -lrt -lresolv -#cgo rocm CFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIP -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 -#cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIP -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 -#cgo rocm LDFLAGS: -L${SRCDIR} -lggml_rocm -lhipblas -lamdhip64 -lrocblas -#cgo windows CFLAGS: -Wno-discarded-qualifiers -D_WIN32_WINNT=0x602 -#cgo windows CXXFLAGS: -D_WIN32_WINNT=0x602 -#cgo windows LDFLAGS: -lmsvcrt -static-libstdc++ -static-libgcc -static -#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows-amd64 -#cgo windows,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -#cgo windows,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/windows-arm64 -#cgo windows,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt -#cgo windows,rocm LDFLAGS: -lggml_rocm -lhipblas -lamdhip64 -lrocblas +#cgo CFLAGS: -std=c11 +#cgo CXXFLAGS: -std=c++17 +#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include +#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common +#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/examples/llava +#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/src +#cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include #include +#include "ggml.h" #include "llama.h" #include "clip.h" -#include "ggml.h" #include "llava.h" + #include "mllama.h" #include "sampling_ext.h" @@ -96,9 +46,15 @@ import ( "strings" "sync/atomic" "unsafe" + + _ "github.com/ollama/ollama/llama/llama.cpp/common" + _ "github.com/ollama/ollama/llama/llama.cpp/examples/llava" + _ "github.com/ollama/ollama/llama/llama.cpp/src" + "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ) func BackendInit() { + ggml.OnceLoad() C.llama_backend_init() } diff --git a/llama/mmq.h b/llama/mmq.h deleted file mode 100644 index c78d3a1c1..000000000 --- a/llama/mmq.h +++ /dev/null @@ -1,36 +0,0 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the 
Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once -#include "common.h" - -size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst); - -size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); - -void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - -void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/llama/patches/0001-cuda.patch b/llama/patches/0001-cuda.patch index 574654b5f..0bf338f2b 100644 --- a/llama/patches/0001-cuda.patch +++ b/llama/patches/0001-cuda.patch @@ -4,39 +4,44 @@ Date: Thu, 6 Jun 2024 23:55:47 -0700 Subject: [PATCH] cuda --- - ggml/src/ggml-backend.cpp | 5 +++++ - ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++++ - 2 files changed, 9 insertions(+) + ggml/src/ggml-backend.cpp | 1 - + ggml/src/ggml-cuda/ggml-cuda.cu | 1 + + ggml/src/ggml-metal/ggml-metal.m | 1 + + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index e2d6c405..1b62c056 100644 +index e2d6c405..a12172dc 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp -@@ -106,7 +106,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { +@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } -+ -+// TODO: this needs to be freed in cuda and hip backends because -+// the cuda backend implementation compiled with msvc -+#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIP) - delete buffer; -+#endif +- delete buffer; } size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 0b06be72..0a6ae325 100644 +index 0b06be72..be29e979 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context { +@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; delete ctx; -+ -+ // TODO: this needs to be freed in cuda and hipblas backends because -+ // the cuda backend implementation compiled with msvc -+ free(buffer); ++ delete buffer; } static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { +diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m +index a85502ee..cd8ef741 100644 +--- a/ggml/src/ggml-metal/ggml-metal.m ++++ 
b/ggml/src/ggml-metal/ggml-metal.m +@@ -4187,6 +4187,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) + } + + free(ctx); ++ free(buffer); + } + + static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/llama/patches/0006-conditional-fattn.patch b/llama/patches/0006-conditional-fattn.patch index 62c248074..739905780 100644 --- a/llama/patches/0006-conditional-fattn.patch +++ b/llama/patches/0006-conditional-fattn.patch @@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 0a6ae325..bb425ee8 100644 +index be29e979..aaa79ea4 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_ARGSORT: ggml_cuda_op_argsort(ctx, dst); break; diff --git a/llama/patches/0008-add-mllama-support.patch b/llama/patches/0007-add-mllama-support.patch similarity index 100% rename from llama/patches/0008-add-mllama-support.patch rename to llama/patches/0007-add-mllama-support.patch diff --git a/llama/patches/0007-blas.patch b/llama/patches/0007-blas.patch deleted file mode 100644 index 121a1cd95..000000000 --- a/llama/patches/0007-blas.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Jesse Gross -Date: Mon, 30 Sep 2024 16:31:04 -0700 -Subject: [PATCH] blas - ---- - ggml/src/ggml-blas/ggml-blas.cpp | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp -index ec158dfa..b3ac1fa4 100644 ---- a/ggml/src/ggml-blas/ggml-blas.cpp -+++ b/ggml/src/ggml-blas/ggml-blas.cpp -@@ -1,3 +1,5 @@ -+#ifdef GGML_USE_BLAS -+ - #include "ggml-impl.h" - #include "ggml-blas.h" - #include "ggml-backend-impl.h" -@@ -515,3 +517,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) { - } - - GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) -+ -+#endif // GGML_USE_BLAS -\ No newline at end of file diff --git a/llama/patches/0009-add-unpad-operator.patch b/llama/patches/0008-add-unpad-operator.patch similarity index 97% rename from llama/patches/0009-add-unpad-operator.patch rename to llama/patches/0008-add-unpad-operator.patch index ba857ef0c..fd070df9c 100644 --- a/llama/patches/0009-add-unpad-operator.patch +++ b/llama/patches/0008-add-unpad-operator.patch @@ -126,10 +126,10 @@ index b7fefb9d..b307d554 100644 case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index bb425ee8..1e7c2a22 100644 +index aaa79ea4..9286f866 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2082,6 +2082,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; @@ -139,7 +139,7 @@ index bb425ee8..1e7c2a22 100644 case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; -@@ -3013,6 +3016,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g +@@ -3010,6 +3013,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_GROUP_NORM: case 
GGML_OP_UPSCALE: case GGML_OP_PAD: @@ -211,10 +211,10 @@ index 8fd386b0..e2ededc3 100644 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index a85502ee..84e027eb 100644 +index cd8ef741..318addec 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -311,6 +311,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte +@@ -311,6 +311,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, @@ -222,7 +222,7 @@ index a85502ee..84e027eb 100644 GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, -@@ -910,6 +911,7 @@ @implementation GGMLMetalClass +@@ -910,6 +911,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); diff --git a/llama/patches/0010-fix-deepseek-deseret-regex.patch b/llama/patches/0009-fix-deepseek-deseret-regex.patch similarity index 100% rename from llama/patches/0010-fix-deepseek-deseret-regex.patch rename to llama/patches/0009-fix-deepseek-deseret-regex.patch diff --git a/llama/patches/0012-Maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch similarity index 100% rename from llama/patches/0012-Maintain-ordering-for-rules-for-grammar.patch rename to llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch diff --git a/llama/patches/0013-fix-missing-arg-in-static-assert-on-windows.patch b/llama/patches/0011-fix-missing-arg-in-static-assert-on-windows.patch similarity index 100% rename from llama/patches/0013-fix-missing-arg-in-static-assert-on-windows.patch rename to llama/patches/0011-fix-missing-arg-in-static-assert-on-windows.patch diff --git a/llama/patches/0011-relative-include-paths.patch b/llama/patches/0011-relative-include-paths.patch deleted file mode 100644 index c1e56b9cf..000000000 --- a/llama/patches/0011-relative-include-paths.patch +++ /dev/null @@ -1,51 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Tue, 3 Dec 2024 21:30:51 -0800 -Subject: [PATCH] relative include paths - ---- - ggml/src/ggml-cpu/ggml-cpu.c | 2 +- - ggml/src/ggml-cpu/ggml-cpu.cpp | 3 +-- - ggml/src/ggml-quants.c | 2 +- - 3 files changed, 3 insertions(+), 4 deletions(-) - -diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index b307d554..4eb39c52 100644 ---- a/ggml/src/ggml-cpu/ggml-cpu.c -+++ b/ggml/src/ggml-cpu/ggml-cpu.c -@@ -10,7 +10,7 @@ - #include "ggml-quants.h" - #include "ggml-cpu-quants.h" - #include "ggml-threading.h" --#include "amx/amx.h" -+#include "amx.h" - #include "ggml.h" - - #if defined(_MSC_VER) || defined(__MINGW32__) -diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp -index f11399cc..2a8b40ce 100644 ---- a/ggml/src/ggml-cpu/ggml-cpu.cpp -+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp -@@ -4,8 +4,7 @@ - #include "ggml-cpu-aarch64.h" - #include "ggml-cpu-traits.h" - #include "ggml-impl.h" --#include "amx/amx.h" -- -+#include "amx.h" - 
#include <cctype>
- #include <string>
- #include <vector>
-diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
-index 7918388a..e2ed84e4 100644
---- a/ggml/src/ggml-quants.c
-+++ b/ggml/src/ggml-quants.c
-@@ -3,7 +3,7 @@
- 
- #include "ggml-quants.h"
- #include "ggml-impl.h"
--#include "ggml-cpu/ggml-cpu-impl.h"
-+#include "ggml-cpu-impl.h"
- #include "ggml-cpu.h"
- 
- #include <math.h>
diff --git a/llama/patches/0014-llama-Ensure-KV-cache-is-fully-defragmented.patch b/llama/patches/0012-llama-Ensure-KV-cache-is-fully-defragmented.patch
similarity index 100%
rename from llama/patches/0014-llama-Ensure-KV-cache-is-fully-defragmented.patch
rename to llama/patches/0012-llama-Ensure-KV-cache-is-fully-defragmented.patch
diff --git a/llama/patches/0015-re-enable-gpu-for-clip.patch b/llama/patches/0013-re-enable-gpu-for-clip.patch
similarity index 100%
rename from llama/patches/0015-re-enable-gpu-for-clip.patch
rename to llama/patches/0013-re-enable-gpu-for-clip.patch
diff --git a/llama/patches/0014-sort-devices-by-score.patch b/llama/patches/0014-sort-devices-by-score.patch
new file mode 100644
index 000000000..67c2127a4
--- /dev/null
+++ b/llama/patches/0014-sort-devices-by-score.patch
@@ -0,0 +1,82 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang
+Date: Tue, 14 Jan 2025 12:01:24 -0800
+Subject: [PATCH] sort devices by score
+
+---
+ ggml/src/ggml-backend-reg.cpp | 21 +++++++++++++--------
+ 1 file changed, 13 insertions(+), 8 deletions(-)
+
+diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
+index 899d16f2..ac5cda07 100644
+--- a/ggml/src/ggml-backend-reg.cpp
++++ b/ggml/src/ggml-backend-reg.cpp
+@@ -150,7 +150,7 @@ struct ggml_backend_reg_entry {
+ 
+ struct ggml_backend_registry {
+     std::vector<ggml_backend_reg_entry> backends;
+-    std::vector<ggml_backend_dev_t> devices;
++    std::vector<std::pair<ggml_backend_dev_t, int>> devices;
+ 
+     ggml_backend_registry() {
+ #ifdef GGML_USE_CUDA
+@@ -195,7 +195,7 @@ struct ggml_backend_registry {
+         }
+     }
+ 
+-    void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
++    void register_backend(ggml_backend_reg_t reg, int score = -1, dl_handle_ptr handle = nullptr) {
+         if (!reg) {
+             return;
+         }
+@@ -206,15 +206,15 @@ struct ggml_backend_registry {
+ #endif
+         backends.push_back({ reg, std::move(handle) });
+         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
+-            register_device(ggml_backend_reg_dev_get(reg, i));
++            register_device(ggml_backend_reg_dev_get(reg, i), score);
+         }
+     }
+ 
+-    void register_device(ggml_backend_dev_t device) {
++    void register_device(ggml_backend_dev_t device, int score = -1) {
+ #ifndef NDEBUG
+         GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+ #endif
+-        devices.push_back(device);
++        devices.push_back({device, score});
+     }
+ 
+     ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
+@@ -257,7 +257,7 @@ struct ggml_backend_registry {
+ 
+         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
+ 
+-        register_backend(reg, std::move(handle));
++        register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
+ 
+         return reg;
+     }
+@@ -280,7 +280,7 @@ struct ggml_backend_registry {
+         // remove devices
+         devices.erase(
+             std::remove_if(devices.begin(), devices.end(),
+-                [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
++                [reg](std::pair<ggml_backend_dev_t, int> dev) { return ggml_backend_dev_backend_reg(dev.first) == reg; }),
+             devices.end());
+ 
+         // remove backend
+@@ -338,7 +338,12 @@ size_t ggml_backend_dev_count() {
+ 
+ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
+     GGML_ASSERT(index < ggml_backend_dev_count());
+-    return get_reg().devices[index];
++    auto devices = get_reg().devices;
++    if (!std::is_heap(devices.begin(), devices.end())) {
++        std::make_heap(devices.begin(), devices.end(), [](const auto & a, const auto & b) { return a.second < b.second; });
++    }
++
++    return devices[index].first;
+ }
+ 
+ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
diff --git a/llama/patches/0015-add-phony-target-ggml-cpu-for-all-cpu-variants.patch b/llama/patches/0015-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
new file mode 100644
index 000000000..e68950a57
--- /dev/null
+++ b/llama/patches/0015-add-phony-target-ggml-cpu-for-all-cpu-variants.patch
@@ -0,0 +1,29 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang
+Date: Tue, 14 Jan 2025 15:59:04 -0800
+Subject: [PATCH] add phony target ggml-cpu for all cpu variants
+
+---
+ ggml/src/CMakeLists.txt | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
+index 84101c32..72b488dd 100644
+--- a/ggml/src/CMakeLists.txt
++++ b/ggml/src/CMakeLists.txt
+@@ -278,6 +278,7 @@ function(ggml_add_cpu_backend_variant tag_name)
+     endforeach()
+ 
+     ggml_add_cpu_backend_variant_impl(${tag_name})
++    add_dependencies(ggml-cpu ggml-cpu-${tag_name})
+ endfunction()
+ 
+ ggml_add_backend(CPU)
+@@ -286,6 +287,7 @@ if (GGML_CPU_ALL_VARIANTS)
+     if (NOT GGML_BACKEND_DL)
+         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+     endif()
++    add_custom_target(ggml-cpu)
+     ggml_add_cpu_backend_variant(sandybridge AVX)
+     ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
+     ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
diff --git a/llama/sgemm.h b/llama/sgemm.h
deleted file mode 100644
index 3d2909515..000000000
--- a/llama/sgemm.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-#include <stdint.h>
-#include <stdbool.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
-                     const void *, int64_t, const void *, int64_t, void *, int64_t,
-                     int, int, int);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/llama/unicode-data.h b/llama/unicode-data.h
deleted file mode 100644
index 4bd020f9a..000000000
--- a/llama/unicode-data.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice
shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include -#include -#include -#include - -struct range_nfd { - uint32_t first; - uint32_t last; - uint32_t nfd; -}; - -static const uint32_t MAX_CODEPOINTS = 0x110000; - -extern const std::initializer_list> unicode_ranges_flags; -extern const std::unordered_set unicode_set_whitespace; -extern const std::initializer_list> unicode_map_lowercase; -extern const std::initializer_list> unicode_map_uppercase; -extern const std::initializer_list unicode_ranges_nfd; diff --git a/llm/server.go b/llm/server.go index 89e5f54a6..640c68163 100644 --- a/llm/server.go +++ b/llm/server.go @@ -29,7 +29,6 @@ import ( "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/llama" - "github.com/ollama/ollama/runners" ) type LlamaServer interface { @@ -91,8 +90,6 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) { // The gpu list must be a single family. func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { var err error - var cpuRunner string - var estimate MemoryEstimate var systemTotalMemory uint64 var systemFreeMemory uint64 var systemSwapFreeMemory uint64 @@ -107,12 +104,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter if opts.NumGPU == 0 { gpus = discover.GetCPUInfo() } - if len(gpus) == 1 && gpus[0].Library == "cpu" { - cpuRunner = runners.ServerForCpu() - estimate = EstimateGPULayers(gpus, ggml, projectors, opts) - } else { - estimate = EstimateGPULayers(gpus, ggml, projectors, opts) + estimate := EstimateGPULayers(gpus, ggml, projectors, opts) + if len(gpus) > 1 || gpus[0].Library != "cpu" { switch { case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory: // disable partial offloading when model is greater than total system memory as this @@ -120,7 +114,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter opts.NumGPU = 0 case gpus[0].Library != "metal" && estimate.Layers == 0: // Don't bother loading into the GPU if no layers can fit - cpuRunner = runners.ServerForCpu() gpus = discover.GetCPUInfo() case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": opts.NumGPU = estimate.Layers @@ -140,36 +133,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter estimate.log() - // Loop through potential servers - finalErr := errors.New("no suitable llama servers found") - - availableServers := runners.GetAvailableServers() - - var servers []string - if cpuRunner != "" { - servers = []string{cpuRunner} - } else { - servers = runners.ServersForGpu(gpus[0].RunnerName()) // All GPUs in the list are matching Library and Variant - } - demandLib := envconfig.LLMLibrary() - if demandLib != "" { - serverPath := availableServers[demandLib] - if serverPath == "" { - slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib)) - 
} else { - slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath) - servers = []string{demandLib} - if strings.HasPrefix(demandLib, "cpu") || (!(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") && demandLib == runners.BuiltinName()) { - // Omit the GPU flag to silence the warning - opts.NumGPU = -1 - } - } - } - - if len(servers) == 0 { - return nil, fmt.Errorf("no servers found for %v", gpus) - } - params := []string{ "--model", model, "--ctx-size", strconv.Itoa(opts.NumCtx), @@ -270,21 +233,49 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter params = append(params, "--multiuser-cache") } - for i := range servers { - builtin := servers[i] == runners.BuiltinName() - server := availableServers[servers[i]] - if server == "" { - // Shouldn't happen - finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers) - slog.Error("server list inconsistent", "error", finalErr) + // get available libraries + if err != nil { + return nil, fmt.Errorf("could not get libollama dir: %w", err) + } + + entries, err := os.ReadDir(discover.LibOllamaPath) + if err != nil { + return nil, fmt.Errorf("could not read libollama dir: %w", err) + } + + libs := make(map[string]string) + for _, entry := range entries { + if entry.IsDir() { + libs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name()) + } + } + + lib := gpus[0].RunnerName() + requested := envconfig.LLMLibrary() + if libs[requested] != "" { + slog.Info("using requested gpu library", "requested", requested) + lib = requested + } + + var compatible []string + for k := range libs { + // exact match first + if k == lib { + compatible = append([]string{k}, compatible...) continue } - if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) { - gpus = discover.GetCPUInfo() + // then match the family (e.g. 'cuda') + if strings.Split(k, "_")[0] == strings.Split(lib, "_")[0] { + compatible = append(compatible, k) } + } + slog.Debug("compatible gpu libraries", "compatible", compatible) - // Find an availableServers port, retry on each iteration in case the failure was a port conflict race + // iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc. + // adding each library's respective path to the LD_LIBRARY_PATH, until finally running + // without any LD_LIBRARY_PATH flags + for { port := 0 if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil { var l *net.TCPListener @@ -305,25 +296,45 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter if runtime.GOOS == "windows" { pathEnv = "PATH" } - // Start with the server directory for the LD_LIBRARY_PATH/PATH - libraryPaths := []string{filepath.Dir(server)} + var libraryPaths []string if libraryPath, ok := os.LookupEnv(pathEnv); ok { - // favor our bundled library dependencies over system libraries libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) 
} + if len(compatible) > 0 { + c := compatible[0] + if libpath, ok := libs[c]; ok { + slog.Debug("adding gpu library", "path", libpath) + libraryPaths = append(libraryPaths, libpath) + } + } + // Note: we always put the dependency path first // since this was the exact version we compiled/linked against if gpus[0].DependencyPath != nil { + slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath) // assume gpus from the same library have the same dependency path libraryPaths = append(gpus[0].DependencyPath, libraryPaths...) } + // finally, add the root library path + libraryPaths = append(libraryPaths, discover.LibOllamaPath) + + exe, err := os.Executable() + if err != nil { + return nil, fmt.Errorf("unable to lookup executable path: %w", err) + } + + exe, err = filepath.EvalSymlinks(exe) + if err != nil { + return nil, fmt.Errorf("unable to evaluate symlinks for executable path: %w", err) + } + // TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access s := &llmServer{ port: port, - cmd: exec.Command(server, finalParams...), + cmd: exec.Command(exe, finalParams...), status: NewStatusWriter(os.Stderr), options: opts, modelPath: model, @@ -394,17 +405,17 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter } if err = s.cmd.Start(); err != nil { - // Detect permission denied and augment the message about noexec - if errors.Is(err, os.ErrPermission) { - finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, server) - continue - } - msg := "" + var msg string if s.status != nil && s.status.LastErrMsg != "" { msg = s.status.LastErrMsg } - err = fmt.Errorf("error starting the external llama server: %v %s", err, msg) - finalErr = err + err := fmt.Errorf("error starting runner: %v %s", err, msg) + if len(compatible) == 0 { + return nil, err + } + + slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible) + compatible = compatible[1:] continue } @@ -413,7 +424,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter err := s.cmd.Wait() // Favor a more detailed message over the process exit status if err != nil && s.status != nil && s.status.LastErrMsg != "" { - slog.Debug("llama runner terminated", "error", err) + slog.Error("llama runner terminated", "error", err) if strings.Contains(s.status.LastErrMsg, "unknown model") { s.status.LastErrMsg = "this model is not supported by your version of Ollama. 
You may need to upgrade" } @@ -425,9 +436,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter return s, nil } - - slog.Error("unable to load any llama server", "error", finalErr) - return nil, finalErr } type ServerStatus int diff --git a/macapp/forge.config.ts b/macapp/forge.config.ts index 73ad23e83..d347eed41 100644 --- a/macapp/forge.config.ts +++ b/macapp/forge.config.ts @@ -18,8 +18,8 @@ const config: ForgeConfig = { asar: true, icon: './assets/icon.icns', extraResource: [ - '../dist/ollama', - '../dist/darwin-amd64/lib', + path.join(__dirname, '../dist/darwin/ollama'), + ...fs.readdirSync(path.join(__dirname, '../dist/darwin/amd64')).map(f => path.join(__dirname, '../dist/darwin/amd64', f)), path.join(__dirname, './assets/iconTemplate.png'), path.join(__dirname, './assets/iconTemplate@2x.png'), path.join(__dirname, './assets/iconUpdateTemplate.png'), @@ -43,7 +43,7 @@ const config: ForgeConfig = { } : {}), osxUniversal: { - x64ArchFiles: '**/ollama*', + x64ArchFiles: '*', }, }, rebuildConfig: {}, diff --git a/make/Makefile.cpu b/make/Makefile.cpu deleted file mode 100644 index 968ae9347..000000000 --- a/make/Makefile.cpu +++ /dev/null @@ -1,40 +0,0 @@ -# Build the discrete cpu runner(s) for the platform which do not rely on 3rd party GPU libraries - -include make/common-defs.make - -CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" $(TARGET_LDFLAGS)" -ifeq ($(ARCH),amd64) -ifeq ($(origin CUSTOM_CPU_FLAGS),undefined) - RUNNERS = cpu_avx cpu_avx2 -endif -endif - -DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS))) -BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS))) - -cpu: $(BUILD_RUNNERS) - -dist: $(DIST_RUNNERS) - -$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx" -$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) - @-mkdir -p $(dir $@) - GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner - -$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2" -$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) - @-mkdir -p $(dir $@) - GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner - -$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% - @-mkdir -p $(dir $@) - cp $< $@ - -clean: - rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS) - -.PHONY: clean cpu dist - -# Handy debugging for make variables -print-%: - @echo '$*=$($*)' diff --git a/make/Makefile.cuda_v11 b/make/Makefile.cuda_v11 deleted file mode 100644 index a6a81823e..000000000 --- a/make/Makefile.cuda_v11 +++ /dev/null @@ -1,13 +0,0 @@ -# Build rules for CUDA v11 runner - -include make/common-defs.make -include make/cuda-v11-defs.make - -GPU_RUNNER_VARIANT := _v11 -GPU_COMPILER=$(CUDA_11_COMPILER) -CUDA_ARCHITECTURES?=50;52;53;60;61;62;70;72;75;80;86 -GPU_LIB_DIR = $(CUDA_11_LIB_DIR) -CGO_EXTRA_LDFLAGS = $(CUDA_11_CGO_EXTRA_LDFLAGS) - -include make/cuda.make -include make/gpu.make \ No newline at end of file diff --git a/make/Makefile.cuda_v12 b/make/Makefile.cuda_v12 deleted file mode 100644 index 7c50b27b5..000000000 --- a/make/Makefile.cuda_v12 +++ /dev/null @@ -1,13 +0,0 @@ -# Build rules for CUDA 
v12 runner - -include make/common-defs.make -include make/cuda-v12-defs.make - -GPU_RUNNER_VARIANT := _v12 -GPU_COMPILER=$(CUDA_12_COMPILER) -CUDA_ARCHITECTURES?=60;61;62;70;72;75;80;86;87;89;90;90a -GPU_LIB_DIR = $(CUDA_12_LIB_DIR) -CGO_EXTRA_LDFLAGS = $(CUDA_12_CGO_EXTRA_LDFLAGS) - -include make/cuda.make -include make/gpu.make \ No newline at end of file diff --git a/make/Makefile.ollama b/make/Makefile.ollama deleted file mode 100644 index a7349a252..000000000 --- a/make/Makefile.ollama +++ /dev/null @@ -1,19 +0,0 @@ -# Makefile for building top-level ollama binary - -include make/common-defs.make - -exe: $(OLLAMA_EXE) -dist_exe dist_ollama: $(DIST_OLLAMA_EXE) - -GO_DEPS=$(foreach dir,$(shell go list -deps -f '{{.Dir}}' . ),$(wildcard $(dir)/*.go)) -CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" $(EXTRA_GOLDFLAGS) $(TARGET_LDFLAGS)" - -$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS) -$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): $(COMMON_SRCS) $(COMMON_HDRS) $(GO_DEPS) - GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ . - -.PHONY: ollama dist_ollama exe dist_exe - -# Handy debugging for make variables -print-%: - @echo '$*=$($*)' diff --git a/make/Makefile.rocm b/make/Makefile.rocm deleted file mode 100644 index 26ac6cf35..000000000 --- a/make/Makefile.rocm +++ /dev/null @@ -1,119 +0,0 @@ -# Build rules for ROCm runner -# -# Note: at present we only support a single ROCm version (whichever is default on the build system) -# unlike CUDA where we'll build both a v11 and v12 variant. - -include make/common-defs.make -include make/rocm-defs.make - -HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102 -HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- - -ifeq ($(OS),windows) - GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin") - CGO_EXTRA_LDFLAGS := -L$(shell cygpath -m -s "$(HIP_PATH)/lib") - HIP_ARCHS?=$(HIP_ARCHS_COMMON) - GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602 - GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602 -else ifeq ($(OS),linux) - GPU_LIB_DIR := $(strip $(shell ls -d $(HIP_PATH)/lib64 2>/dev/null || ls -d $(HIP_PATH)/lib 2>/dev/null)) - CGO_EXTRA_LDFLAGS := -L$(GPU_LIB_DIR) - HIP_ARCHS?=$(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX) - GPU_COMPILER_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE - GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE -endif -GPU_COMPILER=$(HIP_COMPILER) - -# TODO future multi-variant support for ROCm -# ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version))))))) -# ifneq (,$(ROCM_VERSION)) -# GPU_RUNNER_VARIANT = _v$(ROCM_VERSION) -# endif - -GPU_RUNNER_GO_TAGS := rocm -GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT) -GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64 -GPU_RUNNER_LIBS_SHORT := hipblas rocblas - -# Note: ROCm requires an extra step of discovering and copying the transitive dependencies on linux -ifeq ($(OS),windows) - ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)/lib/ollama - GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) -else ifeq ($(OS),linux) - ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)-rocm/lib/ollama - GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) - 
ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf)) - GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL)) - FILTERED_GPU_TRANSITIVE_LIBS=$(sort $(filter-out $(addprefix %,$(notdir $(GPU_LIBS))), $(GPU_TRANSITIVE_LIBS))) - GPU_DIST_TRANSITIVE_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(FILTERED_GPU_TRANSITIVE_LIBS)))) -endif -GPU_DIST_LIB_DEPS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS)))) -ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt - -ifeq ($(OS),linux) - GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++17 -else ifeq ($(OS),windows) - GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt -endif -GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch)) - -# HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw -GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS)) - -GPU_COMPILER_CUFLAGS = \ - $(GPU_COMPILER_FPIC) \ - $(addprefix -m,$(GPU_VECTOR_FLAGS)) \ - -mf16c \ - -mfma \ - -c \ - -O3 \ - -DGGML_USE_CUDA \ - -DGGML_BUILD=1 \ - -DGGML_BACKEND_BUILD=1 \ - -DGGML_SHARED=1 \ - -DGGML_BACKEND_SHARED=1 \ - -DGGML_CUDA_DMMV_X=32 \ - -DGGML_CUDA_MMV_Y=1 \ - -DGGML_SCHED_MAX_COPIES=4 \ - -DGGML_USE_HIP \ - -DGGML_USE_LLAMAFILE \ - -DHIP_FAST_MATH \ - -D__HIP_PLATFORM_AMD__=1 \ - -D__HIP_ROCclr__=1 \ - -DNDEBUG \ - -DK_QUANTS_PER_ITERATION=2 \ - -D_CRT_SECURE_NO_WARNINGS \ - -D_GNU_SOURCE \ - -D_XOPEN_SOURCE=600 \ - -DUSE_PROF_API=1 \ - -std=gnu++17 \ - -x hip \ - -mllvm=-amdgpu-early-inline-all=true \ - -mllvm=-amdgpu-function-calls=false \ - -Wno-expansion-to-defined \ - -Wno-invalid-noreturn \ - -Wno-ignored-attributes \ - -Wno-pass-failed \ - -Wno-deprecated-declarations \ - -Wno-unused-result \ - -I./llama/ - -# Workaround buggy P2P copy on some windows multi-GPU setups -# This workaround breaks linux systems with small system RAM, so only enable on windows -ifeq ($(OS),windows) - GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1 -endif - -include make/gpu.make - -# Adjust the rules from gpu.make to handle the ROCm dependencies properly -$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(GPU_DIST_TRANSITIVE_LIB_DEPS) -$(ROCBLAS_DIST_DEP_MANIFEST): - @-mkdir -p $(dir $@) - @echo "Copying rocblas library..." - (cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . 
) | (cd $(dir $@) && tar xf - ) - @echo "rocblas library copy complete" - -$(GPU_DIST_TRANSITIVE_LIB_DEPS): - @-mkdir -p $(dir $@) - $(CP) $(dir $(filter %$(notdir $@),$(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@) diff --git a/make/Makefile.sync b/make/Makefile.sync deleted file mode 100644 index 628d30e0b..000000000 --- a/make/Makefile.sync +++ /dev/null @@ -1,250 +0,0 @@ -# Helpers for managing our vendored llama.cpp repo and patch set - -REPO_ROOT:=./ -DEST_DIR:=./llama/ - -include $(DEST_DIR)vendoring - -LLAMACPP_REPO := ./llama/vendor/ - -# Relative to the vendor dir -VENDOR_RELATIVE_PATCH_DIR := ../patches/ - - -help-sync: - @echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes" - @echo "" - @echo " make apply-patches # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set" - @echo " make sync # Vendor llama.cpp and ggml from the tracking repo working tree" - @echo " make sync-clean # Remove all vendored files" - @echo " make create-patches # Generate the patch set based on the current commits in the tracking repo since the base commit" - @echo "" - @echo "For more details on the workflow, see the Vendoring section in 'docs/development.md'" - -apply-patches: $(LLAMACPP_REPO) - @if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \ - echo "ERROR: Your llama.cpp repo is dirty. The apply-patches target requires a clean working tree"; \ - echo "To clobber: git -C $(LLAMACPP_REPO) reset --hard HEAD" ; \ - exit 1; \ - fi - @echo "Checking out $(LLAMACPP_BASE_COMMIT)" - @git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \ - git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) - @echo "Applying ollama patches..." - @cd $(LLAMACPP_REPO) && git -c 'user.name=nobody' -c 'user.email=<>' am -3 $(VENDOR_RELATIVE_PATCH_DIR)*.patch || \ - echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches" - @echo "" - @echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied." - @echo "Don't forget to commit any changes you make and run 'make create-patches' " - -$(LLAMACPP_REPO): - @echo "Cloning llama.cpp to $(LLAMACPP_REPO)" - git clone https://github.com/ggerganov/llama.cpp.git $@ - -create-patches: $(LLAMACPP_REPO) - @if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \ - echo "ERROR: Your llama.cpp repo is dirty. 
You must commit any pending changes for format-patch to generate patches"; \ - exit 1; \ - fi - @cd $(LLAMACPP_REPO) && git format-patch --no-signature --no-numbered --zero-commit -o $(VENDOR_RELATIVE_PATCH_DIR) $(LLAMACPP_BASE_COMMIT) - -# Vendoring template logic -EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp -OLLAMA_NATIVE_FILES=mllama.cpp mllama.h llama_darwin.c sampling_ext.cpp sampling_ext.h -define vendor_file -$(strip $(addprefix $(2),$(notdir $1))) : $(addprefix $(LLAMACPP_REPO),$(1)) -ifneq ($$(filter-out $(EXCLUDED_FILES),$(notdir $1)),) - @echo "vendoring $1"; \ - mkdir -p $$(dir $$@) && \ - echo "/**" > $$@ && \ - echo " * llama.cpp - commit $$(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $$@ && \ - echo " *" >> $$@ && \ - sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$$$//' >> $$@ && \ - echo " */" >> $$@ && \ - echo "" >> $$@ && \ - cat $$< >> $$@ -else - @echo "vendoring $1"; \ - mkdir -p $$(dir $$@) && \ - cat $$< > $$@ -endif -VENDORED_FILES += $(strip $(addprefix $(2),$(notdir $1))) -endef - -# llama.cpp files -> llama/ -LLAMACPP_FILES=\ - src/unicode.cpp \ - src/unicode.h \ - src/unicode-data.cpp \ - src/unicode-data.h \ - src/llama.cpp \ - src/llama-adapter.cpp \ - src/llama-adapter.h \ - src/llama-arch.cpp \ - src/llama-arch.h \ - src/llama-batch.cpp \ - src/llama-batch.h \ - src/llama-chat.cpp \ - src/llama-chat.h \ - src/llama-context.cpp \ - src/llama-context.h \ - src/llama-cparams.cpp \ - src/llama-cparams.h \ - src/llama-grammar.cpp \ - src/llama-grammar.h \ - src/llama-hparams.cpp \ - src/llama-hparams.h \ - src/llama-impl.cpp \ - src/llama-impl.h \ - src/llama-kv-cache.cpp \ - src/llama-kv-cache.h \ - src/llama-mmap.cpp \ - src/llama-mmap.h \ - src/llama-model-loader.cpp \ - src/llama-model-loader.h \ - src/llama-model.cpp \ - src/llama-model.h \ - src/llama-quant.cpp \ - src/llama-quant.h \ - src/llama-sampling.cpp \ - src/llama-sampling.h \ - src/llama-vocab.cpp \ - src/llama-vocab.h \ - include/llama.h \ - include/llama-cpp.h \ - ggml/include/ggml-cpu.h \ - ggml/src/ggml-cpu/llamafile/sgemm.cpp \ - ggml/src/ggml-cpu/llamafile/sgemm.h -$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)))) - -# llama.cpp files -> llama/llamafile -LLAMAFILE_FILES= \ - ggml/src/ggml-cpu/llamafile/sgemm.h -$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)llamafile/))) - -# ggml files -> llama/ -GGML_FILES= \ - ggml/src/ggml.c \ - ggml/include/ggml.h \ - ggml/src/ggml-quants.c \ - ggml/src/ggml-quants.h \ - ggml/src/ggml-metal/ggml-metal.metal \ - ggml/include/ggml-metal.h \ - ggml/src/ggml-impl.h \ - ggml/src/ggml-threading.h \ - ggml/include/ggml-cuda.h \ - ggml/src/ggml-backend-reg.cpp \ - ggml/src/ggml-metal/ggml-metal-impl.h \ - ggml/src/ggml-common.h \ - ggml/include/ggml-backend.h \ - ggml/src/ggml-backend.cpp \ - ggml/src/ggml-backend-impl.h \ - ggml/include/ggml-alloc.h \ - ggml/src/ggml-alloc.c \ - ggml/include/ggml-blas.h \ - ggml/include/ggml-cpp.h \ - ggml/src/ggml-threading.cpp \ - ggml/src/ggml-blas/ggml-blas.cpp \ - ggml/src/ggml-cpu/ggml-cpu.c \ - ggml/src/ggml-cpu/ggml-cpu.cpp \ - ggml/src/ggml-cpu/ggml-cpu-aarch64.h \ - ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp \ - ggml/src/ggml-cpu/ggml-cpu-quants.h \ - ggml/src/ggml-cpu/ggml-cpu-quants.c \ - ggml/src/ggml-cpu/ggml-cpu-impl.h \ - ggml/src/ggml-cpu/ggml-cpu-traits.h \ - ggml/src/ggml-cpu/ggml-cpu-traits.cpp \ - ggml/src/ggml-cpu/amx/amx.h \ - ggml/src/ggml-cpu/amx/amx.cpp \ - 
ggml/src/ggml-cpu/amx/mmq.cpp \ - ggml/src/ggml-cpu/amx/mmq.h -$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)))) - -$(DEST_DIR)ggml-metal-embed.metal: $(DEST_DIR)ggml-common.h $(DEST_DIR)ggml-metal-impl.h - @sed -e '/__embed_ggml-common.h__/r $(DEST_DIR)/ggml-common.h' \ - -e '/__embed_ggml-common.h__/d' \ - < $(DEST_DIR)/ggml-metal.metal \ - > $(DEST_DIR)/ggml-metal-embed.metal.tmp - @sed -e '/#include "ggml-metal-impl.h"/r $(DEST_DIR)/ggml-metal-impl.h' \ - -e '/#include "ggml-metal-impl.h"/d' \ - < $(DEST_DIR)/ggml-metal-embed.metal.tmp \ - > $(DEST_DIR)/ggml-metal-embed.metal - @rm $(DEST_DIR)/ggml-metal-embed.metal.tmp - -VENDORED_FILES += $(DEST_DIR)ggml-metal-embed.metal - -# TODO generalize renaming pattern if we have more of these -$(DEST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal/ggml-metal.m - @echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \ - mkdir -p $(dir $@) && \ - echo "/**" > $@ && \ - echo " * llama.cpp - commit $(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $@ && \ - echo " *" >> $@ && \ - sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$//' >> $@ && \ - echo " */" >> $@ && \ - echo "" >> $@ && \ - cat $< >> $@ -VENDORED_FILES += $(DEST_DIR)ggml-metal_darwin_arm64.m - -# ggml-cuda -> llama/ggml-cuda/ -GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh -GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES))))) -$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/))) - -GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu -GGML_TEMPLATE_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES))))) -$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/template-instances/))) - -GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h -GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES))))) -$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/vendors/))) - -# llava -> llama/ -LAVA_FILES= \ - examples/llava/clip.cpp \ - examples/llava/clip.h \ - examples/llava/llava.cpp \ - examples/llava/llava.h \ - common/log.h \ - common/log.cpp \ - common/stb_image.h -# These files are mostly used by the llava code -# and shouldn't be necessary once we use clip.cpp directly -LAVA_FILES+= \ - common/common.cpp \ - common/common.h \ - common/sampling.cpp \ - common/sampling.h \ - common/json.hpp \ - common/json-schema-to-grammar.cpp \ - common/json-schema-to-grammar.h \ - common/base64.hpp -$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)))) - -$(DEST_DIR)build-info.cpp: - @echo "Generating $@" - @echo "int LLAMA_BUILD_NUMBER = 0;" > $@ - @echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@ - @echo "char const *LLAMA_COMPILER = \"\";" >> $@ - @echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@ -VENDORED_FILES += $(DEST_DIR)build-info.cpp - - -sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files - -sync-clean: - rm -f $(VENDORED_FILES) $(EXTRA_NATIVE_FILES) - -PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh -NATIVE_DIRS=$(DEST_DIR) $(DEST_DIR)llamafile/ $(DEST_DIR)ggml-cuda/ $(DEST_DIR)ggml-cuda/template-instances/ $(DEST_DIR)ggml-cuda/vendors/ -ALL_NATIVE_FILES=$(foreach 
dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS)))) -EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DEST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES)) -remove-stale-files: - @rm -f $(EXTRA_NATIVE_FILES) - -.PHONY: help-sync apply-patches sync create-patches remove-stale-files .WAIT - - -# Handy debugging for make variables -print-%: - @echo '$*=$($*)' diff --git a/make/Makefile.test b/make/Makefile.test deleted file mode 100644 index 3b27d0dbe..000000000 --- a/make/Makefile.test +++ /dev/null @@ -1,19 +0,0 @@ -# Targets to assist in running tests - -include make/common-defs.make - -test: - cd .. && go test ./... - -integration: $(OLLAMA_EXE) - cd .. && go test --tags=integration ./integration -v - -lint: - cd .. && golangci-lint run -v - -# Note: in this makefile we error instead of building to allow more fine-grained control of testing flows -$(OLLAMA_EXE): - @echo "" - @echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries" - @echo "" - @exit 1 \ No newline at end of file diff --git a/make/common-defs.make b/make/common-defs.make deleted file mode 100644 index 03504a690..000000000 --- a/make/common-defs.make +++ /dev/null @@ -1,91 +0,0 @@ -# Common definitions for the various Makefiles -# No rules are defined here so this is safe to include at the beginning of other makefiles - -OS := $(shell uname -s) -ARCH ?= $(subst aarch64,arm64,$(subst x86_64,amd64,$(shell uname -m))) -ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS))) - OS := windows - ARCH := $(shell systeminfo 2>/dev/null | grep "System Type" | grep ARM64 > /dev/null && echo "arm64" || echo "amd64" ) -else ifeq ($(OS),Linux) - OS := linux -else ifeq ($(OS),Darwin) - OS := darwin -endif -comma:= , -empty:= -space:= $(empty) $(empty) -uc = $(subst a,A,$(subst b,B,$(subst c,C,$(subst d,D,$(subst e,E,$(subst f,F,$(subst g,G,$(subst h,H,$(subst i,I,$(subst j,J,$(subst k,K,$(subst l,L,$(subst m,M,$(subst n,N,$(subst o,O,$(subst p,P,$(subst q,Q,$(subst r,R,$(subst s,S,$(subst t,T,$(subst u,U,$(subst v,V,$(subst w,W,$(subst x,X,$(subst y,Y,$(subst z,Z,$1)))))))))))))))))))))))))) - -export CGO_CFLAGS_ALLOW = -mfma|-mf16c -export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c -export HIP_PLATFORM = amd -export CGO_ENABLED=1 - -BUILD_DIR = ./llama/build/$(OS)-$(ARCH) -DIST_BASE = ./dist/$(OS)-$(ARCH) - -ifeq ($(OS),windows) - # Absolute paths with cygpath to convert to 8.3 without spaces - PWD="$(shell pwd)" - DIST_OLLAMA_EXE=$(DIST_BASE)/ollama$(EXE_EXT) -else - CCACHE:=$(shell command -v ccache 2>/dev/null || echo "") - DIST_OLLAMA_EXE=$(DIST_BASE)/bin/ollama$(EXE_EXT) -endif -DIST_LIB_DIR = $(DIST_BASE)/lib/ollama -RUNNERS_DIST_DIR = $(DIST_LIB_DIR)/runners -RUNNERS_BUILD_DIR = $(BUILD_DIR)/runners -VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g") - -# Conditionally enable ccache for cgo builds too -ifneq ($(CCACHE),) - CC?=$(CCACHE) gcc - CXX?=$(CCACHE) g++ - export CC - export CXX -endif - - -# Override in environment to tune CPU vector flags -ifeq ($(ARCH),amd64) -ifeq ($(origin CUSTOM_CPU_FLAGS),undefined) - GPU_RUNNER_CPU_FLAGS=avx - GPU_RUNNER_EXTRA_VARIANT=_avx -else - GPU_RUNNER_CPU_FLAGS=$(subst $(comma),$(space),$(CUSTOM_CPU_FLAGS)) -endif -endif - -ifeq ($(OS),windows) - CP := cp - OBJ_EXT := obj - SHARED_EXT := dll - EXE_EXT := .exe - SHARED_PREFIX := - CPU_FLAG_PREFIX := /arch: -ifneq ($(HIP_PATH),) - # If HIP_PATH has spaces, hipcc trips over them when subprocessing - HIP_PATH := $(shell cygpath -m -s 
"$(patsubst %\,%,$(HIP_PATH))") - export HIP_PATH -endif -else ifeq ($(OS),linux) - CP := cp -df - OBJ_EXT := o - SHARED_EXT := so - SHARED_PREFIX := lib - CPU_FLAG_PREFIX := -m -else - OBJ_EXT := o - SHARED_EXT := so - CPU_FLAG_PREFIX := -m - CP := cp -df -endif - -COMMON_SRCS := \ - $(wildcard ./llama/*.c) \ - $(wildcard ./llama/*.cpp) -COMMON_HDRS := \ - $(wildcard ./llama/*.h) \ - $(wildcard ./llama/*.hpp) - -OLLAMA_EXE=./ollama$(EXE_EXT) \ No newline at end of file diff --git a/make/cuda-v11-defs.make b/make/cuda-v11-defs.make deleted file mode 100644 index 264407ddc..000000000 --- a/make/cuda-v11-defs.make +++ /dev/null @@ -1,17 +0,0 @@ -# Common definitions for the various Makefiles which set cuda settings -# No rules are defined here so this is safe to include at the beginning of other makefiles - -ifeq ($(OS),windows) - CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown - CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null)) - CUDA_11_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null) - CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc.exe) - CUDA_11_LIB_DIR = $(strip $(shell ls -d $(CUDA_11_PATH)/bin 2>/dev/null)) - CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_PATH)/lib/x64" -else ifeq ($(OS),linux) - CUDA_PATH?=/usr/local/cuda - CUDA_11_PATH:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null) - CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc) - CUDA_11_LIB_DIR=$(strip $(shell ls -d $(CUDA_11_PATH)/lib64 2>/dev/null || ls -d $(CUDA_11_PATH)/lib 2>/dev/null)) - CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_LIB_DIR)" -L"$(CUDA_11_LIB_DIR)/stubs" -endif diff --git a/make/cuda-v12-defs.make b/make/cuda-v12-defs.make deleted file mode 100644 index f7c182b6f..000000000 --- a/make/cuda-v12-defs.make +++ /dev/null @@ -1,17 +0,0 @@ -# Common definitions for the various Makefiles which set cuda settings -# No rules are defined here so this is safe to include at the beginning of other makefiles - -ifeq ($(OS),windows) - CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown - CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null)) - CUDA_12_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 
2>/dev/null) - CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc.exe) - CUDA_12_LIB_DIR = $(strip $(shell ls -d $(CUDA_12_PATH)/bin 2>/dev/null)) - CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_PATH)/lib/x64" -else ifeq ($(OS),linux) - CUDA_PATH?=/usr/local/cuda - CUDA_12_PATH:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null) - CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc) - CUDA_12_LIB_DIR=$(strip $(shell ls -d $(CUDA_12_PATH)/lib64 2>/dev/null || ls -d $(CUDA_12_PATH)/lib 2>/dev/null)) - CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_LIB_DIR)" -L"$(CUDA_12_LIB_DIR)/stubs" -endif diff --git a/make/cuda.make b/make/cuda.make deleted file mode 100644 index 095663f5d..000000000 --- a/make/cuda.make +++ /dev/null @@ -1,56 +0,0 @@ -# Common definitions for all cuda versions - -ifndef GPU_RUNNER_VARIANT -dummy: - $(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables) -endif - - -GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT) -GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT) -GPU_RUNNER_DRIVER_LIB_LINK := -lcuda -GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt - -ifeq ($(OS),windows) - # On windows, nvcc uses msvc which does not support avx512vbmi avx512vnni avx512bf16, but macros can turn them on - GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS))))) - GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__) - GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__) - GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__) - GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) - GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602 - GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602 -else ifeq ($(OS),linux) - # On linux, nvcc requires avx512 -> -mavx512f -mavx512dq -mavx512bw - GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS)) - GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++17 - GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) - GPU_COMPILER_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE - GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE -endif -GPU_DIST_LIB_DEPS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS)))) - -GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \ - -DGGML_CUDA_USE_GRAPHS=1 -GPU_COMPILER_CUFLAGS = \ - $(GPU_COMPILER_EXTRA_FLAGS) \ - -Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \ - -t2 \ - -DGGML_CUDA_DMMV_X=32 \ - -DGGML_CUDA_MMV_Y=1 \ - -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \ - -DGGML_USE_CUDA=1 \ - -DGGML_SHARED=1 \ - -DGGML_BACKEND_SHARED=1 \ - -DGGML_BUILD=1 \ - -DGGML_BACKEND_BUILD=1 \ - -DGGML_USE_LLAMAFILE \ - -DK_QUANTS_PER_ITERATION=2 \ - -DNDEBUG \ - -D_GNU_SOURCE \ - -D_XOPEN_SOURCE=600 \ - -Wno-deprecated-gpu-targets \ - --forward-unknown-to-host-compiler \ - -use_fast_math \ - -I./llama/ \ - -O3 diff --git a/make/gpu.make b/make/gpu.make deleted file mode 100644 index 96e1ad224..000000000 --- a/make/gpu.make +++ /dev/null @@ -1,89 +0,0 @@ -# Generalized GPU runner build - 
-ifndef GPU_RUNNER_NAME -dummy: - $(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables) -endif - -GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" $(EXTRA_GOLDFLAGS) $(TARGET_LDFLAGS)" - -# TODO Unify how we handle dependencies in the dist/packaging and install flow -# today, cuda is bundled, but rocm is split out. Should split them each out by runner -DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR) - - -GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) - -GPU_RUNNER_SRCS := \ - $(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \ - $(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \ - llama/ggml.c llama/ggml-backend.cpp llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-threading.cpp -GPU_RUNNER_HDRS := \ - $(wildcard llama/ggml-cuda/*.cuh) - - -# Conditional flags and components to speed up developer builds -ifneq ($(OLLAMA_FAST_BUILD),) - GPU_COMPILER_CUFLAGS += \ - -DGGML_DISABLE_FLASH_ATTN -else - GPU_RUNNER_SRCS += \ - $(wildcard llama/ggml-cuda/fattn*.cu) \ - $(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \ - $(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \ - $(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \ - $(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu) -endif - -GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) -GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) -GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT))) - -DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT))) -BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT))) - - -$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) - -dist: $(DIST_RUNNERS) - -# Build targets -$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu - @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $< -$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c - @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $< -$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp - @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $< -$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/" -$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) - @-mkdir -p $(dir $@) - GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner -$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS) - @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) --shared 
-L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@ - -# Distribution targets -$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% - @-mkdir -p $(dir $@) - $(CP) $< $@ -$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS) -$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) - @-mkdir -p $(dir $@) - $(CP) $< $@ -$(GPU_DIST_LIB_DEPS): - @-mkdir -p $(dir $@) - $(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@) - -clean: - rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS) - -.PHONY: clean $(GPU_RUNNER_NAME) - - -# Handy debugging for make variables -print-%: - @echo '$*=$($*)' - diff --git a/make/rocm-defs.make b/make/rocm-defs.make deleted file mode 100644 index 76a11f296..000000000 --- a/make/rocm-defs.make +++ /dev/null @@ -1,9 +0,0 @@ -# Common definitions for the various Makefiles which set ROCm settings -# No rules are defined here so this is safe to include at the beginning of other makefiles - -ifeq ($(OS),windows) - HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc.bin.exe) -else ifeq ($(OS),linux) - HIP_PATH?=$(shell ls -d /opt/rocm 2>/dev/null) - HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc) -endif diff --git a/ml/backend/ggml/ggml/.rsync-filter b/ml/backend/ggml/ggml/.rsync-filter new file mode 100644 index 000000000..c5acbe490 --- /dev/null +++ b/ml/backend/ggml/ggml/.rsync-filter @@ -0,0 +1,22 @@ +protect *.go +protect *-embed.* +include include/ +include src/ +include src/CMakeLists.txt +include src/**/CMakeLists.txt +include src/ggml-blas/ +include src/ggml-cpu/ +include src/ggml-cpu/amx/ +include src/ggml-cpu/llamafile/ +include src/ggml-cuda/ +include src/ggml-cuda/template-instances/ +include src/ggml-hip/ +include src/ggml-metal/ +include *.c +include *.h +include *.cpp +include *.cu +include *.cuh +include *.m +include *.metal +exclude * diff --git a/ml/backend/ggml/ggml/LICENSE b/ml/backend/ggml/ggml/LICENSE new file mode 100644 index 000000000..acb96ce78 --- /dev/null +++ b/ml/backend/ggml/ggml/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023-2024 The ggml authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.
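The .rsync-filter above is the replacement mechanism for the deleted Makefile.sync vendoring templates: "include" rules select which ggml sources get copied, the final "exclude *" drops everything else, and the "protect" rules keep the hand-written Go wrappers and *-embed.* artifacts from being deleted on the receiving side. A minimal sketch of the kind of rsync invocation that consumes such a filter follows; the actual command is not part of this hunk, and the vendor checkout path and flag choices below are assumptions:

    # hypothetical sync from a llama.cpp checkout into the new tree layout;
    # "merge" loads the filter rules, and --delete prunes stale files except
    # those matched by a "protect" rule
    rsync -arvzc --delete \
        -f "merge ml/backend/ggml/ggml/.rsync-filter" \
        llama/vendor/ggml/ ml/backend/ggml/ggml/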
diff --git a/llama/ggml-alloc.h b/ml/backend/ggml/ggml/include/ggml-alloc.h similarity index 70% rename from llama/ggml-alloc.h rename to ml/backend/ggml/ggml/include/ggml-alloc.h index 960ebf301..23600eea9 100644 --- a/llama/ggml-alloc.h +++ b/ml/backend/ggml/ggml/include/ggml-alloc.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "ggml.h" diff --git a/llama/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h similarity index 94% rename from llama/ggml-backend.h rename to ml/backend/ggml/ggml/include/ggml-backend.h index b67a183f5..7221a0830 100644 --- a/llama/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #pragma once #include "ggml.h" diff --git a/ml/backend/ggml/ggml/include/ggml-blas.h b/ml/backend/ggml/ggml/include/ggml-blas.h new file mode 100644 index 000000000..87a81b363 --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-blas.h @@ -0,0 +1,25 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void); + +GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend); + +// number of threads used for conversion to float +// for openblas and blis, this will also set the number of threads used for blas operations +GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void); + + +#ifdef __cplusplus +} +#endif diff --git a/ml/backend/ggml/ggml/include/ggml-cann.h b/ml/backend/ggml/ggml/include/ggml-cann.h new file mode 100644 index 000000000..b469e228d --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-cann.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Maximum number of CANN devices supported. + */ +#define GGML_CANN_MAX_DEVICES 16 + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void); + +/** + * @brief Initializes the CANN backend for a specified device. + * + * This function initializes the CANN backend for the given device. + * It verifies the device index, allocates a context, and creates a backend + * instance. + * + * @param device The index of the device to initialize. + * @return A pointer to the initialized backend instance, or nullptr on failure. + */ +GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device); + +/** + * @brief Checks if a given backend is a CANN backend. + * + * This function verifies if the provided backend is a CANN backend by comparing + * its GUID with the CANN backend's GUID. + * + * @param backend The backend instance to check. + * @return True if the backend is a CANN backend, false otherwise. + */ +GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend); + +/** + * @brief Retrieves the CANN buffer type for a specified device. + * + * This function initializes and returns the buffer type interface associated + * with the given device. 
It ensures thread-safe access using a mutex. + * + * @param device The device index for which to retrieve the buffer type. + * @return A pointer to the buffer type interface for the specified device, or + * nullptr if the device index is out of range. + */ +GGML_BACKEND_API ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device); + +/** + * @brief Retrieves the number of CANN devices available. + * + * This function returns the number of CANN devices available based on + * information obtained from `ggml_cann_info()`. + * + * @return The number of CANN devices available. + */ +GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void); + +/** + * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. + * + * @return A pointer to the host buffer type interface. + */ +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); + +/** + * @brief Retrieves the description of a specific CANN device. + * + * This function sets the specified device, retrieves the SoC name, + * and writes it into the provided description buffer. + * + * @param device The device index to retrieve the description for. + * @param description Pointer to a buffer where the description will be written. + * @param description_size Size of the description buffer. + */ +GGML_BACKEND_API void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size); + +/** + * @brief Retrieves the memory information of a specific CANN device. + * + * This function sets the specified device, retrieves the free and total + * memory information of the specified type (ACL_HBM_MEM), and stores them + * in the provided pointers. + * + * @param device The device index to retrieve memory information for. + * @param free Pointer to a variable where the free memory size will be stored. + * @param total Pointer to a variable where the total memory size will be + * stored. + */ +GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device, + size_t* free, + size_t* total); + +#ifdef __cplusplus +} +#endif diff --git a/llama/ggml-cpp.h b/ml/backend/ggml/ggml/include/ggml-cpp.h similarity index 56% rename from llama/ggml-cpp.h rename to ml/backend/ggml/ggml/include/ggml-cpp.h index ceb54875d..219361af4 100644 --- a/llama/ggml-cpp.h +++ b/ml/backend/ggml/ggml/include/ggml-cpp.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #ifndef __cplusplus diff --git a/llama/ggml-cpu.h b/ml/backend/ggml/ggml/include/ggml-cpu.h similarity index 84% rename from llama/ggml-cpu.h rename to ml/backend/ggml/ggml/include/ggml-cpu.h index c2b64e662..3aa71badb 100644 --- a/llama/ggml-cpu.h +++ b/ml/backend/ggml/ggml/include/ggml-cpu.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "ggml.h" diff --git a/llama/ggml-cuda.h b/ml/backend/ggml/ggml/include/ggml-cuda.h similarity index 56% rename from llama/ggml-cuda.h rename to ml/backend/ggml/ggml/include/ggml-cuda.h index c0fb681e9..22ad2c009 100644 --- a/llama/ggml-cuda.h +++ b/ml/backend/ggml/ggml/include/ggml-cuda.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #pragma once #include "ggml.h" diff --git a/ml/backend/ggml/ggml/include/ggml-kompute.h b/ml/backend/ggml/ggml/include/ggml-kompute.h new file mode 100644 index 000000000..154aa56a7 --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-kompute.h @@ -0,0 +1,50 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_KOMPUTE_MAX_DEVICES 16 + +struct ggml_vk_device { + int index; + int type; // same as VkPhysicalDeviceType + size_t heapSize; + const char * name; + const char * vendor; + int subgroupSize; + uint64_t bufferAlignment; + uint64_t maxAlloc; +}; + +struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count); +bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name); +bool ggml_vk_has_vulkan(void); +bool ggml_vk_has_device(void); +struct ggml_vk_device ggml_vk_current_device(void); + +// +// backend API +// + +// forward declaration +typedef struct ggml_backend * ggml_backend_t; + +GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device); + +GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/llama/ggml-metal.h b/ml/backend/ggml/ggml/include/ggml-metal.h similarity index 66% rename from llama/ggml-metal.h rename to ml/backend/ggml/ggml/include/ggml-metal.h index c3e7023e4..669c1f84a 100644 --- a/llama/ggml-metal.h +++ b/ml/backend/ggml/ggml/include/ggml-metal.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ - // Note: this description is outdated // // An interface allowing to compute ggml_cgraph with Metal diff --git a/ml/backend/ggml/ggml/include/ggml-opencl.h b/ml/backend/ggml/ggml/include/ggml-opencl.h new file mode 100644 index 000000000..6b6177135 --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-opencl.h @@ -0,0 +1,26 @@ +#ifndef GGML_OPENCL_H +#define GGML_OPENCL_H + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// +// backend API +// +GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void); +GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void); + +#ifdef __cplusplus +} +#endif + +#endif // GGML_OPENCL_H diff --git a/ml/backend/ggml/ggml/include/ggml-opt.h b/ml/backend/ggml/ggml/include/ggml-opt.h new file mode 100644 index 000000000..eb5eab9de --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-opt.h @@ -0,0 +1,216 @@ +// This file contains functionality for training models using GGML. +// It is not strictly needed vs. just vanilla GGML but it provides a higher-level interface for common needs such as datasets. +// At the bottom of this file especially there are relatively high-level functions that are suitable for use or adaptation in user code. +// +// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de) + +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + + struct ggml_opt_dataset; + struct ggml_opt_context; + struct ggml_opt_result; + + typedef struct ggml_opt_dataset * ggml_opt_dataset_t; + typedef struct ggml_opt_context * ggml_opt_context_t; + typedef struct ggml_opt_result * ggml_opt_result_t; + + // ====== Loss ====== + + // built-in loss types, i.e.
the built-in quantities minimized by the optimizer + // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value + enum ggml_opt_loss_type { + GGML_OPT_LOSS_TYPE_MEAN, + GGML_OPT_LOSS_TYPE_SUM, + GGML_OPT_LOSS_TYPE_CROSS_ENTROPY, + GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR, + }; + + // ====== Dataset ====== + + GGML_API ggml_opt_dataset_t ggml_opt_dataset_init( + int64_t ne_datapoint, // number of elements per datapoint + int64_t ne_label, // number of elements per label + int64_t ndata, // total number of datapoints/labels + int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) + GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset); + + // get underlying tensors that store the data + GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata] + GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label, ndata] + + // shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative + GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata); + + // get batch at position ibatch from dataset and copy the data to data_batch and labels_batch + GGML_API void ggml_opt_dataset_get_batch( + ggml_opt_dataset_t dataset, + struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch] + struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch] + int64_t ibatch); + + // ====== Model / Context ====== + + enum ggml_opt_build_type { + GGML_OPT_BUILD_TYPE_FORWARD, + GGML_OPT_BUILD_TYPE_GRAD, + GGML_OPT_BUILD_TYPE_OPT, + }; + + // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss + struct ggml_opt_optimizer_params { + // AdamW optimizer parameters + struct { + float alpha; // learning rate + float beta1; + float beta2; + float eps; // epsilon for numerical stability + float wd; // weight decay for AdamW, use 0.0f to disable + } adamw; + }; + + // callback to calculate optimizer parameters prior to a backward pass + // userdata can be used to pass arbitrary data + typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata); + + // returns the default optimizer params (constant) + // userdata is not used + GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata); + + // parameters for initializing a new optimization context + struct ggml_opt_params { + ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs + + struct ggml_context * ctx_compute; // created in user code, holds non-static tensors + + // the forward graph is defined by inputs and outputs + // those tensors and all tensors in between are not intended to be reusable between multiple optimization contexts + struct ggml_tensor * inputs; + struct ggml_tensor * outputs; + + enum ggml_opt_loss_type loss_type; + enum ggml_opt_build_type build_type; + + int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done + + ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters + void * get_opt_pars_ud; // userdata for calculating optimizer parameters + }; + + // get parameters for an optimization context with defaults set where possible + // parameters for which no sensible defaults 
exist are supplied as arguments to this function + GGML_API ggml_opt_params ggml_opt_default_params( + ggml_backend_sched_t backend_sched, + struct ggml_context * ctx_compute, + struct ggml_tensor * inputs, + struct ggml_tensor * outputs, + enum ggml_opt_loss_type loss_type); + + GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); + GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); + + // set gradients to zero, initialize loss, and optionally reset the optimizer + GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); + + // get underlying tensors that store data + GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor + GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor + GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against + GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss + GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs + GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels + + GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); + + // ====== Optimization Result ====== + + GGML_API ggml_opt_result_t ggml_opt_result_init(); + GGML_API void ggml_opt_result_free(ggml_opt_result_t result); + GGML_API void ggml_opt_result_reset(ggml_opt_result_t result); + + // get data from result, uncertainties are optional and can be ignored by passing NULL + GGML_API void ggml_opt_result_ndata( ggml_opt_result_t result, int64_t * ndata); // writes 1 value, number of datapoints + GGML_API void ggml_opt_result_loss( ggml_opt_result_t result, double * loss, double * unc); // writes 1 value + GGML_API void ggml_opt_result_pred( ggml_opt_result_t result, int32_t * pred); // writes ndata values + GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value + + // ====== Computation ====== + + // do forward pass, increment result if not NULL + GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); + + // do forward pass, increment result if not NULL, do backward pass + GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); + + // ############################################################################ + // ## The high-level functions start here. They do not depend on any private ## + // ## functions or structs and can be copied to and adapted for user code. ## + // ############################################################################ + + // ====== Intended Usage ====== + // + // 1. Select the appropriate loss for your problem. + // 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them. + // Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster). + // 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors. + // The first context should contain the model parameters and inputs and be allocated statically in user code. + // The second context should contain all other tensors and will be (re)allocated automatically.
+ // Due to this automated allocation the data of the second context is not defined when accessed in user code. + // Note that the second dimension of the inputs/outputs is interpreted as the number of datapoints in those tensors. + // 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead. + + // signature for a callback while evaluating opt_ctx on dataset, called after an evaluation + typedef void (*ggml_opt_epoch_callback)( + bool train, // true after training evaluation, false after validation evaluation + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, // result associated with the dataset subsection + int64_t ibatch, // number of batches that have been evaluated so far + int64_t ibatch_max, // total number of batches in this dataset subsection + int64_t t_start_us); // time at which the evaluation on the dataset subsection was started + + // do training on front of dataset, do evaluation only on back of dataset + GGML_API void ggml_opt_epoch( + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, // result to increment during training, ignored if NULL + ggml_opt_result_t result_eval, // result to increment during evaluation, ignored if NULL + int64_t idata_split, // data index at which to split training and evaluation + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval); + + // callback that prints a progress bar on stderr + GGML_API void ggml_opt_epoch_callback_progress_bar( + bool train, + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + int64_t ibatch, + int64_t ibatch_max, + int64_t t_start_us); + + // fit model defined by inputs and outputs to dataset + GGML_API void ggml_opt_fit( + ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs + ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs + ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] + ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used + ggml_opt_dataset_t dataset, // dataset with data and optionally also labels + enum ggml_opt_loss_type loss_type, // loss to minimize + ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) + int64_t nepoch, // how many times the dataset should be iterated over + int64_t nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs + float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f) + bool silent); // whether or not info prints to stderr should be suppressed + +#ifdef __cplusplus +} +#endif diff --git a/ml/backend/ggml/ggml/include/ggml-rpc.h b/ml/backend/ggml/ggml/include/ggml-rpc.h new file mode 100644 index 000000000..ade6c3b0e --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-rpc.h @@ -0,0 +1,28 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_RPC_MAX_SERVERS 16 + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); +GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); + +GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * 
total); + +GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void); + +GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint); + +#ifdef __cplusplus +} +#endif diff --git a/ml/backend/ggml/ggml/include/ggml-sycl.h b/ml/backend/ggml/ggml/include/ggml-sycl.h new file mode 100644 index 000000000..5ce349a88 --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-sycl.h @@ -0,0 +1,49 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#define GGML_SYCL_NAME "SYCL" +#define GGML_SYCL_MAX_DEVICES 48 + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device); + +GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend); + +// device buffer +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); + +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); + +GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void); +GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len); +GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device, + char *description, + size_t description_size); +GGML_BACKEND_API int ggml_backend_sycl_get_device_count(); +GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); + +// SYCL doesn't support registering host memory, keep here for reference +// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); +// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ml/backend/ggml/ggml/include/ggml-vulkan.h b/ml/backend/ggml/ggml/include/ggml-vulkan.h new file mode 100644 index 000000000..53cdba072 --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-vulkan.h @@ -0,0 +1,31 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_VK_NAME "Vulkan" +#define GGML_VK_MAX_DEVICES 16 + +GGML_BACKEND_API void ggml_vk_instance_init(void); + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); + +GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend); +GGML_BACKEND_API int ggml_backend_vk_get_device_count(void); +GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); +GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/llama/ggml.h 
b/ml/backend/ggml/ggml/include/ggml.h similarity index 98% rename from llama/ggml.h rename to ml/backend/ggml/ggml/include/ggml.h index 621362c83..1bc50fcae 100644 --- a/llama/ggml.h +++ b/ml/backend/ggml/ggml/include/ggml.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once // diff --git a/ml/backend/ggml/ggml/src/CMakeLists.txt b/ml/backend/ggml/ggml/src/CMakeLists.txt new file mode 100644 index 000000000..72b488dd8 --- /dev/null +++ b/ml/backend/ggml/ggml/src/CMakeLists.txt @@ -0,0 +1,340 @@ +include(CheckCXXCompilerFlag) + +add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) + +# enable libstdc++ assertions for debug builds +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions($<$:_GLIBCXX_ASSERTIONS>) +endif() + +if (NOT MSVC) + if (GGML_SANITIZE_THREAD) + add_compile_options(-fsanitize=thread) + link_libraries (-fsanitize=thread) + endif() + + if (GGML_SANITIZE_ADDRESS) + add_compile_options(-fsanitize=address -fno-omit-frame-pointer) + link_libraries (-fsanitize=address) + endif() + + if (GGML_SANITIZE_UNDEFINED) + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() +endif() + +function(ggml_get_flags CCID CCVER) + set(C_FLAGS "") + set(CXX_FLAGS "") + + if (CCID MATCHES "Clang") + set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) + set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) + + if ( + (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR + (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) + ) + list(APPEND C_FLAGS -Wdouble-promotion) + endif() + elseif (CCID STREQUAL "GNU") + set(C_FLAGS -Wdouble-promotion) + set(CXX_FLAGS -Wno-array-bounds) + + if (CCVER VERSION_GREATER_EQUAL 8.1.0) + list(APPEND CXX_FLAGS -Wextra-semi) + endif() + endif() + + set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) + set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) +endfunction() + +if (GGML_FATAL_WARNINGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + list(APPEND CXX_FLAGS -Werror) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) + endif() +endif() + +if (GGML_ALL_WARNINGS) + if (NOT MSVC) + list(APPEND WARNING_FLAGS -Wall -Wextra 
-Wpedantic -Wcast-qual -Wno-unused-function) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) + + ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) + + add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>" + "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") + else() + # todo : msvc + set(C_FLAGS "") + set(CXX_FLAGS "") + endif() +endif() + +if (GGML_LTO) + include(CheckIPOSupported) + check_ipo_supported(RESULT result OUTPUT output) + if (result) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + else() + message(WARNING "IPO is not supported: ${output}") + endif() +endif() + +if (GGML_CCACHE) + find_program(GGML_CCACHE_FOUND ccache) + + if (GGML_CCACHE_FOUND) + # TODO: should not be set globally + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set(ENV{CCACHE_SLOPPINESS} time_macros) + message(STATUS "ccache found, compilation results will be cached. Disable with GGML_CCACHE=OFF.") + else() + message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF") + endif () +endif() + +# this version of Apple ld64 is buggy +execute_process( + COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v + ERROR_VARIABLE output + OUTPUT_QUIET +) + +if (output MATCHES "dyld-1015\.7") + add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) +endif() + +# architecture specific +# TODO: probably these flags need to be tweaked on some architectures +# feel free to update the Makefile for your architecture and send a pull request or issue +message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") +if (MSVC) + string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) + message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") +else () + set(CMAKE_GENERATOR_PLATFORM_LWR "") +endif () + +if (NOT MSVC) + if (GGML_STATIC) + add_link_options(-static) + if (MINGW) + add_link_options(-static-libgcc -static-libstdc++) + endif() + endif() + if (GGML_GPROF) + add_compile_options(-pg) + endif() +endif() + +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) +endif() + +# +# POSIX conformance +# + +# clock_gettime came in POSIX.1b (1993) +# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional +# posix_memalign came in POSIX.1-2001 / SUSv3 +# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) + +# Somehow in OpenBSD whenever POSIX conformance is specified +# some string functions rely on locale_t availability, +# which was introduced in POSIX.1-2008, forcing us to go higher +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_compile_definitions(_XOPEN_SOURCE=700) +else() + add_compile_definitions(_XOPEN_SOURCE=600) +endif() + +# Data types, macros and functions related to controlling CPU affinity and +# some memory allocation are available on Linux through GNU extensions in libc +if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android") + add_compile_definitions(_GNU_SOURCE) +endif() + +# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, +# and on macOS its availability depends on enabling Darwin extensions +# similarly on DragonFly, enabling BSD extensions is necessary +if ( + CMAKE_SYSTEM_NAME MATCHES "Darwin" OR + CMAKE_SYSTEM_NAME 
MATCHES "iOS" OR + CMAKE_SYSTEM_NAME MATCHES "tvOS" OR + CMAKE_SYSTEM_NAME MATCHES "DragonFly" +) + add_compile_definitions(_DARWIN_C_SOURCE) +endif() + +# alloca is a non-standard interface that is not visible on BSDs when +# POSIX conformance is specified, but not all of them provide a clean way +# to enable it in such cases +if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + add_compile_definitions(__BSD_VISIBLE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") + add_compile_definitions(_NETBSD_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_compile_definitions(_BSD_SOURCE) +endif() + +if (WIN32) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) +endif() + +# ggml + +if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS) + message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS") +endif() + +add_library(ggml-base + ../include/ggml.h + ../include/ggml-alloc.h + ../include/ggml-backend.h + ../include/ggml-cpp.h + ../include/ggml-opt.h + ggml.c + ggml-alloc.c + ggml-backend.cpp + ggml-opt.cpp + ggml-threading.cpp + ggml-threading.h + ggml-quants.c + ggml-quants.h) + +target_include_directories(ggml-base PRIVATE .) + +add_library(ggml + ggml-backend-reg.cpp) + +target_link_libraries(ggml PUBLIC ggml-base) + +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + target_link_libraries(ggml PRIVATE dl) +endif() + +function(ggml_add_backend_library backend) + if (GGML_BACKEND_DL) + add_library(${backend} MODULE ${ARGN}) + # write the shared library to the output directory + set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) + add_dependencies(ggml ${backend}) + else() + add_library(${backend} ${ARGN}) + target_link_libraries(ggml PUBLIC ${backend}) + install(TARGETS ${backend} LIBRARY) + endif() + + target_link_libraries(${backend} PRIVATE ggml-base) + target_include_directories(${backend} PRIVATE ..) 
+ + if (${BUILD_SHARED_LIBS}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD) + target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED) + endif() +endfunction() + +function(ggml_add_backend backend) + string(TOUPPER "GGML_${backend}" backend_id) + if (${backend_id}) + string(TOLOWER "ggml-${backend}" backend_target) + add_subdirectory(${backend_target}) + message(STATUS "Including ${backend} backend") + if (NOT GGML_BACKEND_DL) + string(TOUPPER "GGML_USE_${backend}" backend_use) + target_compile_definitions(ggml PUBLIC ${backend_use}) + endif() + endif() +endfunction() + +function(ggml_add_cpu_backend_variant tag_name) + set(GGML_CPU_TAG_NAME ${tag_name}) + # other: OPENMP LLAMAFILE CPU_HBM + foreach (feat NATIVE + AVX AVX2 AVX_VNNI FMA F16C + AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 + AMX_TILE AMX_INT8 AMX_BF16) + set(GGML_${feat} OFF) + endforeach() + + foreach (feat ${ARGN}) + set(GGML_${feat} ON) + endforeach() + + ggml_add_cpu_backend_variant_impl(${tag_name}) + add_dependencies(ggml-cpu ggml-cpu-${tag_name}) +endfunction() + +ggml_add_backend(CPU) + +if (GGML_CPU_ALL_VARIANTS) + if (NOT GGML_BACKEND_DL) + message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") + endif() + add_custom_target(ggml-cpu) + ggml_add_cpu_backend_variant(sandybridge AVX) + ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA) + ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI) + if (NOT MSVC) + # MSVC doesn't support AMX + ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) + endif() +else () + ggml_add_cpu_backend_variant_impl("") +endif() + +ggml_add_backend(BLAS) +ggml_add_backend(CANN) +ggml_add_backend(CUDA) +ggml_add_backend(HIP) +ggml_add_backend(Kompute) +ggml_add_backend(METAL) +ggml_add_backend(MUSA) +ggml_add_backend(RPC) +ggml_add_backend(SYCL) +ggml_add_backend(Vulkan) +ggml_add_backend(OpenCL) + +foreach (target ggml-base ggml) + target_include_directories(${target} PUBLIC $ $) + target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump +endforeach() + +target_link_libraries(ggml-base PRIVATE Threads::Threads) + +find_library(MATH_LIBRARY m) +if (MATH_LIBRARY) + if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT}) + target_link_libraries(ggml-base PRIVATE m) + endif() +endif() + +if (CMAKE_SYSTEM_NAME MATCHES "Android") + target_link_libraries(ggml-base PRIVATE dl) +endif() + +if (BUILD_SHARED_LIBS) + foreach (target ggml-base ggml) + set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(${target} PRIVATE GGML_BUILD) + target_compile_definitions(${target} PUBLIC GGML_SHARED) + endforeach() +endif() diff --git a/llama/ggml-alloc.c b/ml/backend/ggml/ggml/src/ggml-alloc.c similarity index 96% rename from llama/ggml-alloc.c rename to ml/backend/ggml/ggml/src/ggml-alloc.c index 6ea83a901..8dc8226ac 100644 --- a/llama/ggml-alloc.c +++ b/ml/backend/ggml/ggml/src/ggml-alloc.c @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, 
including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "ggml-alloc.h" #include "ggml-backend-impl.h" #include "ggml.h" diff --git a/llama/ggml-backend-impl.h b/ml/backend/ggml/ggml/src/ggml-backend-impl.h similarity index 90% rename from llama/ggml-backend-impl.h rename to ml/backend/ggml/ggml/src/ggml-backend-impl.h index 37b592077..36d72e95f 100644 --- a/llama/ggml-backend-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-
 #pragma once
 
 // ggml-backend internal header
diff --git a/llama/ggml-backend-reg.cpp b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
similarity index 90%
rename from llama/ggml-backend-reg.cpp
rename to ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
index 2ebc34398..ac5cda072 100644
--- a/llama/ggml-backend-reg.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 #include "ggml-impl.h"
@@ -176,7 +150,7 @@ struct ggml_backend_reg_entry {
 
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_entry> backends;
-    std::vector<ggml_backend_dev_t> devices;
+    std::vector<std::pair<ggml_backend_dev_t, int>> devices;
 
     ggml_backend_registry() {
 #ifdef GGML_USE_CUDA
@@ -221,7 +195,7 @@
         }
     }
 
-    void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
+    void register_backend(ggml_backend_reg_t reg, int score = -1, dl_handle_ptr handle = nullptr) {
         if (!reg) {
             return;
         }
@@ -232,15 +206,15 @@
 #endif
         backends.push_back({ reg, std::move(handle) });
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
-            register_device(ggml_backend_reg_dev_get(reg, i));
+            register_device(ggml_backend_reg_dev_get(reg, i), score);
         }
     }
 
-    void register_device(ggml_backend_dev_t device) {
+    void register_device(ggml_backend_dev_t device, int score = -1) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
 #endif
-        devices.push_back(device);
+        devices.push_back({device, score});
     }
 
     ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
@@ -283,7 +257,7 @@
 
         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
-        register_backend(reg, std::move(handle));
+        register_backend(reg, score_fn ? score_fn() : -1, std::move(handle));
 
         return reg;
     }
@@ -306,7 +280,7 @@
         // remove devices
         devices.erase(
             std::remove_if(devices.begin(), devices.end(),
-                [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
+                [reg](std::pair<ggml_backend_dev_t, int> dev) { return ggml_backend_dev_backend_reg(dev.first) == reg; }),
             devices.end());
 
         // remove backend
@@ -364,7 +338,12 @@ size_t ggml_backend_dev_count() {
 
 ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
     GGML_ASSERT(index < ggml_backend_dev_count());
-    return get_reg().devices[index];
+    auto devices = get_reg().devices;
+    if (!std::is_heap(devices.begin(), devices.end())) {
+        std::make_heap(devices.begin(), devices.end(), [](const auto & a, const auto & b) { return a.second < b.second; });
+    }
+
+    return devices[index].first;
 }
 
 ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
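For illustration (not part of the patch): because registration now records a score and ggml_backend_dev_get orders the device list so higher-scoring registrations come first, a plain front-to-back enumeration already reflects the preferred backend variant. The four ggml_backend_dev_* calls below are the public ggml-backend.h API; list_devices is a made-up name.

#include "ggml-backend.h"
#include <cstdio>

// print devices in the order the registry exposes them (highest score first)
static void list_devices(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s (%s)\n", i,
               ggml_backend_dev_name(dev),
               ggml_backend_dev_description(dev));
    }
}

diff --git a/llama/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp
similarity index 98%
rename from llama/ggml-backend.cpp
rename to ml/backend/ggml/ggml/src/ggml-backend.cpp
index 3e11d73fc..a12172dcd 100644
--- a/llama/ggml-backend.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.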
- */ - // Note: porting this file to C++ is a work in progress #ifdef _WIN32 @@ -132,12 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } - -// TODO: this needs to be freed in cuda and hip backends because -// the cuda backend implementation compiled with msvc -#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIP) - delete buffer; -#endif } size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { diff --git a/ml/backend/ggml/ggml/src/ggml-blas/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-blas/CMakeLists.txt new file mode 100644 index 000000000..0bf3c05d9 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-blas/CMakeLists.txt @@ -0,0 +1,87 @@ +if (GGML_STATIC) + set(BLA_STATIC ON) +endif() +#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) +# set(BLA_SIZEOF_INTEGER 8) +#endif() + +set(BLA_VENDOR ${GGML_BLAS_VENDOR}) +find_package(BLAS) + +if (BLAS_FOUND) + message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") + + ggml_add_backend_library(ggml-blas + ggml-blas.cpp + ) + + if (${GGML_BLAS_VENDOR} MATCHES "Apple") + add_compile_definitions(ACCELERATE_NEW_LAPACK) + add_compile_definitions(ACCELERATE_LAPACK_ILP64) + add_compile_definitions(GGML_BLAS_USE_ACCELERATE) + elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "") + # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. + # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 + find_package(PkgConfig REQUIRED) + if (${GGML_BLAS_VENDOR} MATCHES "Generic") + pkg_check_modules(DepBLAS blas) + elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS") + # As of openblas v0.3.22, the 64-bit is named openblas64.pc + pkg_check_modules(DepBLAS openblas64) + if (NOT DepBLAS_FOUND) + pkg_check_modules(DepBLAS openblas) + endif() + elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME") + add_compile_definitions(GGML_BLAS_USE_BLIS) + pkg_check_modules(DepBLAS blis) + elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS") + pkg_check_modules(DepBLAS blas-atlas) + elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS") + pkg_check_modules(DepBLAS flexiblas_api) + elseif (${GGML_BLAS_VENDOR} MATCHES "Intel") + add_compile_definitions(GGML_BLAS_USE_MKL) + # all Intel* libraries share the same include path + pkg_check_modules(DepBLAS mkl-sdl) + elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC") + # this doesn't provide pkg-config + # suggest to assign BLAS_INCLUDE_DIRS on your own + if ("${NVHPC_VERSION}" STREQUAL "") + message(WARNING "Better to set NVHPC_VERSION") + else() + set(DepBLAS_FOUND ON) + set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") + endif() + endif() + if (DepBLAS_FOUND) + set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" + " detected by pkgconfig, trying to find cblas.h from possible paths...") + find_path(BLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS + /usr/include + /usr/local/include + /usr/include/openblas + /opt/homebrew/opt/openblas/include + /usr/local/opt/openblas/include + /usr/include/x86_64-linux-gnu/openblas/include + ) + endif() + endif() + + message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") + + target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS}) + + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel")) + add_compile_definitions(GGML_BLAS_USE_MKL) + endif() + + target_link_libraries (ggml-blas PRIVATE 
${BLAS_LIBRARIES}) + target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS}) +else() + message(ERROR "BLAS not found, please refer to " + "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" + " to set correct GGML_BLAS_VENDOR") +endif() diff --git a/ml/backend/ggml/ggml/src/ggml-blas/blas.go b/ml/backend/ggml/ggml/src/ggml-blas/blas.go new file mode 100644 index 000000000..b29c9f140 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-blas/blas.go @@ -0,0 +1,10 @@ +//go:build darwin && arm64 + +package blas + +// #cgo CXXFLAGS: -std=c++11 +// #cgo CPPFLAGS: -DGGML_USE_BLAS +// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../../include +// #cgo darwin,arm64 CPPFLAGS: -DGGML_BLAS_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 +// #cgo darwin,arm64 LDFLAGS: -framework Accelerate +import "C" diff --git a/llama/ggml-blas.cpp b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp similarity index 92% rename from llama/ggml-blas.cpp rename to ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp index 44acf0bd4..ec158dfac 100644 --- a/llama/ggml-blas.cpp +++ b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp @@ -1,31 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifdef GGML_USE_BLAS - #include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" @@ -543,5 +515,3 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) { } GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) - -#endif // GGML_USE_BLAS \ No newline at end of file diff --git a/llama/ggml-common.h b/ml/backend/ggml/ggml/src/ggml-common.h similarity index 99% rename from llama/ggml-common.h rename to ml/backend/ggml/ggml/src/ggml-common.h index e227c13fb..f13fd4dea 100644 --- a/llama/ggml-common.h +++ b/ml/backend/ggml/ggml/src/ggml-common.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #ifndef GGML_COMMON_DECL #if defined(GGML_COMMON_DECL_C) diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt new file mode 100644 index 000000000..6b3641c42 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt @@ -0,0 +1,346 @@ +function(ggml_add_cpu_backend_variant_impl tag_name) + if (tag_name) + set(GGML_CPU_NAME ggml-cpu-${tag_name}) + else() + set(GGML_CPU_NAME ggml-cpu) + endif() + + ggml_add_backend_library(${GGML_CPU_NAME}) + + list (APPEND GGML_CPU_SOURCES + ggml-cpu/ggml-cpu.c + ggml-cpu/ggml-cpu.cpp + ggml-cpu/ggml-cpu-aarch64.cpp + ggml-cpu/ggml-cpu-aarch64.h + ggml-cpu/ggml-cpu-hbm.cpp + ggml-cpu/ggml-cpu-hbm.h + ggml-cpu/ggml-cpu-quants.c + ggml-cpu/ggml-cpu-quants.h + ggml-cpu/ggml-cpu-traits.cpp + ggml-cpu/ggml-cpu-traits.h + ggml-cpu/amx/amx.cpp + ggml-cpu/amx/amx.h + ggml-cpu/amx/mmq.cpp + ggml-cpu/amx/mmq.h + ggml-cpu/ggml-cpu-impl.h + ) + + target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17) + target_include_directories(${GGML_CPU_NAME} PRIVATE . 
ggml-cpu) + + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64) + + target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK}) + else() + message(WARNING "Accelerate framework not found") + endif() + endif() + + if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) + + target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() + endif() + + if (GGML_LLAMAFILE) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE) + + list(APPEND GGML_CPU_SOURCES + ggml-cpu/llamafile/sgemm.cpp + ggml-cpu/llamafile/sgemm.h) + endif() + + if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) + + message(STATUS "Using memkind for CPU HBM") + + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM) + + target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind) + endif() + + if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR + CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) + + message(STATUS "ARM detected") + + if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang") + message(FATAL_ERROR "MSVC is not supported for ARM, use clang") + else() + check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") + list(APPEND ARCH_FLAGS -mfp16-format=ieee) + endif() + + if (GGML_NATIVE) + # -mcpu=native does not always enable all the features in some compilers, + # so we check for them manually and enable them if available + + execute_process( + COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v - + INPUT_FILE "/dev/null" + OUTPUT_QUIET + ERROR_VARIABLE ARM_MCPU + RESULT_VARIABLE ARM_MCPU_RESULT + ) + if (NOT ARM_MCPU_RESULT) + string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}") + endif() + if ("${ARM_MCPU_FLAG}" STREQUAL "") + set(ARM_MCPU_FLAG -mcpu=native) + message(STATUS "ARM -mcpu not found, -mcpu=native will be used") + endif() + + include(CheckCXXSourceRuns) + + function(check_arm_feature tag code) + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}") + check_cxx_source_runs( + "${code}" + GGML_MACHINE_SUPPORTS_${tag} + ) + if (GGML_MACHINE_SUPPORTS_${tag}) + set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE) + else() + set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE) + endif() + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + endfunction() + + check_arm_feature(dotprod "#include \nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }") + check_arm_feature(i8mm "#include \nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }") + check_arm_feature(sve "#include \nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }") + + list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}") + else() + if (GGML_CPU_ARM_ARCH) + list(APPEND 
ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH}) + endif() + endif() + + # show enabled features + if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + set(FEAT_INPUT_FILE "NUL") + else() + set(FEAT_INPUT_FILE "/dev/null") + endif() + + execute_process( + COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E - + INPUT_FILE ${FEAT_INPUT_FILE} + OUTPUT_VARIABLE ARM_FEATURE + RESULT_VARIABLE ARM_FEATURE_RESULT + ) + if (ARM_FEATURE_RESULT) + message(WARNING "Failed to get ARM features") + else() + foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC) + string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos) + if (NOT ${feature_pos} EQUAL -1) + message(STATUS "ARM feature ${feature} enabled") + endif() + endforeach() + endif() + endif() + elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$")) + + message(STATUS "x86 detected") + + if (MSVC) + # instruction set detection for MSVC only + if (GGML_NATIVE) + include(ggml-cpu/cmake/FindSIMD.cmake) + endif () + if (GGML_AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) + # /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__ + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + list(APPEND ARCH_DEFINITIONS GGML_AVX512) + if (GGML_AVX512_VBMI) + list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16) + endif() + elseif (GGML_AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) + list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C) + elseif (GGML_AVX) + list(APPEND ARCH_FLAGS /arch:AVX) + list(APPEND ARCH_DEFINITIONS GGML_AVX) + else () + list(APPEND ARCH_FLAGS /arch:SSE4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI) + endif() + else () + if (GGML_NATIVE) + list(APPEND ARCH_FLAGS -march=native) + else () + list(APPEND ARCH_FLAGS -msse4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + if (GGML_F16C) + list(APPEND ARCH_FLAGS -mf16c) + list(APPEND ARCH_DEFINITIONS GGML_F16C) + endif() + if (GGML_FMA) + list(APPEND ARCH_FLAGS -mfma) + list(APPEND ARCH_DEFINITIONS GGML_FMA) + endif() + if (GGML_AVX) + list(APPEND ARCH_FLAGS -mavx) + list(APPEND ARCH_DEFINITIONS GGML_AVX) + endif() + if (GGML_AVX2) + list(APPEND ARCH_FLAGS -mavx2) + list(APPEND ARCH_DEFINITIONS GGML_AVX2) + endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_FLAGS -mavxvnni) + list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI) + endif() + if (GGML_AVX512) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512cd) + 
list(APPEND ARCH_FLAGS -mavx512vl) + list(APPEND ARCH_FLAGS -mavx512dq) + list(APPEND ARCH_FLAGS -mavx512bw) + list(APPEND ARCH_DEFINITIONS GGML_AVX512) + endif() + if (GGML_AVX512_VBMI) + list(APPEND ARCH_FLAGS -mavx512vbmi) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI) + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_FLAGS -mavx512vnni) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI) + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_FLAGS -mavx512bf16) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16) + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_FLAGS -mamx-tile) + list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_FLAGS -mamx-int8) + list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_FLAGS -mamx-bf16) + list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16) + endif() + endif() + endif() + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PowerPC detected") + execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) + string(FIND "${POWER10_M}" "POWER10" substring_index) + if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "") + set(substring_index -1) + endif() + + if (${substring_index} GREATER_EQUAL 0) + list(APPEND ARCH_FLAGS -mcpu=power10) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + else() + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) + # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + endif() + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + message(STATUS "loongarch64 detected") + + list(APPEND ARCH_FLAGS -march=loongarch64) + if (GGML_LASX) + list(APPEND ARCH_FLAGS -mlasx) + endif() + if (GGML_LSX) + list(APPEND ARCH_FLAGS -mlsx) + endif() + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") + message(STATUS "RISC-V detected") + if (GGML_RVV) + list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) + endif() + else() + message(STATUS "Unknown architecture") + endif() + + if (GGML_CPU_AARCH64) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64) + endif() + + message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}") + target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES}) + target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS}) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS}) + + if (GGML_BACKEND_DL) + if (GGML_NATIVE) + # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE + message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS") + endif() + + # The feature detection code is compiled as a separate target so that + # it can be built without the architecture flags + # Since multiple variants of the CPU backend may be included in the same + # build, using set_source_files_properties() to set the arch flags is not possible + set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats) + add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp) + target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. 
../include) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS}) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) + set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME}) + endif() + + if (EMSCRIPTEN) + set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128") + endif() +endfunction() diff --git a/llama/amx.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.cpp similarity index 86% rename from llama/amx.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.cpp index a2c7e8e5a..5ec5263ce 100644 --- a/llama/amx.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "amx.h" #include "common.h" #include "mmq.h" diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.h b/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.h new file mode 100644 index 000000000..5b65d76bd --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.h @@ -0,0 +1,8 @@ +#include "ggml-backend.h" +#include "ggml-cpu-impl.h" + +// GGML internal header + +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) +ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); +#endif diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/amx/common.h b/ml/backend/ggml/ggml/src/ggml-cpu/amx/common.h new file mode 100644 index 000000000..f392e8985 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/common.h @@ -0,0 +1,91 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cpu-impl.h" + +#include +#include +#include + +#if defined(GGML_USE_OPENMP) +#include +#endif + +#define TILE_M 16 +#define TILE_N 16 +#define TILE_K 32 +#define VNNI_BLK 4 + +#define AMX_BLK_SIZE 32 + +#define TMM0 0 +#define TMM1 1 +#define TMM2 2 +#define TMM3 3 +#define TMM4 4 +#define TMM5 5 +#define TMM6 6 +#define TMM7 7 + +// parallel routines +template ::value, int>::type = 0> +inline T div_up(T x, T y) { return (x + y - 1) / y; } + +template +inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) { +#if 0 + // onednn partition pattern + T& n_my = n_end; + if (nth <= 1 || n == 0) { + n_start = 0; + n_my = n; + } else { + T n1 = div_up(n, nth); + T n2 = n1 - 1; + T T1 = n - n2 * nth; + n_my = ith < T1 ? n1 : n2; + n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2; + } + n_end += n_start; +#else + // pytorch aten partition pattern + T n_my = div_up(n, nth); + n_start = ith * n_my; + n_end = std::min(n_start + n_my, n); +#endif +} + +template +inline void parallel_for(int n, const func_t& f) { +#if defined(GGML_USE_OPENMP) +#pragma omp parallel +{ + int nth = omp_get_num_threads(); + int ith = omp_get_thread_num(); + int tbegin, tend; + balance211(n, nth, ith, tbegin, tend); + f(tbegin, tend); +} +#else + f(0, n); +#endif +} + +template +inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) { + int tbegin, tend; + balance211(n, params->nth, params->ith, tbegin, tend); + f(tbegin, tend); +} + +// quantized types that have AMX support +inline bool qtype_has_amx_kernels(const enum ggml_type type) { + // TODO: fix padding for vnni format + return (type == GGML_TYPE_Q4_0) || + (type == GGML_TYPE_Q4_1) || + (type == GGML_TYPE_Q8_0) || + (type == GGML_TYPE_Q4_K) || + (type == GGML_TYPE_Q5_K) || + (type == GGML_TYPE_Q6_K) || + (type == GGML_TYPE_IQ4_XS); +} diff --git a/llama/mmq.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.cpp similarity index 98% rename from llama/mmq.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.cpp index bb20e999c..0ea91596b 100644 --- a/llama/mmq.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following 
conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wpedantic" diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.h b/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.h new file mode 100644 index 000000000..baf768477 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.h @@ -0,0 +1,10 @@ +#pragma once +#include "common.h" + +size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst); + +size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); + +void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + +void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp new file mode 100644 index 000000000..e8133d411 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -0,0 +1,323 @@ +#include "ggml-backend-impl.h" + +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) + +#ifdef _MSC_VER +#include +#endif + +#include +#include +#include +#include +#include + +// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf +struct cpuid_x86 { + bool SSE3(void) { return f_1_ecx[0]; } + bool PCLMULQDQ(void) { return f_1_ecx[1]; } + bool MONITOR(void) { return f_1_ecx[3]; } + bool SSSE3(void) { return f_1_ecx[9]; } + bool FMA(void) { return f_1_ecx[12]; } + bool CMPXCHG16B(void) { return f_1_ecx[13]; } + bool SSE41(void) { return f_1_ecx[19]; } + bool SSE42(void) { return f_1_ecx[20]; } + bool MOVBE(void) { return f_1_ecx[22]; } + bool POPCNT(void) { return f_1_ecx[23]; } + bool AES(void) { return f_1_ecx[25]; } + bool XSAVE(void) { return f_1_ecx[26]; } + bool OSXSAVE(void) { return f_1_ecx[27]; } + bool AVX(void) { return f_1_ecx[28]; } + bool F16C(void) { return f_1_ecx[29]; } + bool RDRAND(void) { return f_1_ecx[30]; } + + bool MSR(void) { return f_1_edx[5]; } + bool CX8(void) { return f_1_edx[8]; } + bool SEP(void) { return f_1_edx[11]; } + bool CMOV(void) { return f_1_edx[15]; } + bool CLFSH(void) { return f_1_edx[19]; } + bool MMX(void) { return f_1_edx[23]; } + bool FXSR(void) { return f_1_edx[24]; } + bool SSE(void) { return f_1_edx[25]; } + bool SSE2(void) { return f_1_edx[26]; } + + bool FSGSBASE(void) { return f_7_ebx[0]; } + bool BMI1(void) { return f_7_ebx[3]; } + bool HLE(void) { return is_intel && f_7_ebx[4]; } + bool AVX2(void) { return f_7_ebx[5]; } + bool BMI2(void) { return f_7_ebx[8]; } + bool ERMS(void) { return f_7_ebx[9]; } + bool INVPCID(void) { return f_7_ebx[10]; } + bool RTM(void) { return is_intel && f_7_ebx[11]; } + bool AVX512F(void) { return f_7_ebx[16]; } + bool AVX512DQ(void) { return f_7_ebx[17]; } + bool RDSEED(void) { return f_7_ebx[18]; } + bool ADX(void) { return f_7_ebx[19]; } + bool AVX512PF(void) { return f_7_ebx[26]; } + bool 
AVX512ER(void) { return f_7_ebx[27]; }
+    bool AVX512CD(void) { return f_7_ebx[28]; }
+    bool AVX512BW(void) { return f_7_ebx[30]; }
+    bool AVX512VL(void) { return f_7_ebx[31]; }
+
+    bool SHA(void) { return f_7_ebx[29]; }
+
+    bool PREFETCHWT1(void) { return f_7_ecx[0]; }
+
+    bool LAHF(void)  { return f_81_ecx[0]; }
+    bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
+    bool ABM(void)   { return is_amd && f_81_ecx[5]; }
+    bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
+    bool XOP(void)   { return is_amd && f_81_ecx[11]; }
+    bool TBM(void)   { return is_amd && f_81_ecx[21]; }
+
+    bool SYSCALL(void)   { return is_intel && f_81_edx[11]; }
+    bool MMXEXT(void)    { return is_amd && f_81_edx[22]; }
+    bool RDTSCP(void)    { return is_intel && f_81_edx[27]; }
+    bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
+    bool _3DNOW(void)    { return is_amd && f_81_edx[31]; }
+
+    bool AVX512_VBMI(void) { return f_7_ecx[1]; }
+    bool AVX512_VNNI(void) { return f_7_ecx[11]; }
+    bool AVX512_FP16(void) { return f_7_edx[23]; }
+    bool AVX512_BF16(void) { return f_7_1_eax[5]; }
+    bool AVX_VNNI(void)    { return f_7_1_eax[4]; }
+
+    bool AMX_TILE(void) { return f_7_edx[24]; }
+    bool AMX_INT8(void) { return f_7_edx[25]; }
+    bool AMX_FP16(void) { return f_7_1_eax[21]; }
+    bool AMX_BF16(void) { return f_7_edx[22]; }
+
+#ifdef _MSC_VER
+    static void cpuid(int cpu_info[4], int eax) {
+        __cpuid(cpu_info, eax);
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __cpuidex(cpu_info, eax, ecx);
+    }
+#else
+    static void cpuid(int cpu_info[4], int eax) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(0));
+    }
+    static void cpuidex(int cpu_info[4], int eax, int ecx) {
+        __asm__ __volatile__(
+            "cpuid"
+            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+            : "a"(eax), "c"(ecx));
+    }
+#endif
+
+    cpuid_x86() {
+        std::array<int, 4> cpui;
+        std::vector<std::array<int, 4>> data;
+
+        // calling __cpuid with 0x0 as the function_id argument
+        // gets the number of the highest valid function ID.
+        cpuid(cpui.data(), 0);
+        int n_ids = cpui[0];
+
+        for (int i = 0; i <= n_ids; ++i) {
+            cpuidex(cpui.data(), i, 0);
+            data.push_back(cpui);
+        }
+
+        // capture vendor string
+        char vendor[0x20] = {};
+        *reinterpret_cast<int *>(vendor)     = data[0][1];
+        *reinterpret_cast<int *>(vendor + 4) = data[0][3];
+        *reinterpret_cast<int *>(vendor + 8) = data[0][2];
+        this->vendor = vendor;
+        if (this->vendor == "GenuineIntel") {
+            is_intel = true;
+        } else if (this->vendor == "AuthenticAMD") {
+            is_amd = true;
+        }
+
+        // load bitset with flags for function 0x00000001
+        if (n_ids >= 1) {
+            f_1_ecx = data[1][2];
+            f_1_edx = data[1][3];
+        }
+
+        // load bitset with flags for function 0x00000007
+        if (n_ids >= 7) {
+            f_7_ebx = data[7][1];
+            f_7_ecx = data[7][2];
+            f_7_edx = data[7][3];
+            cpuidex(cpui.data(), 7, 1);
+            f_7_1_eax = cpui[0];
+        }
+
+        // calling __cpuid with 0x80000000 as the function_id argument
+        // gets the number of the highest valid extended ID.
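+        // (leaf 0x80000001 supplies the extended feature flags read into
+        //  f_81_ecx/f_81_edx below; leaves 0x80000002..0x80000004 return the
+        //  48-byte CPU brand string)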
+ cpuid(cpui.data(), 0x80000000); + unsigned int n_ex_ids = cpui[0]; + + std::vector> ext_data; + for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) { + cpuidex(cpui.data(), i, 0); + ext_data.push_back(cpui); + } + + // load bitset with flags for function 0x80000001 + if (n_ex_ids >= 0x80000001) { + f_81_ecx = ext_data[1][2]; + f_81_edx = ext_data[1][3]; + } + + // interpret CPU brand string if reported + char brand[0x40] = {}; + if (n_ex_ids >= 0x80000004) { + std::memcpy(brand, ext_data[2].data(), sizeof(cpui)); + std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui)); + std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui)); + this->brand = brand; + } + } + + bool is_intel = false; + bool is_amd = false; + std::string vendor; + std::string brand; + std::bitset<32> f_1_ecx; + std::bitset<32> f_1_edx; + std::bitset<32> f_7_ebx; + std::bitset<32> f_7_ecx; + std::bitset<32> f_7_edx; + std::bitset<32> f_7_1_eax; + std::bitset<32> f_81_ecx; + std::bitset<32> f_81_edx; +}; + +#if 0 +void test_x86_is() { + cpuid_x86 is; + printf("CPU Vendor: %s\n", is.vendor.c_str()); + printf("Brand: %s\n", is.brand.c_str()); + printf("is_intel: %d\n", is.is_intel); + printf("is_amd: %d\n", is.is_amd); + printf("sse3: %d\n", is.SSE3()); + printf("pclmulqdq: %d\n", is.PCLMULQDQ()); + printf("ssse3: %d\n", is.SSSE3()); + printf("fma: %d\n", is.FMA()); + printf("cmpxchg16b: %d\n", is.CMPXCHG16B()); + printf("sse41: %d\n", is.SSE41()); + printf("sse42: %d\n", is.SSE42()); + printf("movbe: %d\n", is.MOVBE()); + printf("popcnt: %d\n", is.POPCNT()); + printf("aes: %d\n", is.AES()); + printf("xsave: %d\n", is.XSAVE()); + printf("osxsave: %d\n", is.OSXSAVE()); + printf("avx: %d\n", is.AVX()); + printf("f16c: %d\n", is.F16C()); + printf("rdrand: %d\n", is.RDRAND()); + printf("msr: %d\n", is.MSR()); + printf("cx8: %d\n", is.CX8()); + printf("sep: %d\n", is.SEP()); + printf("cmov: %d\n", is.CMOV()); + printf("clflush: %d\n", is.CLFSH()); + printf("mmx: %d\n", is.MMX()); + printf("fxsr: %d\n", is.FXSR()); + printf("sse: %d\n", is.SSE()); + printf("sse2: %d\n", is.SSE2()); + printf("fsgsbase: %d\n", is.FSGSBASE()); + printf("bmi1: %d\n", is.BMI1()); + printf("hle: %d\n", is.HLE()); + printf("avx2: %d\n", is.AVX2()); + printf("bmi2: %d\n", is.BMI2()); + printf("erms: %d\n", is.ERMS()); + printf("invpcid: %d\n", is.INVPCID()); + printf("rtm: %d\n", is.RTM()); + printf("avx512f: %d\n", is.AVX512F()); + printf("rdseed: %d\n", is.RDSEED()); + printf("adx: %d\n", is.ADX()); + printf("avx512pf: %d\n", is.AVX512PF()); + printf("avx512er: %d\n", is.AVX512ER()); + printf("avx512cd: %d\n", is.AVX512CD()); + printf("sha: %d\n", is.SHA()); + printf("prefetchwt1: %d\n", is.PREFETCHWT1()); + printf("lahf: %d\n", is.LAHF()); + printf("lzcnt: %d\n", is.LZCNT()); + printf("abm: %d\n", is.ABM()); + printf("sse4a: %d\n", is.SSE4a()); + printf("xop: %d\n", is.XOP()); + printf("tbm: %d\n", is.TBM()); + printf("syscall: %d\n", is.SYSCALL()); + printf("mmxext: %d\n", is.MMXEXT()); + printf("rdtscp: %d\n", is.RDTSCP()); + printf("3dnowext: %d\n", is._3DNOWEXT()); + printf("3dnow: %d\n", is._3DNOW()); + printf("avx512_vbmi: %d\n", is.AVX512_VBMI()); + printf("avx512_vnni: %d\n", is.AVX512_VNNI()); + printf("avx512_fp16: %d\n", is.AVX512_FP16()); + printf("avx512_bf16: %d\n", is.AVX512_BF16()); + printf("amx_tile: %d\n", is.AMX_TILE()); + printf("amx_int8: %d\n", is.AMX_INT8()); + printf("amx_fp16: %d\n", is.AMX_FP16()); + printf("amx_bf16: %d\n", is.AMX_BF16()); +} +#endif + +static int ggml_backend_cpu_x86_score() { + // FIXME: this does 
not check for OS support + + int score = 0; + cpuid_x86 is; + +#ifdef GGML_FMA + if (!is.FMA()) { return 0; } + score += 1; +#endif +#ifdef GGML_F16C + if (!is.F16C()) { return 0; } + score += 1<<1; +#endif +#ifdef GGML_SSE42 + if (!is.SSE42()) { return 0; } + score += 1<<2; +#endif +#ifdef GGML_AVX + if (!is.AVX()) { return 0; } + score += 1<<4; +#endif +#ifdef GGML_AVX2 + if (!is.AVX2()) { return 0; } + score += 1<<5; +#endif +#ifdef GGML_AVX_VNNI + if (!is.AVX_VNNI()) { return 0; } + score += 1<<6; +#endif +#ifdef GGML_AVX512 + if (!is.AVX512F()) { return 0; } + if (!is.AVX512CD()) { return 0; } + if (!is.AVX512VL()) { return 0; } + if (!is.AVX512DQ()) { return 0; } + if (!is.AVX512BW()) { return 0; } + score += 1<<7; +#endif +#ifdef GGML_AVX512_VBMI + if (!is.AVX512_VBMI()) { return 0; } + score += 1<<8; +#endif +#ifdef GGML_AVX512_BF16 + if (!is.AVX512_BF16()) { return 0; } + score += 1<<9; +#endif +#ifdef GGML_AVX512_VNNI + if (!is.AVX512_VNNI()) { return 0; } + score += 1<<10; +#endif +#ifdef GGML_AMX_INT8 + if (!is.AMX_INT8()) { return 0; } + score += 1<<11; +#endif + + return score; +} + +GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score) + +#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go new file mode 100644 index 000000000..f0bb54c23 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go @@ -0,0 +1,11 @@ +package cpu + +// #cgo CFLAGS: -Wno-implicit-function-declaration +// #cgo CXXFLAGS: -std=c++17 +// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include +// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE +// #cgo linux CPPFLAGS: -D_GNU_SOURCE +// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 +// #cgo darwin,arm64 LDFLAGS: -framework Accelerate +import "C" +import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-cpu/llamafile" diff --git a/llama/ggml-cpu-aarch64.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp similarity index 99% rename from llama/ggml-cpu-aarch64.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 0989fb203..622c63f1f 100644 --- a/llama/ggml-cpu-aarch64.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #define GGML_COMMON_IMPL_CPP #define GGML_COMMON_DECL_CPP #include "ggml-common.h" diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.h new file mode 100644 index 000000000..6e84c826b --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.h @@ -0,0 +1,8 @@ +#pragma once + +#include "ggml-cpu-traits.h" +#include "ggml.h" + +// GGML internal header + +ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void); diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp new file mode 100644 index 000000000..fa8dea2af --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp @@ -0,0 +1,55 @@ +#ifdef GGML_USE_CPU_HBM + +#include "ggml-backend.h" +#include "ggml-backend-impl.h" +#include "ggml-cpu.h" +#include "ggml-impl.h" + +#include "ggml-cpu-hbm.h" + +// buffer type HBM + +#include <hbwmalloc.h> + +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_HBM"; + + GGML_UNUSED(buft); +} + +static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { + hbw_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + void * ptr; + int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); + if (result != 0) { + GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); + return NULL; + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_cpu_buffer_type_hbm; +} +#endif diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.h b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.h new file mode 100644 index 000000000..09a1f09d7 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-hbm.h @@ -0,0 +1,8 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +// GGML CPU internal header + +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); diff --git a/llama/ggml-cpu-impl.h b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-impl.h similarity index 87% rename from llama/ggml-cpu-impl.h rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-impl.h index 54dc108cc..d71076ad1 100644 --- a/llama/ggml-cpu-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - *
- * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once // GGML CPU internal header diff --git a/llama/ggml-cpu-quants.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c similarity index 99% rename from llama/ggml-cpu-quants.c rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c index a8288deca..8e1472266 100644 --- a/llama/ggml-cpu-quants.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
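ggml-cpu-hbm.cpp above plugs memkind's hbwmalloc allocator (hbw_posix_memalign/hbw_free) into a ggml buffer type so tensor data can live in high-bandwidth memory when GGML_USE_CPU_HBM is defined. A runnable stand-in for that allocate/report/free pattern, with POSIX posix_memalign substituting for the memkind calls so the sketch builds without libmemkind:

```cpp
// Minimal sketch of the HBM allocation pattern, assuming a POSIX system.
// With memkind installed, hbw_posix_memalign/hbw_free from <hbwmalloc.h>
// would replace posix_memalign/std::free below.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

static void *hbm_like_alloc(std::size_t alignment, std::size_t size) {
    void *ptr = nullptr;
    // real backend: hbw_posix_memalign(&ptr, alignment, size)
    if (posix_memalign(&ptr, alignment, size) != 0) {
        std::fprintf(stderr, "failed to allocate buffer of size %zu\n", size);
        return nullptr;
    }
    return ptr;
}

int main() {
    void *buf = hbm_like_alloc(64, 1 << 20); // 1 MiB, 64-byte aligned
    if (buf) {
        std::printf("allocated at %p\n", buf);
        std::free(buf); // real backend: hbw_free(buf)
    }
}
```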
- */ - #define GGML_COMMON_IMPL_C #include "ggml-common.h" diff --git a/llama/ggml-cpu-quants.h b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h similarity index 80% rename from llama/ggml-cpu-quants.h rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h index e2cdf03ed..e33d9d473 100644 --- a/llama/ggml-cpu-quants.h +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #define GGML_COMMON_DECL_C diff --git a/llama/ggml-cpu-traits.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-traits.cpp similarity index 50% rename from llama/ggml-cpu-traits.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-traits.cpp index 6d7ca0246..62a0712da 100644 --- a/llama/ggml-cpu-traits.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-traits.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "ggml-cpu-traits.h" #include "ggml-backend-impl.h" diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-traits.h b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-traits.h new file mode 100644 index 000000000..99a6186b1 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-traits.h @@ -0,0 +1,38 @@ +#pragma once +#include "ggml-backend-impl.h" +#include "ggml-cpu-impl.h" +#include "ggml.h" + +#ifdef __cplusplus +# include <vector> +extern "C" { +#endif + +// return true if op part of extra "accelerator" +bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op); +bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size); + +#ifdef __cplusplus +} + +namespace ggml::cpu { +// register in tensor->extra
class tensor_traits { + public: + virtual ~tensor_traits(); + virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0; + virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0; +}; + +class extra_buffer_type { + public: + virtual ~extra_buffer_type(); + virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0; + virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0; +}; +} // namespace ggml::cpu + +// implemented in ggml-cpu.cpp. +std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type(); + +#endif diff --git a/llama/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c similarity index 99% rename from llama/ggml-cpu.c rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c index 272f03e31..b307d5542 100644 --- a/llama/ggml-cpu.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
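The traits header above is the extension point the CPU backend uses: a buffer type hangs a tensor_traits object off tensor->extra, and ggml_cpu_extra_compute_forward consults it before falling back to the generic kernel. A self-contained sketch of that dispatch shape, using simplified stand-in types rather than the real ggml structs:

```cpp
// Sketch of the tensor->extra dispatch pattern from ggml-cpu-traits.h.
// All types here are hypothetical stand-ins, not the real ggml API.
#include <cstdio>

struct fake_tensor; // stand-in for ggml_tensor

struct tensor_traits {
    virtual ~tensor_traits() = default;
    // return true if this "accelerator" handled the op
    virtual bool compute_forward(fake_tensor *op) = 0;
};

struct fake_tensor {
    const char    *name;
    tensor_traits *extra = nullptr; // mirrors tensor->extra
};

// An accelerated implementation for tensors living in a special buffer.
struct repacked_traits : tensor_traits {
    bool compute_forward(fake_tensor *op) override {
        std::printf("repacked fast path for %s\n", op->name);
        return true; // handled, skip the generic kernel
    }
};

static void dispatch(fake_tensor *op) {
    if (op->extra && op->extra->compute_forward(op)) {
        return; // the extra "accelerator" took care of it
    }
    std::printf("generic kernel for %s\n", op->name);
}

int main() {
    repacked_traits fast;
    fake_tensor a{"mulmat-a", &fast};
    fake_tensor b{"mulmat-b", nullptr};
    dispatch(&a); // repacked fast path
    dispatch(&b); // generic kernel
}
```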
- */ - #define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows #define _USE_MATH_DEFINES // For M_PI on MSVC @@ -36,7 +10,7 @@ #include "ggml-quants.h" #include "ggml-cpu-quants.h" #include "ggml-threading.h" -#include "amx.h" +#include "amx/amx.h" #include "ggml.h" #if defined(_MSC_VER) || defined(__MINGW32__) diff --git a/llama/ggml-cpu.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp similarity index 94% rename from llama/ggml-cpu.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp index 38395101f..f11399cc6 100644 --- a/llama/ggml-cpu.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -1,36 +1,11 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-aarch64.h" #include "ggml-cpu-traits.h" #include "ggml-impl.h" -#include "amx.h" +#include "amx/amx.h" + #include <cctype> #include <string> #include <vector> diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go new file mode 100644 index 000000000..09b002ce5 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go @@ -0,0 +1,5 @@ +package llamafile + +// #cgo CXXFLAGS: -std=c++17 +// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../..
-I${SRCDIR}/../../../include +import "C" diff --git a/llama/sgemm.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp similarity index 100% rename from llama/sgemm.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp diff --git a/llama/llamafile/sgemm.h b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.h similarity index 100% rename from llama/llamafile/sgemm.h rename to ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.h diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt new file mode 100644 index 000000000..14761650f --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt @@ -0,0 +1,152 @@ +cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES + +find_package(CUDAToolkit) + +if (CUDAToolkit_FOUND) + message(STATUS "CUDA Toolkit found") + + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # native == GPUs available at build time + # 52 == Maxwell, lowest CUDA 12 standard + # 60 == P100, FP16 CUDA intrinsics + # 61 == Pascal, __dp4a instruction (per-byte integer dot product) + # 70 == V100, FP16 tensor cores + # 75 == Turing, int8 tensor cores + if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") + set(CMAKE_CUDA_ARCHITECTURES "native") + elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) + set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") + else() + set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75") + endif() + endif() + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + + enable_language(CUDA) + + file(GLOB GGML_HEADERS_CUDA "*.cuh") + list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h") + + file(GLOB GGML_SOURCES_CUDA "*.cu") + file(GLOB SRCS "template-instances/fattn-wmma*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "template-instances/mmq*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + + if (GGML_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + else() + file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + endif() + + ggml_add_backend_library(ggml-cuda + ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_CUDA} + ) + + add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + + if (GGML_CUDA_GRAPHS) + add_compile_definitions(GGML_CUDA_USE_GRAPHS) + endif() + + if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() + + if (GGML_CUDA_FORCE_CUBLAS) + add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) + endif() + + if (GGML_CUDA_NO_VMM) + add_compile_definitions(GGML_CUDA_NO_VMM) + endif() + + if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) + add_compile_definitions(GGML_CUDA_F16) + endif() + + if (GGML_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() + + if (GGML_STATIC) + if (WIN32) + # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + else () + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() + else() + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas 
CUDA::cublasLt) + endif() + + if (GGML_CUDA_NO_VMM) + # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) + else() + target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver) + endif() + + set(CUDA_CXX_FLAGS "") + + set(CUDA_FLAGS -use_fast_math) + + if (GGML_FATAL_WARNINGS) + list(APPEND CUDA_FLAGS -Werror all-warnings) + endif() + + if (GGML_ALL_WARNINGS AND NOT MSVC) + set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) + if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") + list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) + endif() + + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler --version + OUTPUT_VARIABLE CUDA_CCFULLVER + ERROR_QUIET + ) + + if (NOT CUDA_CCFULLVER MATCHES clang) + set(CUDA_CCID "GNU") + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" + OUTPUT_VARIABLE CUDA_CCVER + ERROR_QUIET + ) + else() + if (CUDA_CCFULLVER MATCHES Apple) + set(CUDA_CCID "AppleClang") + else() + set(CUDA_CCID "Clang") + endif() + string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) + endif() + + message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") + + ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER}) + list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later + endif() + + if (NOT MSVC) + list(APPEND CUDA_CXX_FLAGS -Wno-pedantic) + endif() + + list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument + + if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") + list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) + endif() + + target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>") +else() + message(FATAL_ERROR "CUDA Toolkit not found") +endif() diff --git a/llama/ggml-cuda/acc.cu b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu similarity index 61% rename from llama/ggml-cuda/acc.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/acc.cu index 9ce47e60d..96bfe1c9d 100644 --- a/llama/ggml-cuda/acc.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
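The CMake block above fixes the target CUDA architectures at build time (52/61/70/75 by default, or native). The matching runtime check is a plain CUDA runtime API call; a small sketch that assumes only the standard cudaGetDeviceProperties interface:

```cpp
// Hedged sketch: enumerate visible CUDA devices and print their compute
// capability, which can be compared against the build-time architecture
// list above (e.g. 61 => __dp4a, 70 => FP16 tensor cores).
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        std::printf("no CUDA devices visible\n");
        return 1;
    }
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
            continue; // skip devices we cannot query
        }
        std::printf("device %d: %s, compute %d%d\n", i, prop.name, prop.major, prop.minor);
    }
    return 0;
}
```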
- */ - #include "acc.cuh" static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/acc.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cuh new file mode 100644 index 000000000..1168ea1b2 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_ACC_BLOCK_SIZE 256 + +void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/arange.cu b/ml/backend/ggml/ggml/src/ggml-cuda/arange.cu new file mode 100644 index 000000000..b5e495a24 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/arange.cu @@ -0,0 +1,34 @@ +#include "arange.cuh" + +static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { + // blockIDx.x: idx of ne0 / BLOCK_SIZE + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + dst[nidx] = start + step * nidx; +} + +static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { + int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; + arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start, step); +} + +void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float start; + float stop; + float step; + memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); + memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); + memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); + + int64_t steps = (int64_t)ceil((stop - start) / step); + GGML_ASSERT(ggml_nelements(dst) == steps); + + arange_f32_cuda(dst_d, dst->ne[0], start, step, stream); +} diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/arange.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/arange.cuh new file mode 100644 index 000000000..41e74fdfc --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/arange.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_ARANGE_BLOCK_SIZE 256 + +void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/argmax.cu b/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cu similarity index 69% rename from llama/ggml-cuda/argmax.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/argmax.cu index 8bbfd7c05..5340eedc0 100644 --- a/llama/ggml-cuda/argmax.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
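For reference, the arange kernel above writes dst[i] = start + step * i with one thread per element, launched in ceil(ne0 / CUDA_ARANGE_BLOCK_SIZE) blocks. A host-side sketch of the same arithmetic (illustrative only), useful for sanity-checking the element count the op asserts:

```cpp
// CPU reference for the arange kernel and its launch math; not part of
// the patch, just a checkable restatement of the semantics above.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int kBlockSize = 256; // mirrors CUDA_ARANGE_BLOCK_SIZE

static std::vector<float> arange_ref(float start, float stop, float step) {
    const auto n = static_cast<int64_t>(std::ceil((stop - start) / step));
    std::vector<float> out(static_cast<size_t>(n));
    for (int64_t i = 0; i < n; ++i) {
        out[static_cast<size_t>(i)] = start + step * static_cast<float>(i);
    }
    return out;
}

int main() {
    auto v = arange_ref(0.0f, 5.0f, 1.5f); // expect 0.0 1.5 3.0 4.5
    const int num_blocks = (static_cast<int>(v.size()) + kBlockSize - 1) / kBlockSize;
    std::printf("%zu elements -> %d block(s)\n", v.size(), num_blocks);
    for (float x : v) std::printf("%g ", x);
    std::printf("\n");
}
```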
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include <algorithm> #include <cstdint> diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cuh new file mode 100644 index 000000000..5b7223adc --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/argsort.cu b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu similarity index 73% rename from llama/ggml-cuda/argsort.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu index d9aaaa13c..607ded855 100644 --- a/llama/ggml-cuda/argsort.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
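The argmax kernel renamed above implements the usual GGML_OP_ARGMAX contract: for each row of an f32 matrix, emit the column index of the row maximum. A plain host reference of that contract, for illustration only:

```cpp
// Host-side reference for per-row argmax; a stand-in for the semantics
// the CUDA kernel parallelizes, not the kernel itself.
#include <cstdio>
#include <vector>

static std::vector<int> argmax_rows(const std::vector<float> &x, int ne0, int nrows) {
    std::vector<int> out(static_cast<size_t>(nrows), 0);
    for (int r = 0; r < nrows; ++r) {
        const float *row = x.data() + static_cast<size_t>(r) * ne0;
        int best = 0;
        for (int c = 1; c < ne0; ++c) {
            if (row[c] > row[best]) best = c; // first maximum wins on ties
        }
        out[static_cast<size_t>(r)] = best;
    }
    return out;
}

int main() {
    std::vector<float> x = {0.1f, 2.0f, -1.0f,   // row 0 -> 1
                            3.0f, 0.0f,  0.5f};  // row 1 -> 0
    for (int idx : argmax_rows(x, 3, 2)) std::printf("%d\n", idx);
}
```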
- */ - #include "argsort.cuh" template<typename T> diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cuh new file mode 100644 index 000000000..68a001547 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/binbcast.cu b/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cu similarity index 91% rename from llama/ggml-cuda/binbcast.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cu index 40b9fcbe1..c7b6be4e2 100644 --- a/llama/ggml-cuda/binbcast.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "binbcast.cuh" #include <cstdint> diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cuh new file mode 100644 index 000000000..3ac1c9b03 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cuh @@ -0,0 +1,9 @@ +#include "common.cuh" + +void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cu b/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cu new file mode 100644 index 000000000..8009a3e3d --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cu @@ -0,0 +1,34 @@ +#include "clamp.cuh" + +static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = x[i] < min ? min : (x[i] > max ?
max : x[i]); +} + +static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; + clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k); +} + + +void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + + clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream); +} diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cuh new file mode 100644 index 000000000..7f9559dd1 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_CLAMP_BLOCK_SIZE 256 + +void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh similarity index 94% rename from llama/ggml-cuda/common.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/common.cuh index 2a40b8499..2c0a56226 100644 --- a/llama/ggml-cuda/common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
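Note how ggml_cuda_op_clamp above recovers min and max: ggml passes small op parameters through an int32 array on the tensor, and consumers memcpy the float bytes back out. A sketch of that round-trip plus the clamp expression itself; the struct below is a stand-in for ggml_tensor:

```cpp
// Round-trip two floats through an int32 op_params array, the way the
// clamp op above does. fake_tensor is hypothetical, not the real struct.
#include <cstdint>
#include <cstdio>
#include <cstring>

struct fake_tensor {
    int32_t op_params[16] = {}; // mirrors ggml_tensor::op_params
};

int main() {
    fake_tensor t;
    const float min = -1.0f, max = 1.0f;
    std::memcpy(t.op_params, &min, sizeof(float));
    std::memcpy((float *) t.op_params + 1, &max, sizeof(float));

    float got_min, got_max;
    std::memcpy(&got_min, t.op_params, sizeof(float));
    std::memcpy(&got_max, (float *) t.op_params + 1, sizeof(float));
    std::printf("min=%g max=%g\n", got_min, got_max); // min=-1 max=1

    const float x = 2.5f;
    const float y = x < got_min ? got_min : (x > got_max ? got_max : x);
    std::printf("clamp(%g) = %g\n", x, y); // clamp(2.5) = 1
}
```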
- */ - #pragma once #include "ggml.h" diff --git a/llama/ggml-cuda/concat.cu b/ml/backend/ggml/ggml/src/ggml-cuda/concat.cu similarity index 85% rename from llama/ggml-cuda/concat.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/concat.cu index d8c473913..5eb9f08da 100644 --- a/llama/ggml-cuda/concat.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/concat.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "concat.cuh" // contiguous kernels diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/concat.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/concat.cuh new file mode 100644 index 000000000..aa506a05f --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/concat.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_CONCAT_BLOCK_SIZE 256 + +void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/conv-transpose-1d.cu b/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cu similarity index 72% rename from llama/ggml-cuda/conv-transpose-1d.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cu index da53e9469..b1e94d6f7 100644 --- a/llama/ggml-cuda/conv-transpose-1d.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "conv-transpose-1d.cuh" static __global__ void conv_transpose_1d_kernel( diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cuh new file mode 100644 index 000000000..6c2cf666b --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256 + +void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/convert.cu b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu similarity index 95% rename from llama/ggml-cuda/convert.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/convert.cu index 6ddb87fc3..5b0dfacef 100644 --- a/llama/ggml-cuda/convert.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "convert.cuh" #include "dequantize.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh new file mode 100644 index 000000000..5394be9f1 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh @@ -0,0 +1,13 @@ +#include "common.cuh" + +#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 + +template<typename T> +using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); + +typedef to_t_cuda_t<float> to_fp32_cuda_t; +typedef to_t_cuda_t<half> to_fp16_cuda_t; + +to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); + +to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); diff --git a/llama/ggml-cuda/count-equal.cu b/ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cu similarity index 62% rename from llama/ggml-cuda/count-equal.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cu index e4496fc1b..08898115d 100644 --- a/llama/ggml-cuda/count-equal.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
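convert.cuh above defines to_t_cuda_t so callers can fetch a conversion routine by ggml_type and invoke it uniformly. The same table-style dispatch in miniature, with a toy int8 format standing in for a real quantized type:

```cpp
// Sketch of type-keyed conversion dispatch with plain function pointers;
// the types, names, and "scale" here are illustrative only.
#include <cstdint>
#include <cstdio>

using to_fp32_t = void (*)(const void *x, float *y, int64_t k);

static void convert_i8(const void *x, float *y, int64_t k) {
    const auto *src = static_cast<const int8_t *>(x);
    for (int64_t i = 0; i < k; ++i) y[i] = static_cast<float>(src[i]) * 0.5f; // toy scale
}

static void convert_f32(const void *x, float *y, int64_t k) {
    const auto *src = static_cast<const float *>(x);
    for (int64_t i = 0; i < k; ++i) y[i] = src[i];
}

enum class toy_type { i8, f32 };

static to_fp32_t get_to_fp32(toy_type t) {
    switch (t) {
        case toy_type::i8:  return convert_i8;
        case toy_type::f32: return convert_f32;
    }
    return nullptr; // unsupported type
}

int main() {
    const int8_t q[4] = {2, 4, -6, 8};
    float out[4];
    get_to_fp32(toy_type::i8)(q, out, 4);
    for (float v : out) std::printf("%g ", v); // 1 2 -3 4
    std::printf("\n");
}
```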
- */ - #include "common.cuh" #include "count-equal.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cuh new file mode 100644 index 000000000..8467da79e --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128 + +void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/cpy.cu b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu similarity index 94% rename from llama/ggml-cuda/cpy.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu index ffdef8c43..54c0f66d2 100644 --- a/llama/ggml-cuda/cpy.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "cpy.cuh" typedef void (*cpy_kernel_t)(const char * cx, char * cdst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cuh new file mode 100644 index 000000000..28b06cdda --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cuh @@ -0,0 +1,9 @@ +#include "common.cuh" + +#define CUDA_CPY_BLOCK_SIZE 64 + +void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1); + +void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); diff --git a/llama/ggml-cuda/cross-entropy-loss.cu b/ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cu similarity index 82% rename from llama/ggml-cuda/cross-entropy-loss.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cu index 5bfddc79b..ed09406a8 100644 --- a/llama/ggml-cuda/cross-entropy-loss.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
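cpy.cuh above exposes ggml_cuda_cpy_fn, which picks a copy kernel from the (source type, destination type) pair. A sketch of that pair-keyed selection with ordinary function pointers; the narrowing conversion here is a crude stand-in for the real per-format copies:

```cpp
// Hypothetical pair-keyed kernel selection; types and conversions are
// illustrative, not the real ggml type system.
#include <cstdint>
#include <cstdio>

using cpy_fn_t = void (*)(const char *src, char *dst);

static void cpy_f32_f32(const char *src, char *dst) {
    *reinterpret_cast<float *>(dst) = *reinterpret_cast<const float *>(src);
}

static void cpy_f32_i8(const char *src, char *dst) { // toy narrowing copy
    *reinterpret_cast<int8_t *>(dst) =
        static_cast<int8_t>(*reinterpret_cast<const float *>(src));
}

enum class toy_type { f32, i8 };

static cpy_fn_t cpy_fn(toy_type src, toy_type dst) {
    if (src == toy_type::f32 && dst == toy_type::f32) return cpy_f32_f32;
    if (src == toy_type::f32 && dst == toy_type::i8)  return cpy_f32_i8;
    return nullptr; // unsupported combination
}

int main() {
    float x = 42.7f;
    int8_t y = 0;
    cpy_fn(toy_type::f32, toy_type::i8)(reinterpret_cast<const char *>(&x),
                                        reinterpret_cast<char *>(&y));
    std::printf("%d\n", y); // 42
}
```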
- */ - #include "common.cuh" #include "cross-entropy-loss.cuh" #include "sum.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cuh new file mode 100644 index 000000000..9ec7152ff --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cuh @@ -0,0 +1,7 @@ +#include "common.cuh" + +#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256 + +void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/dequantize.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/dequantize.cuh similarity index 68% rename from llama/ggml-cuda/dequantize.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/dequantize.cuh index 016de0db6..bd3c2d9db 100644 --- a/llama/ggml-cuda/dequantize.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/dequantize.cuh @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "common.cuh" static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ diff --git a/llama/ggml-cuda/diagmask.cu b/ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cu similarity index 58% rename from llama/ggml-cuda/diagmask.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cu index e80a953ae..4b713ba22 100644 --- a/llama/ggml-cuda/diagmask.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "diagmask.cuh" static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cuh new file mode 100644 index 000000000..6cdbef17e --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 + +void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/fattn-common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh similarity index 95% rename from llama/ggml-cuda/fattn-common.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh index 011654d31..ee9752da6 100644 --- a/llama/ggml-cuda/fattn-common.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
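diag_mask_inf above is causal masking: any position more than n_past columns ahead of its row is forced to -INF so softmax assigns it zero weight. A single-channel host reference of that rule (the real kernel additionally batches channels via rows_per_channel):

```cpp
// Host reference for causal masking semantics; illustrative only.
#include <cmath>
#include <cstdio>
#include <vector>

static void diag_mask_inf_ref(std::vector<float> &x, int ncols, int n_past) {
    const int rows = static_cast<int>(x.size()) / ncols;
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < ncols; ++c) {
            if (c > n_past + r) { // too far ahead of this row: mask it
                x[static_cast<size_t>(r) * ncols + c] = -INFINITY;
            }
        }
    }
}

int main() {
    std::vector<float> scores(3 * 3, 1.0f);
    diag_mask_inf_ref(scores, 3, 0); // strict causal mask
    for (int r = 0; r < 3; ++r) {
        for (int c = 0; c < 3; ++c) std::printf("%6.1f ", scores[static_cast<size_t>(r) * 3 + c]);
        std::printf("\n");
    }
}
```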
- */ - #pragma once #include "common.cuh" diff --git a/llama/ggml-cuda/fattn-tile-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu similarity index 91% rename from llama/ggml-cuda/fattn-tile-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu index 72d265ef2..4d314dacb 100644 --- a/llama/ggml-cuda/fattn-tile-f16.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "common.cuh" #include "fattn-common.cuh" #include "fattn-tile-f16.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cuh new file mode 100644 index 000000000..ffc587842 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/fattn-tile-f32.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu similarity index 91% rename from llama/ggml-cuda/fattn-tile-f32.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu index 3be1c7a62..bb3360447 100644 --- a/llama/ggml-cuda/fattn-tile-f32.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "common.cuh" #include "fattn-common.cuh" #include "fattn-tile-f32.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cuh new file mode 100644 index 000000000..b1c546c80 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/fattn-vec-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh similarity index 93% rename from llama/ggml-cuda/fattn-vec-f16.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh index 334a05c37..34a2992c7 100644 --- a/llama/ggml-cuda/fattn-vec-f16.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "common.cuh" #include "fattn-common.cuh" diff --git a/llama/ggml-cuda/fattn-vec-f32.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh similarity index 92% rename from llama/ggml-cuda/fattn-vec-f32.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh index 0bb230004..a28fc8b7f 100644 --- a/llama/ggml-cuda/fattn-vec-f32.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "common.cuh" #include "fattn-common.cuh" diff --git a/llama/ggml-cuda/fattn-wmma-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cuh similarity index 94% rename from llama/ggml-cuda/fattn-wmma-f16.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cuh index d82984f40..860d0e6dc 100644 --- a/llama/ggml-cuda/fattn-wmma-f16.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cuh @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "common.cuh" #include "fattn-common.cuh" diff --git a/llama/ggml-cuda/fattn.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu similarity index 92% rename from llama/ggml-cuda/fattn.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu index 4828e9d84..0b26b0f8e 100644 --- a/llama/ggml-cuda/fattn.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "common.cuh" #include "fattn-common.cuh" #include "fattn-tile-f16.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cuh new file mode 100644 index 000000000..ad3ca7a8d --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/getrows.cu b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu similarity index 84% rename from llama/ggml-cuda/getrows.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu index 6cf1e516e..4c3703238 100644 --- a/llama/ggml-cuda/getrows.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "getrows.cuh" #include "dequantize.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh new file mode 100644 index 000000000..bbf130232 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_GET_ROWS_BLOCK_SIZE 256 + +void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu similarity index 98% rename from llama/ggml-cuda/ggml-cuda.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 0894fdad7..9286f8665 100644 --- a/llama/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "ggml-cuda.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" @@ -450,10 +424,7 @@ struct ggml_backend_cuda_buffer_context { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; delete ctx; - - // TODO: this needs to be freed in cuda and hipblas backends because - // the cuda backend implementation compiled with msvc - free(buffer); + delete buffer; } static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { diff --git a/llama/ggml-cuda/im2col.cu b/ml/backend/ggml/ggml/src/ggml-cuda/im2col.cu similarity index 78% rename from llama/ggml-cuda/im2col.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/im2col.cu index 0ceaa02c9..86a54e42b 100644 --- a/llama/ggml-cuda/im2col.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/im2col.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "im2col.cuh" template diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/im2col.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/im2col.cuh new file mode 100644 index 000000000..1ce8fae4d --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/im2col.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_IM2COL_BLOCK_SIZE 256 + +void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/mma.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mma.cuh similarity index 86% rename from llama/ggml-cuda/mma.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/mma.cuh index 557cdcd1e..7d11540af 100644 --- a/llama/ggml-cuda/mma.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mma.cuh @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "common.cuh" struct mma_int_A_I16K4 { diff --git a/llama/ggml-cuda/mmq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu similarity index 80% rename from llama/ggml-cuda/mmq.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu index 0dc63b31b..270251df4 100644 --- a/llama/ggml-cuda/mmq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "mmq.cuh" void ggml_cuda_op_mul_mat_q( diff --git a/llama/ggml-cuda/mmq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh similarity index 98% rename from llama/ggml-cuda/mmq.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh index 1da4680a2..3cd508a1d 100644 --- a/llama/ggml-cuda/mmq.cuh +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "common.cuh" diff --git a/llama/ggml-cuda/mmv.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu similarity index 89% rename from llama/ggml-cuda/mmv.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu index 37559c742..ac45f2d17 100644 --- a/llama/ggml-cuda/mmv.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "common.cuh" #include "mmv.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh new file mode 100644 index 000000000..78a1cd4a6 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh @@ -0,0 +1,12 @@ +#include "common.cuh" + +// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available +#define MMV_MAX_ROWS 512 + +void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +void ggml_cuda_op_mul_mat_vec( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream); diff --git a/llama/ggml-cuda/mmvq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu similarity index 93% rename from llama/ggml-cuda/mmvq.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu index 19ea9aa96..e3b912d87 100644 --- a/llama/ggml-cuda/mmvq.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "mmvq.cuh" #include "vecdotq.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh new file mode 100644 index 000000000..d9e42fdd6 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh @@ -0,0 +1,9 @@ +#include "common.cuh" + +#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. 
+ +void ggml_cuda_op_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream); diff --git a/llama/ggml-cuda/norm.cu b/ml/backend/ggml/ggml/src/ggml-cuda/norm.cu similarity index 85% rename from llama/ggml-cuda/norm.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/norm.cu index 6bc05ff70..133e219f0 100644 --- a/llama/ggml-cuda/norm.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/norm.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
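Note: MMVQ_MAX_BATCH_SIZE in mmvq.cuh above caps how wide a batch may get before the quantized mat-vec kernels stop being worthwhile. A simplified sketch of the dispatch this constant implies (condition paraphrased from the selection logic in ggml-cuda.cu, not copied from it):

    // use the MMVQ kernels only for narrow batches; wider batches fall back to MMQ/cuBLAS
    const bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;  // ne[1] = number of src1 columns (batch size)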
- */ - #include "norm.cuh" template diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/norm.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/norm.cuh new file mode 100644 index 000000000..431a8f74d --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/norm.cuh @@ -0,0 +1,7 @@ +#include "common.cuh" + +void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/opt-step-adamw.cu b/ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cu similarity index 70% rename from llama/ggml-cuda/opt-step-adamw.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cu index 4bde5c599..35154f299 100644 --- a/llama/ggml-cuda/opt-step-adamw.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "ggml-impl.h" #include "opt-step-adamw.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cuh new file mode 100644 index 000000000..58d6f6e5d --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256 + +void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/out-prod.cu b/ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cu similarity index 57% rename from llama/ggml-cuda/out-prod.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cu index fb2cc3838..619cfdcb5 100644 --- a/llama/ggml-cuda/out-prod.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "out-prod.cuh" #include diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cuh new file mode 100644 index 000000000..a0046f5f8 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/pad.cu b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu similarity index 74% rename from llama/ggml-cuda/pad.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/pad.cu index aa61c0ada..39fd4b165 100644 --- a/llama/ggml-cuda/pad.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "pad.cuh" static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh new file mode 100644 index 000000000..e2ededc3c --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh @@ -0,0 +1,6 @@ +#include "common.cuh" + +#define CUDA_PAD_BLOCK_SIZE 256 + +void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/pool2d.cu b/ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cu similarity index 72% rename from llama/ggml-cuda/pool2d.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cu index adbf1b551..c6d51e4d6 100644 --- a/llama/ggml-cuda/pool2d.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "pool2d.cuh" template diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cuh new file mode 100644 index 000000000..7841292bc --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_POOL2D_BLOCK_SIZE 256 + +void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/quantize.cu b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu similarity index 81% rename from llama/ggml-cuda/quantize.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu index 60341bee8..1702e4ce2 100644 --- a/llama/ggml-cuda/quantize.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "quantize.cuh" #include diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh new file mode 100644 index 000000000..03bf322b9 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh @@ -0,0 +1,24 @@ +#pragma once + +#include "common.cuh" +#include "mmq.cuh" + +#include + +#define CUDA_QUANTIZE_BLOCK_SIZE 256 +#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128 + +static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access."); +static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); + +typedef void (*quantize_cuda_t)( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, + const ggml_type type_x, cudaStream_t stream); + +void quantize_row_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, + const ggml_type type_x, cudaStream_t stream); + +void quantize_mmq_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, + const ggml_type type_x, cudaStream_t stream); diff --git a/llama/ggml-cuda/rope.cu b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu similarity index 94% rename from llama/ggml-cuda/rope.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/rope.cu index fc9f6f2f4..2c84778d2 100644 --- a/llama/ggml-cuda/rope.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "rope.cuh" struct rope_corr_dims { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/rope.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cuh new file mode 100644 index 000000000..0f787a0b2 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_ROPE_BLOCK_SIZE 256 + +void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu new file mode 100644 index 000000000..1405e066e --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu @@ -0,0 +1,31 @@ +#include "scale.cuh" + +static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = scale * x[i]; +} + +static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; + scale_f32<<>>(x, dst, scale, k); +} + +void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float scale; + memcpy(&scale, dst->op_params, sizeof(float)); + + scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream); +} diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/scale.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cuh new file mode 100644 index 000000000..8ff75c829 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_SCALE_BLOCK_SIZE 256 + +void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/softmax.cu b/ml/backend/ggml/ggml/src/ggml-cuda/softmax.cu similarity index 86% rename from llama/ggml-cuda/softmax.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/softmax.cu index 52aad62f6..c24abae1f 100644 --- a/llama/ggml-cuda/softmax.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/softmax.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "common.cuh" #include "softmax.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/softmax.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/softmax.cuh new file mode 100644 index 000000000..4ef4ff86c --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/softmax.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 + +void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/llama/ggml-cuda/sum.cu b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu similarity index 54% rename from llama/ggml-cuda/sum.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/sum.cu index e1f0b86e8..e0dafc1d2 100644 --- a/llama/ggml-cuda/sum.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 #define USE_CUB #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/sum.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cuh new file mode 100644 index 000000000..8cadc3736 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream); + +void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu new file mode 100644 index 000000000..38dbf1b5e --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu @@ -0,0 +1,39 @@ +#include "sumrows.cuh" + +static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x; + const int col = threadIdx.x; + + float sum = 0.0f; + for (int i = col; i < ncols; i += blockDim.x) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum); + + if (col == 0) { + dst[row] = sum; + } +} + +void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols); +} + +void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); +} diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh new file mode 100644 index 000000000..191db1c13 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream); + +void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu new file mode 100644 index 000000000..6696a2384 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu new file mode 100644 index 000000000..dd070db28 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually.
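Note: k_sum_rows_f32 above uses the common one-block-per-row shape: WARP_SIZE lanes stride across the row, then the per-lane partials are folded with warp_reduce_sum. For reference, warp_reduce_sum in common.cuh is essentially the textbook shuffle reduction:

    static __device__ __forceinline__ float warp_reduce_sum(float x) {
    #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, 32);   // fold lanes pairwise: 16, 8, 4, 2, 1
        }
        return x;                                              // every lane ends up with the full sum
    }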
+ +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu new file mode 100644 index 000000000..54dcde6f5 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu new file mode 100644 index 000000000..4ec22f791 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu new file mode 100644 index 000000000..3c15bf7f0 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu new file mode 100644 index 000000000..7e61b5fdc --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu new file mode 100644 index 000000000..fdb15b580 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu new file mode 100644 index 000000000..0f7c417d2 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu new file mode 100644 index 000000000..851f33c43 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu new file mode 100644 index 000000000..763809cbe --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu new file mode 100644 index 000000000..f2a276e50 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu new file mode 100644 index 000000000..cb227f6f5 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu new file mode 100644 index 000000000..97ac0520c --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu new file mode 100644 index 000000000..c772b4263 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu new file mode 100644 index 000000000..5cb743081 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu new file mode 100644 index 000000000..98a709d17 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu new file mode 100644 index 000000000..4f2f947ae --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu new file mode 100644 index 000000000..11f96b6f6 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu new file mode 100644 index 000000000..b39bdc061 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu new file mode 100644 index 000000000..bbd6a2c7f --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu new file mode 100644 index 000000000..9d84ff2b1 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu new file mode 100644 index 000000000..bc8a5bff6 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu new file mode 100644 index 000000000..a679100c8 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu new file mode 100644 index 000000000..8f21bccf7 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu new file mode 100644 index 000000000..858b00fd7 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../fattn-vec-f16.cuh" + +DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16); diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu new file mode 100644 index 000000000..0fc8011fa --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
new file mode 100644
index 000000000..261fdf623
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
new file mode 100644
index 000000000..0fb824738
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
new file mode 100644
index 000000000..a9d9d089b
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
new file mode 100644
index 000000000..7d7b27920
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
new file mode 100644
index 000000000..a092ee2d5
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
new file mode 100644
index 000000000..db55927a1
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
new file mode 100644
index 000000000..c3c21cefa
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
new file mode 100644
index 000000000..35dd9f520
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
new file mode 100644
index 000000000..050c22ac7
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
new file mode 100644
index 000000000..de4866c5e
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
new file mode 100644
index 000000000..57a10bc4b
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
new file mode 100644
index 000000000..e0f08b46a
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
new file mode 100644
index 000000000..1c8e8a467
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
new file mode 100644
index 000000000..cefed83fb
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
new file mode 100644
index 000000000..aede6e358
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
new file mode 100644
index 000000000..1a1a92c78
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
new file mode 100644
index 000000000..ad667473d
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
new file mode 100644
index 000000000..c499f455d
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
new file mode 100644
index 000000000..8286ebf37
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
new file mode 100644
index 000000000..458786882
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
new file mode 100644
index 000000000..d89103ce0
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
new file mode 100644
index 000000000..bb75fd42f
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
new file mode 100644
index 000000000..b1629817e
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
new file mode 100644
index 000000000..d8657604d
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
new file mode 100644
index 000000000..2e5bd2f1a
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
new file mode 100644
index 000000000..be5f302d9
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
new file mode 100644
index 000000000..8dd91cd72
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
new file mode 100644
index 000000000..4cb791502
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
new file mode 100644
index 000000000..09dea4267
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
new file mode 100644
index 000000000..0fbb60769
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
new file mode 100644
index 000000000..2aeab83b2
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
new file mode 100644
index 000000000..599415b49
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
new file mode 100644
index 000000000..e4f8e3083
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
new file mode 100644
index 000000000..34d166527
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
new file mode 100644
index 000000000..4bebef45a
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
new file mode 100644
index 000000000..326468da2
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
new file mode 100644
index 000000000..511b58f4e
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
new file mode 100644
index 000000000..d9906d142
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
new file mode 100644
index 000000000..f61c183ab
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
new file mode 100644
index 000000000..c10450fd2
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
new file mode 100644
index 000000000..2d5cb195c
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
new file mode 100644
index 000000000..b384f34d7
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
new file mode 100644
index 000000000..446e293b1
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
new file mode 100644
index 000000000..6f4302988
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
new file mode 100644
index 000000000..1cd8ba88f
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
new file mode 100644
index 000000000..1ee2eab65
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
new file mode 100644
index 000000000..2bc77816a
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
new file mode 100644
index 000000000..d55ced08b
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
new file mode 100644
index 000000000..8361e99c4
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
new file mode 100644
index 000000000..7507a67c4
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
new file mode 100644
index 000000000..61f050b23
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
new file mode 100644
index 000000000..d4a49d9c9
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
new file mode 100644
index 000000000..d14627897
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
new file mode 100644
index 000000000..e73f917a1
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
new file mode 100644
index 000000000..d40825dfc
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
new file mode 100644
index 000000000..b5c6869f4
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
new file mode 100644
index 000000000..4e21b0cca
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
new file mode 100644
index 000000000..2eac321b3
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
new file mode 100644
index 000000000..f7d2c3b4e
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
new file mode 100644
index 000000000..a013f400b
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
new file mode 100644
index 000000000..2d94e65c2
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu
@@ -0,0 +1,10 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-wmma-f16.cuh"
+
+DECL_FATTN_WMMA_F16_CASE(64, 16, float);
+DECL_FATTN_WMMA_F16_CASE(80, 16, float);
+DECL_FATTN_WMMA_F16_CASE(96, 16, float);
+DECL_FATTN_WMMA_F16_CASE(112, 16, float);
+DECL_FATTN_WMMA_F16_CASE(128, 16, float);
+DECL_FATTN_WMMA_F16_CASE(256, 16, float);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
new file mode 100644
index 000000000..c3d9df3c4
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu
@@ -0,0 +1,9 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-wmma-f16.cuh"
+
+DECL_FATTN_WMMA_F16_CASE(64, 32, float);
+DECL_FATTN_WMMA_F16_CASE(80, 32, float);
+DECL_FATTN_WMMA_F16_CASE(96, 32, float);
+DECL_FATTN_WMMA_F16_CASE(112, 32, float);
+DECL_FATTN_WMMA_F16_CASE(128, 32, float);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
new file mode 100644
index 000000000..bb680e401
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu
@@ -0,0 +1,10 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-wmma-f16.cuh"
+
+DECL_FATTN_WMMA_F16_CASE(64, 16, half);
+DECL_FATTN_WMMA_F16_CASE(80, 16, half);
+DECL_FATTN_WMMA_F16_CASE(96, 16, half);
+DECL_FATTN_WMMA_F16_CASE(112, 16, half);
+DECL_FATTN_WMMA_F16_CASE(128, 16, half);
+DECL_FATTN_WMMA_F16_CASE(256, 16, half);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
new file mode 100644
index 000000000..073f71b1f
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu
@@ -0,0 +1,10 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-wmma-f16.cuh"
+
+DECL_FATTN_WMMA_F16_CASE(64, 32, half);
+DECL_FATTN_WMMA_F16_CASE(80, 32, half);
+DECL_FATTN_WMMA_F16_CASE(96, 32, half);
+DECL_FATTN_WMMA_F16_CASE(112, 32, half);
+DECL_FATTN_WMMA_F16_CASE(128, 32, half);
+DECL_FATTN_WMMA_F16_CASE(256, 32, half);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
new file mode 100644
index 000000000..d30710c5f
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu
@@ -0,0 +1,8 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-wmma-f16.cuh"
+
+DECL_FATTN_WMMA_F16_CASE(64, 8, half);
+DECL_FATTN_WMMA_F16_CASE(96, 8, half);
+DECL_FATTN_WMMA_F16_CASE(128, 8, half);
+DECL_FATTN_WMMA_F16_CASE(256, 8, half);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
new file mode 100755
index 000000000..d7874e6ea
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+from glob import glob
+import os
+
+TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"]
+
+SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f{vkq_size}.cuh"
+
+DECL_FATTN_VEC_F{vkq_size}_CASE({head_size}, {type_k}, {type_v});
+"""
+
+SOURCE_FATTN_WMMA_START = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-wmma-f16.cuh"
+
+"""
+
+SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}, {kq_acc_t});\n"
+
+TYPES_MMQ = [
+    "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
+    "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
+    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
+    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
+]
+
+SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE({type});
+"""
+
+
+def get_short_name(long_quant_name):
+    return long_quant_name.replace("GGML_TYPE_", "").lower()
+
+
+def get_head_sizes(type_k, type_v):
+    if type_k == "GGML_TYPE_F16" and type_v == "GGML_TYPE_F16":
+        return [64, 128, 256]
+    if type_k == "GGML_TYPE_F16":
+        return [64, 128]
+    return [128]
+
+
+for filename in glob("*.cu"):
+    os.remove(filename)
+
+for vkq_size in [16, 32]:
+    for type_k in TYPES_KV:
+        for type_v in TYPES_KV:
+            for head_size in get_head_sizes(type_k, type_v):
+                with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
+                    f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
+
+for kq_acc_t in ["half", "float"]:
+    for cols_per_block in [8, 16, 32]:
+        if kq_acc_t == "float" and cols_per_block == 8:
+            continue
+
+        with open(f"fattn-wmma-f16-instance-kq{kq_acc_t}-cpb{cols_per_block}.cu", "w") as f:
+            f.write(SOURCE_FATTN_WMMA_START)
+
+            for head_size in [64, 80, 96, 112, 128, 256]:
+                if cols_per_block == 8 and head_size % 32 != 0: # wmma fragment is 8x32
+                    continue
+                if kq_acc_t == "float" and cols_per_block == 32 and head_size == 256: # register spilling, bad performance
+                    continue
+                f.write(SOURCE_FATTN_WMMA_CASE.format(kq_acc_t=kq_acc_t, cols_per_block=cols_per_block, head_size=head_size))
+
+for type in TYPES_MMQ:
+    with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
+        f.write(SOURCE_MMQ.format(type=type))
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
new file mode 100644
index 000000000..84ec85029
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
new file mode 100644
index 000000000..583c4e5a5
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
new file mode 100644
index 000000000..edaf1560d
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
new file mode 100644
index 000000000..233d9342c
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
new file mode 100644
index 000000000..6092dc713
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
new file mode 100644
index 000000000..1d5bd201f
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
new file mode 100644
index 000000000..eb02fab00
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
new file mode 100644
index 000000000..1eb3b7430
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu
new file mode 100644
index 000000000..6415369dc
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q2_K);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu
new file mode 100644
index 000000000..ffb6213af
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q3_K);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu
new file mode 100644
index 000000000..0c0b0c8a8
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q4_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu
new file mode 100644
index 000000000..ee67f6942
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q4_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu
new file mode 100644
index 000000000..9eeb3cd7f
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q4_K);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu
new file mode 100644
index 000000000..cc57fb975
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q5_0);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu
new file mode 100644
index 000000000..721ac790c
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q5_1);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu
new file mode 100644
index 000000000..a2e90ffd5
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q5_K);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu
new file mode 100644
index 000000000..470938fef
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q6_K);
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu
new file mode 100644
index 000000000..974477bbb
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_Q8_0);
diff --git a/llama/ggml-cuda/tsembd.cu b/ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cu
similarity index 59%
rename from llama/ggml-cuda/tsembd.cu
rename to ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cu
index c60367838..153ddbcda 100644
--- a/llama/ggml-cuda/tsembd.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cu
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #include "tsembd.cuh"
 
 static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cuh
new file mode 100644
index 000000000..84340e3d7
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
+
+void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/llama/ggml-cuda/unary.cu b/ml/backend/ggml/ggml/src/ggml-cuda/unary.cu
similarity index 92%
rename from llama/ggml-cuda/unary.cu
rename to ml/backend/ggml/ggml/src/ggml-cuda/unary.cu
index e20cba020..81fc92202 100644
--- a/llama/ggml-cuda/unary.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/unary.cu
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #include "unary.cuh"
 
 static __global__ void neg_f32(const float * x, float * dst, const int k) {
diff --git a/llama/ggml-cuda/unary.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/unary.cuh
similarity index 58%
rename from llama/ggml-cuda/unary.cuh
rename to ml/backend/ggml/ggml/src/ggml-cuda/unary.cuh
index 3a9161bf9..c91936728 100644
--- a/llama/ggml-cuda/unary.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/unary.cuh
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #include "common.cuh"
 
 #define CUDA_NEG_BLOCK_SIZE 256
diff --git a/llama/ggml-cuda/upscale.cu b/ml/backend/ggml/ggml/src/ggml-cuda/upscale.cu
similarity index 63%
rename from llama/ggml-cuda/upscale.cu
rename to ml/backend/ggml/ggml/src/ggml-cuda/upscale.cu
index 19c8f2a18..cf513c3ad 100644
--- a/llama/ggml-cuda/upscale.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/upscale.cu
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #include "upscale.cuh"
 
 static __global__ void upscale_f32(const float * x, float * dst,
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/upscale.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/upscale.cuh
new file mode 100644
index 000000000..d4d765230
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/upscale.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+
+void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/llama/ggml-cuda/vecdotq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh
similarity index 96%
rename from llama/ggml-cuda/vecdotq.cuh
rename to ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh
index 43719cbd7..40091a0ef 100644
--- a/llama/ggml-cuda/vecdotq.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #include "common.cuh"
 
 #include <cstdint>
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/vendors/cuda.h b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/cuda.h
new file mode 100644
index 000000000..db9f6a165
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/cuda.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+
+#if CUDART_VERSION < 11020
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
diff --git a/llama/ggml-cuda/vendors/hip.h b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h
similarity index 85%
rename from llama/ggml-cuda/vendors/hip.h
rename to ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h
index 7b3102f39..c905b15d7 100644
--- a/llama/ggml-cuda/vendors/hip.h
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #pragma once
 
 #include <hip/hip_runtime.h>
diff --git a/llama/ggml-cuda/vendors/musa.h b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/musa.h
similarity index 83%
rename from llama/ggml-cuda/vendors/musa.h
rename to ml/backend/ggml/ggml/src/ggml-cuda/vendors/musa.h
index 7b1a4ac41..6cc1b69ee 100644
--- a/llama/ggml-cuda/vendors/musa.h
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/musa.h
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
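The vendors/cuda.h shim above backfills post-11.2 cuBLAS names on older toolkits, so call sites can be written once against the new spellings. A hedged illustration (the GEMM wrapper below is an example, not code from this patch):

    #include <cublas_v2.h>

    // With the aliases above, CUBLAS_COMPUTE_32F degrades to CUDA_R_32F and
    // cublasComputeType_t to cudaDataType_t when CUDART_VERSION < 11020.
    static cublasStatus_t gemm_f32_sketch(cublasHandle_t h, int m, int n, int k,
                                          const float * alpha, const float * A, int lda,
                                          const float * B, int ldb,
                                          const float * beta, float * C, int ldc) {
        return cublasGemmEx(h, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                            alpha, A, CUDA_R_32F, lda,
                            B, CUDA_R_32F, ldb,
                            beta, C, CUDA_R_32F, ldc,
                            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
    }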
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include diff --git a/llama/ggml-cuda/wkv6.cu b/ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cu similarity index 71% rename from llama/ggml-cuda/wkv6.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cu index fe4e5b9d4..42578341a 100644 --- a/llama/ggml-cuda/wkv6.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cu @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "common.cuh" #include "wkv6.cuh" diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cuh new file mode 100644 index 000000000..a7124ee51 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_WKV_BLOCK_SIZE 64 + +void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt new file mode 100644 index 000000000..b15fbd24d --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt @@ -0,0 +1,104 @@ +if (NOT EXISTS $ENV{ROCM_PATH}) + if (NOT EXISTS /opt/rocm) + set(ROCM_PATH /usr) + else() + set(ROCM_PATH /opt/rocm) + endif() +else() + set(ROCM_PATH $ENV{ROCM_PATH}) +endif() + +list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) +list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") + +# CMake on Windows doesn't support the HIP language yet +if (WIN32) + set(CXX_IS_HIPCC TRUE) +else() + string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") +endif() + +if (CXX_IS_HIPCC) + if (LINUX) + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + + message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." + " Prefer setting the HIP compiler directly. 
See README for details.") + endif() +else() + # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. + if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) + endif() + cmake_minimum_required(VERSION 3.21) + enable_language(HIP) +endif() + +find_package(hip REQUIRED) +find_package(hipblas REQUIRED) +find_package(rocblas REQUIRED) + +message(STATUS "HIP and hipBLAS found") + +file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh") +list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h") + +file(GLOB GGML_SOURCES_ROCM "../ggml-cuda/*.cu") +file(GLOB SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu") +list(APPEND GGML_SOURCES_ROCM ${SRCS}) +file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu") +list(APPEND GGML_SOURCES_ROCM ${SRCS}) + +if (GGML_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) +else() + file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) +endif() + +ggml_add_backend_library(ggml-hip + ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_ROCM} + ) + +# TODO: do not use CUDA definitions for HIP +target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) + +add_compile_definitions(GGML_USE_HIP) + +if (GGML_HIP_UMA) + add_compile_definitions(GGML_HIP_UMA) +endif() + +if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) +endif() + +if (GGML_CUDA_FORCE_CUBLAS) + add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) +endif() + +if (GGML_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) +endif() + +if (CXX_IS_HIPCC) + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-hip PRIVATE hip::device) +else() + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) +endif() + +if (GGML_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") +endif() + +target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas) diff --git a/llama/ggml-impl.h b/ml/backend/ggml/ggml/src/ggml-impl.h similarity index 93% rename from llama/ggml-impl.h rename to ml/backend/ggml/ggml/src/ggml-impl.h index 46760fb32..549772c57 100644 --- a/llama/ggml-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-impl.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
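The ggml-hip build above compiles the shared ggml-cuda sources with GGML_USE_HIP defined; the vendor headers in this patch are what translate the CUDA spellings. Roughly, the selection works like the following sketch of the include dispatch (abbreviated, and the MUSA branch is an assumption based on the vendors/musa.h header above):

    #if defined(GGML_USE_HIP)
    #include "vendors/hip.h"    // maps cuda*/cublas* names onto hip*/hipblas*
    #elif defined(GGML_USE_MUSA)
    #include "vendors/musa.h"   // same idea for the MUSA toolkit
    #else
    #include "vendors/cuda.h"   // plain CUDA, plus the pre-11.2 shims above
    #endif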
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once // GGML internal header diff --git a/ml/backend/ggml/ggml/src/ggml-metal/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-metal/CMakeLists.txt new file mode 100644 index 000000000..89fcde2fa --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-metal/CMakeLists.txt @@ -0,0 +1,121 @@ +find_library(FOUNDATION_LIBRARY Foundation REQUIRED) +find_library(METAL_FRAMEWORK Metal REQUIRED) +find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + +message(STATUS "Metal framework found") + +ggml_add_backend_library(ggml-metal + ggml-metal.m + ) + +target_link_libraries(ggml-metal PRIVATE + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ) + +if (GGML_METAL_NDEBUG) + add_compile_definitions(GGML_METAL_NDEBUG) +endif() + +if (GGML_METAL_USE_BF16) + add_compile_definitions(GGML_METAL_USE_BF16) +endif() + +# copy metal files to bin directory +configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) +configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) +configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY) + +if (GGML_METAL_EMBED_LIBRARY) + enable_language(ASM) + + add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + + set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h") + set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h") + + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") + + # merge ggml-common.h and ggml-metal.metal into a single file + set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") + set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") + set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp") + + add_custom_command( + OUTPUT ${METALLIB_EMBED_ASM} + COMMAND echo "Embedding Metal library" + COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP} + COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED} + COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h + COMMENT "Generate assembly for embedded Metal library" + ) + + target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM}) +else() + if (GGML_METAL_SHADER_DEBUG) + # custom command to do the following: + # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air + # xcrun -sdk 
macosx metallib ggml-metal.air -o default.metallib + # + # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works + # disabling fast math is needed in order to pass tests/test-backend-ops + # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 + # note: unfortunately, we have to call it default.metallib instead of ggml.metallib + # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 + set(XC_FLAGS -fno-fast-math -fno-inline -g) + else() + set(XC_FLAGS -O3) + endif() + + # Append macOS metal versioning flags + if (GGML_METAL_MACOSX_VERSION_MIN) + message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation") + list (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN}) + endif() + + if (GGML_METAL_STD) + message(STATUS "Adding -std=${GGML_METAL_STD} flag to metal compilation") + list (APPEND XC_FLAGS -std=${GGML_METAL_STD}) + endif() + + add_custom_command( + OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal + DEPENDS ggml-metal.metal ggml-common.h + COMMENT "Compiling Metal kernels" + ) + + # FIXME: only add to the ggml-metal target? + add_custom_target( + ggml-metal-lib ALL + DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + ) +endif() # GGML_METAL_EMBED_LIBRARY + +if (NOT GGML_METAL_EMBED_LIBRARY) + install( + FILES src/ggml-metal/ggml-metal.metal + PERMISSIONS + OWNER_READ + OWNER_WRITE + GROUP_READ + WORLD_READ + DESTINATION ${CMAKE_INSTALL_BINDIR}) + + install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + DESTINATION ${CMAKE_INSTALL_BINDIR} + ) +endif() diff --git a/llama/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal similarity index 99% rename from llama/ggml-metal-embed.metal rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal index 7f4666c93..2e51b87a0 100644 --- a/llama/ggml-metal-embed.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal @@ -1,58 +1,7 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - +// Code generated Fri Jan 10 13:05:45 PST 2025. DO NOT EDIT. #define GGML_COMMON_DECL_METAL #define GGML_COMMON_IMPL_METAL #if defined(GGML_METAL_EMBED_LIBRARY) -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #ifndef GGML_COMMON_DECL #if defined(GGML_COMMON_DECL_C) @@ -1910,32 +1859,6 @@ GGML_TABLE_END() // TODO: this should not be a relative path, but can't figure out how to set Metal include paths in Package.swift #include "../ggml-common.h" #endif -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #ifndef GGML_METAL_IMPL #define GGML_METAL_IMPL diff --git a/llama/ggml-metal-embed_darwin_arm64.s b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s similarity index 87% rename from llama/ggml-metal-embed_darwin_arm64.s rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s index a108c8251..47c729a6a 100644 --- a/llama/ggml-metal-embed_darwin_arm64.s +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s @@ -3,4 +3,4 @@ _ggml_metallib_start: .incbin "ggml-metal-embed.metal" .globl _ggml_metallib_end -_ggml_metallib_end: \ No newline at end of file +_ggml_metallib_end: diff --git a/llama/ggml-metal-impl.h b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h similarity index 81% rename from llama/ggml-metal-impl.h rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h index 19103fb59..e3dc25f16 100644 --- a/llama/ggml-metal-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #ifndef GGML_METAL_IMPL #define GGML_METAL_IMPL diff --git a/llama/ggml-metal_darwin_arm64.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m similarity index 99% rename from llama/ggml-metal_darwin_arm64.m rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m index d72129c3a..318addecc 100644 --- a/llama/ggml-metal_darwin_arm64.m +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
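ggml-metal-embed.s above brackets the generated Metal source between two global symbols. A sketch of how such symbols are typically consumed on the C/C++ side (the Mach-O underscore prefix in the .s file is the C symbol mangling; the actual library-loading plumbing in ggml-metal.m is not repeated here):

    extern "C" const char ggml_metallib_start[];
    extern "C" const char ggml_metallib_end[];

    // The embedded .metal source is the byte range between the two symbols.
    static size_t ggml_metallib_size_sketch(void) {
        return (size_t)(ggml_metallib_end - ggml_metallib_start);
    }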
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #import "ggml-metal.h" #import "ggml-impl.h" @@ -4246,6 +4220,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) } free(ctx); + free(buffer); } static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/llama/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal similarity index 99% rename from llama/ggml-metal.metal rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal index 1bca09725..204c93e6f 100644 --- a/llama/ggml-metal.metal +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #define GGML_COMMON_DECL_METAL #define GGML_COMMON_IMPL_METAL #if defined(GGML_METAL_EMBED_LIBRARY) diff --git a/ml/backend/ggml/ggml/src/ggml-metal/metal.go b/ml/backend/ggml/ggml/src/ggml-metal/metal.go new file mode 100644 index 000000000..1025e205d --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-metal/metal.go @@ -0,0 +1,9 @@ +//go:build darwin && arm64 + +package metal + +//go:generate sh -c "{ echo // Code generated $(date). DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal" + +// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I.. 
-I../../include
+// #cgo LDFLAGS: -framework Metal -framework MetalKit
+import "C"
diff --git a/ml/backend/ggml/ggml/src/ggml-opt.cpp b/ml/backend/ggml/ggml/src/ggml-opt.cpp
new file mode 100644
index 000000000..7c3e24103
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-opt.cpp
@@ -0,0 +1,854 @@
+#include "ggml-opt.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-impl.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cinttypes>
+#include <map>
+#include <random>
+#include <vector>
+
+struct ggml_opt_dataset {
+    struct ggml_context * ctx = nullptr;
+    ggml_backend_buffer_t buf = nullptr;
+    struct ggml_tensor * data = nullptr;
+    struct ggml_tensor * labels = nullptr;
+
+    int64_t ndata = -1;
+    int64_t ndata_shard = -1;
+    size_t nbs_data = -1;
+    size_t nbs_labels = -1;
+
+    std::vector<int64_t> permutation;
+};
+
+struct ggml_opt_context {
+    ggml_backend_sched_t backend_sched = nullptr;
+    ggml_cgraph * allocated_graph = nullptr;
+    ggml_cgraph * allocated_graph_copy = nullptr;
+    struct ggml_context * ctx_static = nullptr;
+    struct ggml_context * ctx_static_cpu = nullptr;
+    struct ggml_context * ctx_compute = nullptr;
+    struct ggml_context * ctx_copy = nullptr;
+    ggml_backend_buffer_t buf_static = nullptr;
+    ggml_backend_buffer_t buf_static_cpu = nullptr;
+    std::mt19937 rng;
+
+    struct ggml_tensor * inputs = nullptr;
+    struct ggml_tensor * outputs = nullptr;
+    struct ggml_tensor * labels = nullptr;
+
+    struct ggml_tensor * loss = nullptr;
+    struct ggml_tensor * pred = nullptr;
+    struct ggml_tensor * ncorrect = nullptr;
+
+    struct ggml_cgraph * gf = nullptr;
+    struct ggml_cgraph * gb_grad = nullptr;
+    struct ggml_cgraph * gb_opt = nullptr;
+
+    int64_t iter = 1;
+    int32_t opt_period = 1;
+    int32_t opt_i = 0;
+    bool loss_per_datapoint = false;
+
+    ggml_opt_get_optimizer_params get_opt_pars = nullptr;
+    void * get_opt_pars_ud = nullptr;
+    struct ggml_tensor * adamw_params = nullptr;
+};
+
+struct ggml_opt_result {
+    int64_t ndata = 0;
+    std::vector<float> loss;
+    std::vector<int32_t> pred;
+    int64_t ncorrect = 0;
+
+    int64_t opt_period = -1;
+    bool loss_per_datapoint = false;
+};
+
+// ====== Dataset ======
+
+ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) {
+    GGML_ASSERT(ne_datapoint > 0);
+    GGML_ASSERT(ne_label >= 0);
+    GGML_ASSERT(ndata > 0);
+    GGML_ASSERT(ndata_shard > 0);
+
+    ggml_opt_dataset_t result = new ggml_opt_dataset;
+    result->ndata = ndata;
+    result->ndata_shard = ndata_shard;
+
+    {
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ 2*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        result->ctx = ggml_init(params);
+    }
+
+    result->data = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_datapoint, ndata);
+    result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;
+
+    if (ne_label > 0) {
+        result->labels = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_label, ndata);
+        result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
+    } else {
+        result->labels = nullptr;
+        result->nbs_labels = 0;
+    }
+
+    result->buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_cpu_buffer_type());
+
+    const int64_t nshards = ndata/ndata_shard;
+    result->permutation.resize(nshards);
+    for (int64_t i = 0; i < nshards; ++i) {
+        result->permutation[i] = i;
+    }
+    return result;
+}
+
+void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) {
+    ggml_backend_buffer_free(dataset->buf);
+    ggml_free(dataset->ctx);
+    delete dataset;
+}
+
+struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
+    return dataset->data;
+}
+
+struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset) {
+    return dataset->labels;
+}
+
+void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata) {
+    GGML_ASSERT(idata <= dataset->ndata);
+
+    if (idata < 0) {
+        std::shuffle(dataset->permutation.begin(), dataset->permutation.end(), opt_ctx->rng);
+        return;
+    }
+
+    GGML_ASSERT(idata % dataset->ndata_shard == 0);
+    const int64_t ishard_max = idata / dataset->ndata_shard;
+    std::shuffle(dataset->permutation.begin(), dataset->permutation.begin() + ishard_max, opt_ctx->rng);
+}
+
+void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor * data_batch, struct ggml_tensor * labels_batch, int64_t ibatch) {
+    GGML_ASSERT( data_batch && ggml_is_contiguous(data_batch));
+    GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
+    GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
+
+    const size_t nb_data_batch = ggml_nbytes(data_batch);
+    GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
+    const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data;
+
+    if (labels_batch) {
+        const size_t nb_labels_batch = ggml_nbytes(labels_batch);
+        GGML_ASSERT(nb_labels_batch == shards_per_batch*dataset->nbs_labels);
+    }
+
+    GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size()));
+
+    for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
+        const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch];
+
+        const char * ptr_data = (const char *) dataset->data->data + ishard*dataset->nbs_data;
+        ggml_backend_tensor_set(data_batch, ptr_data, ishard_batch*dataset->nbs_data, dataset->nbs_data);
+
+        if (!labels_batch) {
+            continue;
+        }
+
+        const char * ptr_labels = (const char *) dataset->labels->data + ishard*dataset->nbs_labels;
+        ggml_backend_tensor_set(labels_batch, ptr_labels, ishard_batch*dataset->nbs_labels, dataset->nbs_labels);
+    }
+}
+
+// ====== Model / Context ======
+
+struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
+    GGML_UNUSED(userdata);
+
+    ggml_opt_optimizer_params result;
+
+    result.adamw.alpha = 0.001f;
+    result.adamw.beta1 = 0.9f;
+    result.adamw.beta2 = 0.999f;
+    result.adamw.eps   = 1e-8f;
+    result.adamw.wd    = 0.0f;
+
+    return result;
+}
+
+struct ggml_opt_params ggml_opt_default_params(
+        ggml_backend_sched_t backend_sched,
+        struct ggml_context * ctx_compute,
+        struct ggml_tensor * inputs,
+        struct ggml_tensor * outputs,
+        enum ggml_opt_loss_type loss_type) {
+    return {
+        /*backend_sched   =*/ backend_sched,
+        /*ctx_compute     =*/ ctx_compute,
+        /*inputs          =*/ inputs,
+        /*logits          =*/ outputs,
+        /*loss_type       =*/ loss_type,
+        /*build_type      =*/ GGML_OPT_BUILD_TYPE_OPT,
+        /*opt_period      =*/ 1,
+        /*get_opt_pars    =*/ ggml_opt_get_default_optimizer_params,
+        /*get_opt_pars_ud =*/ nullptr,
+    };
+}
+
+static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor) {
+    if (!tensor) {
+        return nullptr;
+    }
+
+    if (tensor_map.find(tensor) != tensor_map.end()) {
+        return tensor_map[tensor];
+    }
+
+    ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
+    tensor_map[tensor] = new_tensor;
+
+    new_tensor->op = tensor->op;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        new_tensor->nb[i] = tensor->nb[i];
+    }
+    new_tensor->flags = tensor->flags;
+    memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params));
+    strcpy(new_tensor->name, tensor->name);
+    new_tensor->data = tensor->data;
+    new_tensor->buffer = tensor->buffer;
+    new_tensor->extra = tensor->extra;
+    new_tensor->view_offs = tensor->view_offs;
+    new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src);
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i]);
+    }
+
+    return new_tensor;
+}
+
+static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
+    std::map<ggml_tensor *, ggml_tensor *> tensor_map;
+
+    ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true);
+
+    for (int i = 0; i < src->n_leafs; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
+    }
+    GGML_ASSERT(dst->n_leafs == src->n_leafs);
+    for (int i = 0; i < src->n_nodes; i++) {
+        ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
+    }
+    GGML_ASSERT(dst->n_nodes == src->n_nodes);
+    for (int i = 0; i < src->n_nodes; ++i) {
+        const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
+        const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+
+        GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+        GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
+
+        dst->grads[igrad_dst]     = src->grads[igrad_src];
+        dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
+    }
+
+    return dst;
+}
+
+static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
+    GGML_ASSERT(graph);
+    if (opt_ctx->allocated_graph == graph) {
+        return;
+    }
+
+    ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
+
+    {
+        ggml_init_params params = {
+            /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE,
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_free(opt_ctx->ctx_copy);
+        opt_ctx->ctx_copy = ggml_init(params);
+    }
+
+    opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
+
+    ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
+    opt_ctx->allocated_graph = graph;
+}
+
+ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
+    ggml_opt_context_t result = new struct ggml_opt_context;
+    result->backend_sched   = params.backend_sched;
+    result->ctx_compute     = params.ctx_compute;
+    result->inputs          = params.inputs;
+    result->outputs         = params.outputs;
+    result->opt_period      = params.opt_period;
+    result->get_opt_pars    = params.get_opt_pars;
+    result->get_opt_pars_ud = params.get_opt_pars_ud;
+
+    GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
+    GGML_ASSERT(result->opt_period >= 1);
+
+    const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD ||
+        (params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
+
+    ggml_set_input(result->inputs);
+    ggml_set_output(result->outputs);
+
+    result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
+ ggml_build_forward_expand(result->gf, result->outputs); + + int n_param = 0; + for (int i = 0; i < result->gf->n_nodes; ++i) { + if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) { + n_param++; + } + } + + { + // The static context is used for: + // - gradients (1 tensor per param if using gradient accumulation) + // - optimizer momenta (2 tensors per param) + // - labels + // - loss + its gradient (up to 5 tensors) + // - pred + // - ncorrect (2 tensors). + const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0); + const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead(); + struct ggml_init_params params = { + /*.mem_size =*/ size_meta, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + result->ctx_static = ggml_init(params); + } + { + // The static cpu context is used for: + // - optimizer parameters (1 for the entire context) + const size_t size_meta = 1 * ggml_tensor_overhead(); + struct ggml_init_params params = { + /*.mem_size =*/ size_meta, + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + result->ctx_static_cpu = ggml_init(params); + } + + + switch (params.loss_type) { + case GGML_OPT_LOSS_TYPE_MEAN: { + result->loss = ggml_sum(result->ctx_static, result->outputs); + ggml_set_name(result->loss, "loss_sum"); + const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs)); + result->loss = ggml_scale(result->ctx_static, result->loss, scale); + ggml_set_name(result->loss, "loss_mean"); + result->loss_per_datapoint = true; + break; + } + case GGML_OPT_LOSS_TYPE_SUM: { + result->loss = ggml_sum(result->ctx_static, result->outputs); + ggml_set_name(result->loss, "loss_sum"); + result->loss_per_datapoint = false; + break; + } + case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: { + result->labels = ggml_dup_tensor(result->ctx_static, result->outputs); + ggml_set_input(result->labels); + ggml_set_name(result->labels, "labels"); + result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels); + ggml_set_name(result->loss, "loss_cross_entropy"); + if (result->opt_period > 1) { + result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period); + ggml_set_name(result->loss, "loss_cross_entropy_scaled"); + } + result->loss_per_datapoint = true; + break; + } + case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: { + result->labels = ggml_dup_tensor(result->ctx_static, result->outputs); + ggml_set_input(result->labels); + ggml_set_name(result->labels, "labels"); + result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels); + ggml_set_name(result->loss, "loss_error"); + result->loss = ggml_sqr(result->ctx_static, result->loss); + ggml_set_name(result->loss, "loss_squared_error"); + result->loss = ggml_sum(result->ctx_static, result->loss); + ggml_set_name(result->loss, "loss_sum_squared_error"); + const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs)); + result->loss = ggml_scale(result->ctx_static, result->loss, scale); + ggml_set_name(result->loss, "loss_mean_squared_error"); + result->loss_per_datapoint = true; + break; + } + } + ggml_set_output(result->loss); + ggml_set_loss(result->loss); + ggml_build_forward_expand(result->gf, result->loss); + + result->pred = ggml_argmax(result->ctx_static, result->outputs); + ggml_set_name(result->pred, "pred"); + ggml_set_output(result->pred); + ggml_build_forward_expand(result->gf, result->pred); + + if (result->labels) { + result->ncorrect = 
ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels)); + ggml_set_name(result->ncorrect, "ncorrect"); + ggml_set_output(result->ncorrect); + ggml_build_forward_expand(result->gf, result->ncorrect); + } else { + result->ncorrect = nullptr; + } + + if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) { + result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); + return result; + } + + // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients. + result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf); + ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate); + + if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) { + result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); + ggml_graph_reset(result->gb_grad); + return result; + } + + GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT); + + // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step. + result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad); + + result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7); + ggml_set_input(result->adamw_params); + ggml_set_name(result->adamw_params, "adamw_params"); + + for (int i = result->gf->n_nodes-1; i >= 0; --i) { + struct ggml_tensor * node = result->gb_opt->nodes[i]; + struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node); + + if (node->flags & GGML_TENSOR_FLAG_PARAM) { + struct ggml_tensor * m = ggml_dup_tensor(result->ctx_static, node); + struct ggml_tensor * v = ggml_dup_tensor(result->ctx_static, node); + struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params); + ggml_build_forward_expand(result->gb_opt, opt_step); + } + } + + result->buf_static = ggml_backend_alloc_ctx_tensors( + result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0)); + + result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type()); + + ggml_graph_reset(result->gb_opt); + + return result; +} + +void ggml_opt_free(ggml_opt_context_t opt_ctx) { + if (opt_ctx == nullptr) { + return; + } + ggml_backend_buffer_free(opt_ctx->buf_static); + ggml_backend_buffer_free(opt_ctx->buf_static_cpu); + ggml_free(opt_ctx->ctx_static); + ggml_free(opt_ctx->ctx_static_cpu); + delete opt_ctx; +} + +void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer) { + if (optimizer) { + ggml_graph_reset(opt_ctx->gb_opt); + opt_ctx->iter = 1; + } else { + ggml_graph_reset(opt_ctx->gb_grad); + } +} + +struct ggml_tensor * ggml_opt_inputs(ggml_opt_context_t opt_ctx) { + return opt_ctx->inputs; +} + +struct ggml_tensor * ggml_opt_outputs(ggml_opt_context_t opt_ctx) { + return opt_ctx->outputs; +} + +struct ggml_tensor * ggml_opt_labels(ggml_opt_context_t opt_ctx) { + return opt_ctx->labels; +} + +struct ggml_tensor * ggml_opt_loss(ggml_opt_context_t opt_ctx) { + return opt_ctx->loss; +} + +struct ggml_tensor * ggml_opt_pred(ggml_opt_context_t opt_ctx) { + return opt_ctx->pred; +} + +struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx) { + return opt_ctx->ncorrect; +} + +struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node) { + return 
ggml_graph_get_grad_acc(opt_ctx->gb_opt, node); +} + +// ====== Optimization Result ====== + +ggml_opt_result_t ggml_opt_result_init() { + return new ggml_opt_result; +} + +void ggml_opt_result_free(ggml_opt_result_t result) { + delete result; +} + +void ggml_opt_result_reset(ggml_opt_result_t result) { + result->ndata = 0; + result->loss.clear(); + result->pred.clear(); + result->ncorrect = 0; +} + +void ggml_opt_result_ndata(ggml_opt_result_t result, int64_t * ndata) { + *ndata = result->ndata; +} + +void ggml_opt_result_loss(ggml_opt_result_t result, double * loss, double * unc) { + const int64_t nbatches = result->loss.size(); // Number of physical batches. + + if (nbatches == 0) { + *loss = 0.0; + *unc = NAN; + return; + } + + double sum = 0.0; + double sum_squared = 0.0; + + for (const float & loss : result->loss) { + // If the loss is per datapoint it was scaled by 1.0f/opt_period for each physical batch. + const float loss_scaled = result->loss_per_datapoint ? loss*result->opt_period : loss; + sum += loss_scaled; + sum_squared += loss_scaled*loss_scaled; + } + + const double mean = sum/nbatches; + *loss = result->loss_per_datapoint ? mean : sum; + + if (!unc) { + return; + } + + if (nbatches < 2) { + *unc = NAN; + return; + } + + const double var_sum = sum_squared/nbatches - mean*mean; // variance without Bessel's correction, i.e. nbatches/(nbatches-1) + *unc = result->loss_per_datapoint ? sqrt(var_sum / (nbatches - 1)) : sqrt(var_sum * nbatches/(nbatches - 1)); +} + +void ggml_opt_result_pred(ggml_opt_result_t result, int32_t * pred) { + for (size_t i = 0; i < result->pred.size(); ++i) { + pred[i] = result->pred[i]; + } +} + +void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc) { + *accuracy = result->ncorrect >= 0 ? double(result->ncorrect) / double(result->ndata) : NAN; + + if (!unc) { + return; + } + + *unc = result->ncorrect >= 0 && result->ndata >= 2 ? 
+        sqrt((*accuracy) * (1.0 - (*accuracy)) / double(result->ndata - 1)) : NAN;
+}
+
+// ====== Computation ======
+
+static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
+    if (graph != opt_ctx->gf) {
+        struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
+
+        GGML_ASSERT(opt_pars.adamw.alpha >  0.0f);
+        GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
+        GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
+        GGML_ASSERT(opt_pars.adamw.eps   >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.wd    >= 0.0f);
+        GGML_ASSERT(opt_pars.adamw.wd    <= 1.0f);
+
+        // beta1, beta2 after applying warmup
+        const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
+        const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
+
+        float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params);
+        adamw_par_data[0] = opt_pars.adamw.alpha;
+        adamw_par_data[1] = opt_pars.adamw.beta1;
+        adamw_par_data[2] = opt_pars.adamw.beta2;
+        adamw_par_data[3] = opt_pars.adamw.eps;
+        adamw_par_data[4] = opt_pars.adamw.wd;
+        adamw_par_data[5] = beta1h;
+        adamw_par_data[6] = beta2h;
+    }
+
+    ggml_opt_alloc_graph(opt_ctx, graph);
+    ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
+    opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
+
+    if (!result) {
+        return;
+    }
+
+    if (result->ndata == 0) {
+        result->loss_per_datapoint = opt_ctx->loss_per_datapoint;
+        result->opt_period         = opt_ctx->opt_period;
+    } else {
+        GGML_ASSERT(result->loss_per_datapoint == opt_ctx->loss_per_datapoint);
+        GGML_ASSERT(result->opt_period         == opt_ctx->opt_period);
+    }
+
+    const int64_t ndata = opt_ctx->outputs->ne[1];
+    GGML_ASSERT(result->ndata == ndata*int64_t(result->loss.size()) && "varying batch size not supported");
+    result->ndata += ndata;
+
+    GGML_ASSERT(ggml_is_scalar(opt_ctx->loss));
+    GGML_ASSERT(opt_ctx->loss->type == GGML_TYPE_F32);
+    float loss;
+    ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
+    result->loss.push_back(loss);
+
+    GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
+    std::vector<int32_t> pred(ndata);
+    ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
+    result->pred.insert(result->pred.end(), pred.begin(), pred.end());
+
+    if (!opt_ctx->labels || result->ncorrect < 0) {
+        result->ncorrect = -1;
+        return;
+    }
+
+    GGML_ASSERT(ggml_is_scalar(opt_ctx->ncorrect));
+    GGML_ASSERT(opt_ctx->ncorrect->type == GGML_TYPE_I64);
+    int64_t ncorrect;
+    ggml_backend_tensor_get(opt_ctx->ncorrect, &ncorrect, 0, ggml_nbytes(opt_ctx->ncorrect));
+    result->ncorrect += ncorrect;
+}
+
+void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
+    ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result);
+}
+
+void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
+    if (opt_ctx->opt_period == 1) {
+        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
+        return;
+    }
+
+    const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
+    if (opt_i_next == 0) {
+        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
+        ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
+    } else {
+        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result);
+    }
+    opt_ctx->opt_i = opt_i_next;
+}
+
+// ====== High-Level Functions ======
+
+void ggml_opt_epoch(
+        ggml_opt_context_t opt_ctx,
+        ggml_opt_dataset_t dataset,
+        ggml_opt_result_t 
result_train, + ggml_opt_result_t result_eval, + int64_t idata_split, + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval) { + struct ggml_tensor * inputs = ggml_opt_inputs(opt_ctx); + struct ggml_tensor * labels = ggml_opt_labels(opt_ctx); + struct ggml_tensor * data = ggml_opt_dataset_data(dataset); + GGML_ASSERT(data->ne[0] == inputs->ne[0]); + + const int64_t ndata = data->ne[1]; + const int64_t ndata_batch = inputs->ne[1]; + + GGML_ASSERT(data->ne[1] % inputs->ne[1] == 0); + const int64_t nbatches = ndata/ndata_batch; + + idata_split = idata_split < 0 ? ndata : idata_split; + GGML_ASSERT(idata_split % ndata_batch == 0); + const int64_t ibatch_split = idata_split / ndata_batch; + + int64_t ibatch = 0; + int64_t t_loop_start = ggml_time_us(); + for (; ibatch < ibatch_split; ++ibatch) { + ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch); + ggml_opt_forward_backward(opt_ctx, result_train); + if (callback_train) { + callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start); + } + } + t_loop_start = ggml_time_us(); + for (; ibatch < nbatches; ++ibatch) { + ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch); + ggml_opt_forward(opt_ctx, result_eval); + if (callback_eval) { + callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start); + } + } +} + +void ggml_opt_epoch_callback_progress_bar( + bool train, + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + int64_t ibatch, + int64_t ibatch_max, + int64_t t_start_us) { + fprintf(stderr, "%s[", train ? "train: " : "val: "); + + constexpr int64_t bar_length = 25; + for (int64_t j = 0; j < bar_length; ++j) { + const int64_t ibatch_j = ibatch_max * j/bar_length; + if (ibatch_j < ibatch) { + fprintf(stderr, "="); + } else if (ibatch_max * (j - 1)/bar_length < ibatch) { + fprintf(stderr, ">"); + } else { + fprintf(stderr, " "); + } + } + + const int64_t batch_size = ggml_opt_inputs(opt_ctx)->ne[1]; + const int64_t idata = ibatch*batch_size; + const int64_t idata_max = ibatch_max*batch_size; + + double loss; + double loss_unc; + ggml_opt_result_loss(result, &loss, &loss_unc); + + double accuracy; + double accuracy_unc; + ggml_opt_result_accuracy(result, &accuracy, &accuracy_unc); + + const int64_t t_ibatch_us = ggml_time_us() - t_start_us; + int64_t t_ibatch_s = t_ibatch_us / 1000000; + const int64_t t_ibatch_h = t_ibatch_s / 3600; + t_ibatch_s -= t_ibatch_h * 3600; + const int64_t t_ibatch_m = t_ibatch_s / 60; + t_ibatch_s -= t_ibatch_m * 60; + + const int64_t t_eta_us = t_ibatch_us * (ibatch_max - ibatch)/ibatch; + int64_t t_eta_s = t_eta_us / 1000000; + const int64_t t_eta_h = t_eta_s / 3600; + t_eta_s -= t_eta_h * 3600; + const int64_t t_eta_m = t_eta_s / 60; + t_eta_s -= t_eta_m * 60; + + fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, " + "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r", + idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc, + t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s); + if (ibatch == ibatch_max) { + fprintf(stderr, "\n"); + } + fflush(stderr); + + GGML_UNUSED(dataset); +} + +void ggml_opt_fit( + ggml_backend_sched_t backend_sched, + ggml_context * ctx_compute, + ggml_tensor * inputs, + ggml_tensor * outputs, + ggml_opt_dataset_t dataset, + enum ggml_opt_loss_type loss_type, + ggml_opt_get_optimizer_params get_opt_pars, + 
int64_t nepoch, + int64_t nbatch_logical, + float val_split, + bool silent) { + ggml_time_init(); + const int64_t t_start_us = ggml_time_us(); + + const int64_t ndata = ggml_opt_dataset_data(dataset)->ne[1]; + const int64_t nbatch_physical = inputs->ne[1]; + GGML_ASSERT(ndata % nbatch_logical == 0); + GGML_ASSERT(nbatch_logical % nbatch_physical == 0); + + const int64_t opt_period = nbatch_logical / nbatch_physical; + const int64_t nbatches_logical = ndata / nbatch_logical; + + GGML_ASSERT(val_split >= 0.0f); + GGML_ASSERT(val_split < 1.0f); + const int64_t ibatch_split = int64_t(((1.0f - val_split) * nbatches_logical)) * opt_period; // train <-> val split index (physical) + const int64_t idata_split = ibatch_split * nbatch_physical; + + int64_t epoch = 1; + + ggml_opt_params params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type); + params.opt_period = opt_period; + params.get_opt_pars = get_opt_pars; + params.get_opt_pars_ud = &epoch; + ggml_opt_context_t opt_ctx = ggml_opt_init(params); + + // Shuffling the data is generally useful but there is only a point if not all data is used in a single batch. + if (nbatch_logical < ndata) { + ggml_opt_dataset_shuffle(opt_ctx, dataset, -1); // Shuffle all data (train + validation). + } + + ggml_opt_result_t result_train = ggml_opt_result_init(); + ggml_opt_result_t result_val = ggml_opt_result_init(); + + ggml_opt_epoch_callback epoch_callback = silent ? nullptr : ggml_opt_epoch_callback_progress_bar; + + for (; epoch <= nepoch; ++epoch) { + if (nbatch_logical < idata_split) { + ggml_opt_dataset_shuffle(opt_ctx, dataset, idata_split); + } + + ggml_opt_result_reset(result_train); + ggml_opt_result_reset(result_val); + + if (!silent) { + fprintf(stderr, "%s: epoch %04" PRId64 "/%04" PRId64 ":\n", __func__, epoch, nepoch); + } + ggml_opt_epoch(opt_ctx, dataset, result_train, result_val, idata_split, epoch_callback, epoch_callback); + if (!silent) { + fprintf(stderr, "\n"); + } + } + + if (!silent) { + int64_t t_total_s = (ggml_time_us() - t_start_us) / 1000000; + const int64_t t_total_h = t_total_s / 3600; + t_total_s -= t_total_h * 3600; + const int64_t t_total_m = t_total_s / 60; + t_total_s -= t_total_m * 60; + fprintf(stderr, "%s: training took %02" PRId64 ":%02" PRId64 ":%02" PRId64 "\n", __func__, t_total_h, t_total_m, t_total_s); + } + + ggml_opt_free(opt_ctx); + ggml_opt_result_free(result_train); + ggml_opt_result_free(result_val); +} diff --git a/llama/ggml-quants.c b/ml/backend/ggml/ggml/src/ggml-quants.c similarity index 99% rename from llama/ggml-quants.c rename to ml/backend/ggml/ggml/src/ggml-quants.c index 6f824d420..7918388ae 100644 --- a/llama/ggml-quants.c +++ b/ml/backend/ggml/ggml/src/ggml-quants.c @@ -1,35 +1,9 @@ -/** - * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
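The dataset and fitting entry points above are small enough to sketch end to end. First, a hedged usage example for the dataset API (shapes are illustrative, population of the data/labels tensors is elided, and the opt_ctx and batch tensors are assumed to exist):

    #include "ggml-opt.h"

    static void dataset_usage_sketch(ggml_opt_context_t opt_ctx,
                                     struct ggml_tensor * inputs_batch,
                                     struct ggml_tensor * labels_batch) {
        // 784-wide datapoints, 10-wide labels, 60000 datapoints, shards of 60.
        ggml_opt_dataset_t dataset = ggml_opt_dataset_init(784, 10, 60000, 60);

        // Fill ggml_opt_dataset_data(dataset) / ggml_opt_dataset_labels(dataset)
        // via ggml_backend_tensor_set() before batching.

        ggml_opt_dataset_shuffle(opt_ctx, dataset, -1); // idata < 0 shuffles everything

        const int64_t nbatches = 60000 / inputs_batch->ne[1];
        for (int64_t ibatch = 0; ibatch < nbatches; ++ibatch) {
            ggml_opt_dataset_get_batch(dataset, inputs_batch, labels_batch, ibatch);
            // ... e.g. ggml_opt_forward_backward(opt_ctx, result) on this batch ...
        }

        ggml_opt_dataset_free(dataset);
    }

And a hedged call sketch for ggml_opt_fit, which bundles the context setup, shuffling, and epoch loop (all handles come from earlier setup; the batch sizes respect the divisibility asserts in the function):

    static void fit_sketch(ggml_backend_sched_t backend_sched, ggml_context * ctx_compute,
                           ggml_tensor * inputs, ggml_tensor * outputs, ggml_opt_dataset_t dataset) {
        // nbatch_logical must divide ndata and be a multiple of the physical
        // batch size inputs->ne[1]; 500 fits the 60000-datapoint example above.
        ggml_opt_fit(backend_sched, ctx_compute, inputs, outputs, dataset,
                     GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
                     ggml_opt_get_default_optimizer_params,
                     /*nepoch         =*/ 30,
                     /*nbatch_logical =*/ 500,
                     /*val_split      =*/ 0.05f,
                     /*silent         =*/ false);
    }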
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #define GGML_COMMON_IMPL_C
 #include "ggml-common.h"
 
 #include "ggml-quants.h"
 #include "ggml-impl.h"
-#include "ggml-cpu-impl.h"
+#include "ggml-cpu/ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 
 #include <math.h>
diff --git a/llama/ggml-quants.h b/ml/backend/ggml/ggml/src/ggml-quants.h
similarity index 87%
rename from llama/ggml-quants.h
rename to ml/backend/ggml/ggml/src/ggml-quants.h
index cf518ba08..d09173e11 100644
--- a/llama/ggml-quants.h
+++ b/ml/backend/ggml/ggml/src/ggml-quants.h
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #pragma once
 
 #define GGML_COMMON_DECL_C
diff --git a/ml/backend/ggml/ggml/src/ggml-threading.cpp b/ml/backend/ggml/ggml/src/ggml-threading.cpp
new file mode 100644
index 000000000..25a19eedb
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-threading.cpp
@@ -0,0 +1,12 @@
+#include "ggml-threading.h"
+#include <mutex>
+
+std::mutex ggml_critical_section_mutex;
+
+void ggml_critical_section_start() {
+    ggml_critical_section_mutex.lock();
+}
+
+void ggml_critical_section_end(void) {
+    ggml_critical_section_mutex.unlock();
+}
diff --git a/ml/backend/ggml/ggml/src/ggml-threading.h b/ml/backend/ggml/ggml/src/ggml-threading.h
new file mode 100644
index 000000000..dec2c8840
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-threading.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+GGML_API void ggml_critical_section_start(void);
+GGML_API void ggml_critical_section_end(void);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/llama/ggml.c b/ml/backend/ggml/ggml/src/ggml.c
similarity index 99%
rename from llama/ggml.c
rename to ml/backend/ggml/ggml/src/ggml.c
index 8d442e08a..7ffcd907d 100644
--- a/llama/ggml.c
+++ b/ml/backend/ggml/ggml/src/ggml.c
@@ -1,29 +1,3 @@
-/**
- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
 #define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
diff --git a/ml/backend/ggml/ggml/src/ggml.go b/ml/backend/ggml/ggml/src/ggml.go
new file mode 100644
index 000000000..7cf40e705
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml.go
@@ -0,0 +1,81 @@
+package ggml
+
+// #cgo CXXFLAGS: -std=c++17
+// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU
+// #cgo CPPFLAGS: -I${SRCDIR}/../include -I${SRCDIR}/ggml-cpu
+// #cgo windows LDFLAGS: -lmsvcrt -static -static-libgcc -static-libstdc++
+// #include <stdlib.h>
+// #include "ggml-backend.h"
+// extern void sink(int level, char *text, void *user_data);
+import "C"
+
+import (
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"sync"
+	"unsafe"
+
+	_ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-cpu"
+)
+
+func init() {
+	C.ggml_log_set((C.ggml_log_callback)(C.sink), nil)
+}
+
+//export sink
+func sink(level C.int, text *C.char, _ unsafe.Pointer) {
+	msg := strings.TrimSpace(C.GoString(text))
+	switch level {
+	case C.GGML_LOG_LEVEL_DEBUG:
+		slog.Debug(msg)
+	case C.GGML_LOG_LEVEL_INFO:
+		slog.Info(msg)
+	case C.GGML_LOG_LEVEL_WARN:
+		slog.Warn(msg)
+	case C.GGML_LOG_LEVEL_ERROR:
+		slog.Error(msg)
+	}
+}
+
+var OnceLoad = sync.OnceFunc(func() {
+	var lib struct{ name, defaultValue string }
+	switch runtime.GOOS {
+	case "darwin", "linux":
+		lib.name = "LD_LIBRARY_PATH"
+		lib.defaultValue = "/usr/local/lib:/usr/lib"
+	case "windows":
+		lib.name = "PATH"
+		lib.defaultValue = "."
+	default:
+		return
+	}
+
+	paths, ok := os.LookupEnv(lib.name)
+	if !ok {
+		paths = lib.defaultValue
+	}
+
+	if runtime.GOOS == "darwin" {
+		if _, ok := os.LookupEnv("DYLD_LIBRARY_PATH"); !ok {
+			os.Setenv("DYLD_LIBRARY_PATH", paths)
+		}
+	}
+
+	split := filepath.SplitList(paths)
+	visited := make(map[string]struct{}, len(split))
+	for _, path := range split {
+		abspath, _ := filepath.Abs(path)
+		if _, ok := visited[abspath]; !ok {
+			func() {
+				cpath := C.CString(path)
+				defer C.free(unsafe.Pointer(cpath))
+				C.ggml_backend_load_all_from_path(cpath)
+			}()
+
+			visited[abspath] = struct{}{}
+		}
+	}
+})
diff --git a/ml/backend/ggml/ggml/src/ggml_darwin_arm64.go b/ml/backend/ggml/ggml/src/ggml_darwin_arm64.go
new file mode 100644
index 000000000..beffa64e2
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml_darwin_arm64.go
@@ -0,0 +1,10 @@
+package ggml
+
+// #cgo CPPFLAGS: -DGGML_USE_METAL -DGGML_USE_BLAS
+// #cgo LDFLAGS: -framework Foundation
+import "C"
+
+import (
+	_ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-blas"
+	_ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-metal"
+)
diff --git a/ml/backend/ggml/ggml_debug.go b/ml/backend/ggml/ggml_debug.go
new file mode 100644
index 000000000..9ddb2718c
--- /dev/null
+++ b/ml/backend/ggml/ggml_debug.go
@@ -0,0 +1,6 @@
+//go:build debug
+
+package ggml
+
+// #cgo CPPFLAGS: -DOLLAMA_DEBUG
+import "C"
diff --git a/runners/common.go b/runners/common.go
deleted file mode 100644
index 11279ed44..000000000
--- a/runners/common.go
+++ /dev/null
@@ -1,207 +0,0 @@
-package runners
-
-import (
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"slices"
-	"strings"
-	"sync"
-
-	"golang.org/x/sys/cpu"
-
-	"github.com/ollama/ollama/envconfig"
-)
-
-var (
-	runnersDir = ""
-	once       = sync.Once{}
-)
-
-type CPUCapability uint32
-
-// Override at build time when building base GPU runners
-// var GPURunnerCPUCapability = CPUCapabilityAVX
-
-const (
-	CPUCapabilityNone CPUCapability = iota
-	CPUCapabilityAVX
-	CPUCapabilityAVX2
-	// TODO AVX512
-)
-
-func (c CPUCapability) String() string {
-	switch c {
-	case CPUCapabilityAVX:
-		return "avx"
-	case CPUCapabilityAVX2:
-		return "avx2"
-	default:
-		return "no vector extensions"
-	}
-}
-
-func GetCPUCapability() CPUCapability {
-	if cpu.X86.HasAVX2 {
-		return CPUCapabilityAVX2
-	}
-	if cpu.X86.HasAVX {
-		return CPUCapabilityAVX
-	}
-	// else LCD
-	return CPUCapabilityNone
-}
-
-// Return the location where runners were located
-// empty string indicates only builtin is present
-func Locate() string {
-	once.Do(locateRunnersOnce)
-	return runnersDir
-}
-
-// searches for runners in a prioritized set of locations
-// 1. local build, with executable at the top of the tree
-// 2. lib directory relative to executable
-func locateRunnersOnce() {
-	exe, err := os.Executable()
-	if err != nil {
-		slog.Debug("runner locate", "error", err)
-	}
-
-	paths := []string{
-		filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
-		filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
-		filepath.Join(filepath.Dir(exe), "lib", "ollama", "runners"),
-	}
-	for _, path := range paths {
-		if _, err := os.Stat(path); err == nil {
-			runnersDir = path
-			slog.Debug("runners located", "dir", runnersDir)
-			return
-		}
-	}
-	// Fall back to built-in
-	slog.Debug("no dynamic runners detected, using only built-in")
-	runnersDir = ""
-}
-
-// Return the well-known name of the builtin runner for the given platform
-func BuiltinName() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return "metal"
-	}
-	return "cpu"
-}
-
-// directory names are the name of the runner and may contain an optional
-// variant prefixed with '_' as the separator. For example, "cuda_v11" and
-// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
-// lowest common denominator
-func GetAvailableServers() map[string]string {
-	once.Do(locateRunnersOnce)
-
-	servers := make(map[string]string)
-	exe, err := os.Executable()
-	if err == nil {
-		servers[BuiltinName()] = exe
-	}
-
-	if runnersDir == "" {
-		return servers
-	}
-
-	// glob runnersDir for files that start with ollama_
-	pattern := filepath.Join(runnersDir, "*", "ollama_*")
-
-	files, err := filepath.Glob(pattern)
-	if err != nil {
-		slog.Debug("could not glob", "pattern", pattern, "error", err)
-		return nil
-	}
-
-	for _, file := range files {
-		slog.Debug("availableServers : found", "file", file)
-		runnerName := filepath.Base(filepath.Dir(file))
-		// Special case for our GPU runners - if compiled with standard AVX flag
-		// detect incompatible system
-		// Custom builds will omit this and its up to the user to ensure compatibility
-		parsed := strings.Split(runnerName, "_")
-		if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX {
-			slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName)
-			continue
-		}
-		servers[runnerName] = file
-	}
-
-	return servers
-}
-
-// serversForGpu returns a list of compatible servers give the provided GPU library/variant
-func ServersForGpu(requested string) []string {
-	// glob workDir for files that start with ollama_
-	availableServers := GetAvailableServers()
-
-	// Short circuit if the only option is built-in
-	if _, ok := availableServers[BuiltinName()]; ok && len(availableServers) == 1 {
-		return []string{BuiltinName()}
-	}
-
-	bestCPUVariant := GetCPUCapability()
-	requestedLib := strings.Split(requested, "_")[0]
-	servers := []string{}
-
-	// exact match first
-	for a := range availableServers {
-		short := a
-		parsed := strings.Split(a, "_")
-		if len(parsed) == 3 {
-			// Strip off optional _avx for comparison
-			short = parsed[0] + "_" + parsed[1]
-		}
-		if a == requested || short == requested {
-			servers = []string{a}
-		}
-	}
-
-	// If no exact match, then try without variant
-	if len(servers) == 0 {
-		alt := []string{}
-		for a := range availableServers {
-			if requestedLib == strings.Split(a, "_")[0] && a != requested {
-				alt = append(alt, a)
-			}
-		}
-		slices.Sort(alt)
-		servers = append(servers, alt...)
-	}
-
-	// Finally append the best CPU option if found, then builtin
-	if bestCPUVariant != CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+bestCPUVariant.String() {
-				servers = append(servers, cmp)
-				break
-			}
-		}
-	}
-	servers = append(servers, BuiltinName())
-	return servers
-}
-
-// Return the optimal server for this CPU architecture
-func ServerForCpu() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return BuiltinName()
-	}
-	variant := GetCPUCapability()
-	availableServers := GetAvailableServers()
-	if variant != CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+variant.String() {
-				return cmp
-			}
-		}
-	}
-	return BuiltinName()
-}
diff --git a/scripts/build.sh b/scripts/build.sh
deleted file mode 100644
index a50dc7dbf..000000000
--- a/scripts/build.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/sh
-
-set -eu
-
-usage() {
-    echo "usage: $(basename $0) VERSION"
-    exit 1
-}
-
-[ "$#" -eq 1 ] || usage
-
-export VERSION="$1"
-
-# build universal MacOS binary
-sh $(dirname $0)/build_darwin.sh
-
-# # build arm64 and amd64 Linux binaries
-sh $(dirname $0)/build_linux.sh
-
-# # build arm64 and amd64 Docker images
-sh $(dirname $0)/build_docker.sh
diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh
index cbf6f61d9..7e586f5ff 100755
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -2,55 +2,92 @@
 
 set -e
 
-. $(dirname $0)/env.sh
+status() { echo >&2 ">>> $@"; }
+usage() {
+    echo "usage: $(basename $0) [build [sign]]"
+    exit 1
+}
 
-mkdir -p dist
+export VERSION=${VERSION:-$(git describe --tags --dirty)}
+export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${VERSION#v}\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+export CGO_CPPFLAGS='-mmacosx-version-min=11.3'
 
-# These require Xcode v13 or older to target MacOS v11
-# If installed to an alternate location use the following to enable
-# export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
-# export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
-export CGO_CFLAGS=-mmacosx-version-min=11.3
-export CGO_CXXFLAGS=-mmacosx-version-min=11.3
-export CGO_LDFLAGS=-mmacosx-version-min=11.3
+ARCHS="arm64 amd64"
+while getopts "a:h" OPTION; do
+    case $OPTION in
+        a) ARCHS=$OPTARG ;;
+        h) usage ;;
+    esac
+done
 
-rm -rf llama/build dist/darwin-*
+shift $(( $OPTIND - 1 ))
 
-# Generate the universal ollama binary for stand-alone usage: metal + avx
-echo "Building binary"
-echo "Building darwin arm64"
-GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
-echo "Building darwin amd64 with AVX enabled"
-GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist_exe
-lipo -create -output dist/ollama-darwin dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
+_build_darwin() {
+    for ARCH in $ARCHS; do
+        status "Building darwin $ARCH"
+        INSTALL_PREFIX=dist/darwin-$ARCH/
+        GOOS=darwin GOARCH=$ARCH CGO_ENABLED=1 go build -o $INSTALL_PREFIX .
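+
+        # The cmake backend build below runs only for amd64; the arm64 binary
+        # links its Metal and BLAS backends statically via cgo (see
+        # ggml_darwin_arm64.go), so it needs no separate dynamic libraries.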
-# sign the binary and rename it
-if [ -n "$APPLE_IDENTITY" ]; then
-    codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama-darwin
-else
-    echo "WARNING: Skipping code signing - set APPLE_IDENTITY"
+        if [ "$ARCH" = "amd64" ]; then
+            status "Building darwin $ARCH dynamic backends"
+            cmake -B build/darwin-$ARCH \
+                -DCMAKE_OSX_ARCHITECTURES=x86_64 \
+                -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3
+            cmake --build build/darwin-$ARCH --target ggml-cpu -j
+            install build/darwin-$ARCH/lib/ollama/*.{dylib,so} $INSTALL_PREFIX
+        fi
+    done
+}
+
+_sign_darwin() {
+    status "Creating universal binary..."
+    lipo -create -output dist/darwin/ollama dist/darwin/*/ollama
+
+    if [ -z "$APPLE_IDENTITY" ]; then
+        status "No APPLE_IDENTITY set, skipping code signing"
+        return
+    fi
+
+    for F in dist/darwin/ollama dist/darwin/amd64/lib*; do
+        codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime $F
+    done
+
+    # create a temporary zip for notarization
+    TEMP=$(mktemp -u).zip
+    ditto -c -k --keepParent dist/darwin/ollama "$TEMP"
+    xcrun notarytool submit "$TEMP" --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
+    rm -f "$TEMP"
+
+    # create a universal tarball
+    tar -cf dist/ollama-darwin.tar --strip-components 2 dist/darwin/ollama
+    tar -rf dist/ollama-darwin.tar --strip-components 3 dist/darwin/amd64/lib*
+    gzip -9vc dist/ollama-darwin.tar > dist/ollama-darwin.tgz
+}
+
+_build_macapp() {
+    # build and optionally sign the mac app
+    npm install --prefix macapp
+    if [ -n "$APPLE_IDENTITY" ]; then
+        npm run --prefix macapp make:sign
+    else
+        npm run --prefix macapp make
+    fi
+
+    mv ./macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
+}
+
+if [ "$#" -eq 0 ]; then
+    _build_darwin
+    _sign_darwin
+    _build_macapp
+    exit 0
 fi
-ditto -c -k --keepParent dist/ollama-darwin dist/temp.zip
-if [ -n "$APPLE_IDENTITY" ]; then
-    xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID
-fi
-rm -f dist/temp.zip
-
-# Build the app bundle
-echo "Building app"
-echo "Building darwin amd64 with runners"
-rm dist/darwin-amd64/bin/ollama
-GOOS=darwin ARCH=amd64 GOARCH=amd64 make -j 8 dist
-
-# Generate the universal ollama binary for the app bundle: metal + no-avx
-lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
-
-# build and optionally sign the mac app
-npm install --prefix macapp
-if [ -n "$APPLE_IDENTITY" ]; then
-    npm run --prefix macapp make:sign
-else
-    npm run --prefix macapp make
-fi
-cp macapp/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip
+for CMD in "$@"; do
+    case $CMD in
+        build) _build_darwin ;;
+        sign) _sign_darwin ;;
+        macapp) _build_macapp ;;
+        *) usage ;;
+    esac
+done
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh
index 894d9dd2e..a0c3d2f00 100755
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -18,7 +18,7 @@ docker buildx build \
     --output type=local,dest=./dist/ \
     --platform=${PLATFORM} \
     ${OLLAMA_COMMON_BUILD_ARGS} \
-    --target dist \
+    --target archive \
    -f Dockerfile \
    .
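The `archive` target selected above refers to a build stage defined by this patch's Dockerfile changes. As a rough sketch only, with stage and path names assumed for illustration rather than copied from the actual Dockerfile, such a stage typically just collects the build outputs into an empty image:

    FROM scratch AS archive
    COPY --from=build /bin/ollama /bin/ollama
    COPY --from=build /lib/ollama/ /lib/ollama/

The local output mode above (`--output type=local,dest=./dist/`) then exports that stage's filesystem into ./dist/ for each requested platform.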
@@ -26,4 +26,4 @@ docker buildx build \
 if echo $PLATFORM | grep "," > /dev/null ; then
     mv -f ./dist/linux_*64/ollama* ./dist/
     rmdir ./dist/linux_*64
-fi
\ No newline at end of file
+fi
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index 0a69c60c6..30cf98276 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -80,18 +80,61 @@ function checkEnv() {
 
 function buildOllama() {
     if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
-        write-host "Building ollama runners"
         Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-        & make -j 12 dist
+        New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0
+
+
+        # Default first, then conditionally ROCm and CUDA v11
+        write-host "Building Default native backend libraries"
+        $env:CMAKE_GENERATOR="Ninja"
+        & cmake --preset Default
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        & cmake --build --preset Default -j 12
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        & cmake --install build
+
+        # TODO - add steps for v11 and ROCm
+        #
+        # if ("$script:CUDA_DIRS".Contains("v11") -and "$script:CUDA_DIRS".Contains("v12")) {
+        #    # We assume the default is v12, so override for v11
+        #    $origCUDA_PATH=$env:CUDA_PATH
+        #    $hashEnv = @{}
+        #    Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
+        #    $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
+        #    write-host "$v11"
+        #    # $env:CUDA_PATH=$hashEnv[$v11]
+        #    # $env:CUDACXX=$hashEnv[$v11]+"\bin\nvcc.exe"
+        #    $env:CUDAToolkit_ROOT=$hashEnv[$v11]
+        #    # ls env:
+        #    write-host "Building CUDA v11 backend libraries"
+        #    & cmake --preset "CUDA 11"
+        #    $env:CUDA_PATH=$origCUDA_PATH
+        #    exit(1)
+        #    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        #    # & cmake --build --preset "CUDA 11" -j 12
+        #    # if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        # }
+
+        # if ($env:HIP_PATH) {
+        #    write-host "Building ROCm backend libraries"
+        #    $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
+        #    $env:HIP_PLATFORM="amd"
+        #    $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+        #    & cmake --preset "ROCm"
+        #    $env:HIPCXX=""
+        #    $env:HIP_PLATFORM=""
+        #    $env:CMAKE_PREFIX_PATH=""
+        #    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        #    & cmake --build --preset "ROCm" -j 12
+        #    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        # }
     } else {
         write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
     }
     write-host "Building ollama CLI"
     & go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" .
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
-    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
+    cp .\ollama.exe "${script:DIST_DIR}\"
 }
 
 function buildApp() {
diff --git a/scripts/fast.sh b/scripts/fast.sh
deleted file mode 100755
index 8fd1e9084..000000000
--- a/scripts/fast.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#/bin/sh
-
-# Wrapper script to speed up builds by disabling some permutations and reduce compatibility matrix
-# Don't use for release builds, but suitable for local developer iteration
-
-# Only build cuda v12
-export OLLAMA_SKIP_CUDA_11_GENERATE=1
-# Major versions only
-export CUDA_V12_ARCHITECTURES="60;70;80;90"
-# Skip ROCm
-export OLLAMA_SKIP_ROCM_GENERATE=1
-# Disable various less common quants and fattn
-export OLLAMA_FAST_BUILD=1
-
-if [ $# -ne 1 ] ; then
-    echo "Usage: ./scripts/fast.sh "
-    exit 1
-fi
-
-exec $1
\ No newline at end of file
diff --git a/scripts/publish.sh b/scripts/publish.sh
deleted file mode 100755
index 5bf15dcbd..000000000
--- a/scripts/publish.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-# Set your variables here.
-REPO="jmorganca/ollama"
-
-# Check if VERSION is set
-if [[ -z "${VERSION}" ]]; then
-    echo "VERSION is not set. Please set the VERSION environment variable."
-    exit 1
-fi
-
-OS=$(go env GOOS)
-
-./script/build_${OS}.sh
-
-# Create a new tag if it doesn't exist.
-if ! git rev-parse v$VERSION >/dev/null 2>&1; then
-    git tag v$VERSION
-fi
-
-git push origin v$VERSION
-
-# Create a new release.
-gh release create -p v$VERSION -t v$VERSION
-
-# Upload the zip file.
-gh release upload v$VERSION ./dist/* --clobber
diff --git a/scripts/rh_linux_deps.sh b/scripts/rh_linux_deps.sh
deleted file mode 100644
index d0cadd453..000000000
--- a/scripts/rh_linux_deps.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/sh
-
-# Script for common Dockerfile dependency installation in redhat linux based images
-
-set -ex
-set -o pipefail
-MACHINE=$(uname -m)
-
-if grep -i "centos" /etc/system-release >/dev/null; then
-    # As of 7/1/2024 mirrorlist.centos.org has been taken offline, so adjust accordingly
-    sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
-    sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
-    sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
-
-    # Centos 7 derivatives have too old of a git version to run our generate script
-    # uninstall and ignore failures
-    yum remove -y git
-    yum -y install epel-release centos-release-scl
-
-    # The release packages reinstate the mirrors, undo that again
-    sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
-    sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
-    sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
-
-    yum -y install dnf
-    if [ "${MACHINE}" = "x86_64" ]; then
-        yum -y install https://repo.ius.io/ius-release-el7.rpm
-        dnf install -y git236
-    else
-        dnf install -y rh-git227-git
-        ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
-    fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz findutils
-elif grep -i "rocky" /etc/system-release >/dev/null; then
-    # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
-    cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
-[vault]
-name=Rocky Vault
-baseurl=https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/
-gpgcheck=1
-enabled=1
-countme=1
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
-EOF
-    dnf install -y git \
-        gcc-toolset-10-gcc-10.2.1-8.2.el8 \
-        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
-        findutils \
-        yum-utils \
-        pigz
-else
-    echo "ERROR Unexpected distro"
-    exit 1
-fi
-
-if [ "${MACHINE}" = "x86_64" ] ; then
-    curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
-        mv /tmp/ccache /usr/local/bin/
-else
-    yum -y install epel-release
-    yum install -y ccache
-fi
-
-if [ -n "${CMAKE_VERSION}" ]; then
-    curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
-fi
-
-if [ -n "${GOLANG_VERSION}" ]; then
-    if [ "${MACHINE}" = "x86_64" ]; then
-        GO_ARCH="amd64"
-    else
-        GO_ARCH="arm64"
-    fi
-    mkdir -p /usr/local
-    curl -s -L https://dl.google.com/go/go${GOLANG_VERSION}.linux-${GO_ARCH}.tar.gz | tar xz -C /usr/local
-    ln -s /usr/local/go/bin/go /usr/local/bin/go
-    ln -s /usr/local/go/bin/gofmt /usr/local/bin/gofmt
-fi
diff --git a/server/routes.go b/server/routes.go
index c2ec360ab..5a4bb485c 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -33,7 +33,6 @@ import (
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/model/mllama"
 	"github.com/ollama/ollama/openai"
-	"github.com/ollama/ollama/runners"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
@@ -1259,14 +1258,6 @@ func Serve(ln net.Listener) error {
 		done()
 	}()
 
-	// Locate and log what runners are present at startup
-	var runnerNames []string
-	for v := range runners.GetAvailableServers() {
-		runnerNames = append(runnerNames, v)
-	}
-	slog.Info("Dynamic LLM libraries", "runners", runnerNames)
-	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-
 	s.sched.Run(schedCtx)
 
 	// At startup we retrieve GPU information so we can get log messages before loading a model