diff --git a/llama/patches/0001-cuda.patch b/llama/patches/0001-cuda.patch deleted file mode 100644 index a766c30c5..000000000 --- a/llama/patches/0001-cuda.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Thu, 6 Jun 2024 23:55:47 -0700 -Subject: [PATCH] cuda - ---- - ggml/src/ggml-backend.cpp | 1 - - ggml/src/ggml-cuda/ggml-cuda.cu | 1 + - ggml/src/ggml-metal/ggml-metal.m | 1 + - 3 files changed, 2 insertions(+), 1 deletion(-) - -diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index dba7be33..1ca40b2c 100644 ---- a/ggml/src/ggml-backend.cpp -+++ b/ggml/src/ggml-backend.cpp -@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { - if (buffer->iface.free_buffer != NULL) { - buffer->iface.free_buffer(buffer); - } -- delete buffer; - } - - size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index ebb2ccae..b094929b 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context { - static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - delete ctx; -+ delete buffer; - } - - static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { -diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index c550142a..fd9a4e77 100644 ---- a/ggml/src/ggml-metal/ggml-metal.m -+++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) - } - - free(ctx); -+ free(buffer); - } - - static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { diff --git a/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch new file mode 100644 index 000000000..bef91881b --- /dev/null +++ b/llama/patches/0001-ggml-backend-malloc-and-free-using-the-same-compiler.patch @@ -0,0 +1,210 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: jmorganca +Date: Thu, 6 Jun 2024 23:55:47 -0700 +Subject: [PATCH] ggml-backend: malloc and free using the same compiler + +On Windows, the CUDA backend must be compiled with MSVC but generic +portions compiled with CGo use either GCC or Clang. Since +ggml_backend_buffer_t spans these two components, it can be allocated +and freed using different compilers. Specifically, it is malloced by +MSVC and freed by Clang, which can cause problems. + +This moves freeing of the buffers into the backends to avoid the +problem. +--- + ggml/src/ggml-backend.cpp | 9 +++++++-- + ggml/src/ggml-cann/ggml-cann.cpp | 2 ++ + ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++ + ggml/src/ggml-kompute/ggml-kompute.cpp | 1 + + ggml/src/ggml-metal/ggml-metal.m | 1 + + ggml/src/ggml-opencl/ggml-opencl.cpp | 1 + + ggml/src/ggml-rpc/ggml-rpc.cpp | 1 + + ggml/src/ggml-sycl/ggml-sycl.cpp | 3 +++ + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++ + 9 files changed, 21 insertions(+), 2 deletions(-) + +diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp +index dba7be33..65e150d6 100644 +--- a/ggml/src/ggml-backend.cpp ++++ b/ggml/src/ggml-backend.cpp +@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { + if (buffer->iface.free_buffer != NULL) { + buffer->iface.free_buffer(buffer); + } +- delete buffer; + } + + size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { +@@ -542,6 +541,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) + + free(ctx->buffers); + free(ctx); ++ delete buffer; + } + + static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +@@ -1865,6 +1865,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { + + static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_aligned_free(buffer->context, buffer->size); ++ delete buffer; ++} ++ ++static void ggml_backend_cpu_ptr_buffer_free_buffer(ggml_backend_buffer_t buffer) { ++ delete buffer; + } + + static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +@@ -1912,7 +1917,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { + }; + + static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { +- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed ++ /* .free_buffer = */ ggml_backend_cpu_ptr_buffer_free_buffer, // ptr is not owned by the buffer but need to free the buffer itself + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, +diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp +index d410c024..a207ab1e 100644 +--- a/ggml/src/ggml-cann/ggml-cann.cpp ++++ b/ggml/src/ggml-cann/ggml-cann.cpp +@@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer( + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + delete ctx; ++ delete buffer; + } + + /** +@@ -1198,6 +1199,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf + */ + static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { + ACL_CHECK(aclrtFreeHost(buffer->context)); ++ delete buffer; + } + + /** +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index ebb2ccae..dfff21a2 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -529,6 +529,7 @@ struct ggml_backend_cuda_buffer_context { + static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + delete ctx; ++ delete buffer; + } + + static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { +@@ -783,6 +784,7 @@ struct ggml_backend_cuda_split_buffer_context { + static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + delete ctx; ++ delete buffer; + } + + static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { +@@ -1055,6 +1057,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ + + static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + CUDA_CHECK(cudaFreeHost(buffer->context)); ++ delete buffer; + } + + static void * ggml_cuda_host_malloc(size_t size) { +diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp +index 50579227..2799a0a5 100644 +--- a/ggml/src/ggml-kompute/ggml-kompute.cpp ++++ b/ggml/src/ggml-kompute/ggml-kompute.cpp +@@ -1911,6 +1911,7 @@ static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer + ggml_vk_free_memory(*memory); + } + delete memory; ++ delete buffer; + } + + static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { +diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m +index c550142a..fd9a4e77 100644 +--- a/ggml/src/ggml-metal/ggml-metal.m ++++ b/ggml/src/ggml-metal/ggml-metal.m +@@ -4350,6 +4350,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) + } + + free(ctx); ++ free(buffer); + } + + static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { +diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp +index f5906246..062e93b8 100644 +--- a/ggml/src/ggml-opencl/ggml-opencl.cpp ++++ b/ggml/src/ggml-opencl/ggml-opencl.cpp +@@ -1203,6 +1203,7 @@ static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; + static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + delete ctx; ++ delete buffer; + } + + static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { +diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp +index 97873acc..893ee0b9 100644 +--- a/ggml/src/ggml-rpc/ggml-rpc.cpp ++++ b/ggml/src/ggml-rpc/ggml-rpc.cpp +@@ -419,6 +419,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) { + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0); + GGML_ASSERT(status); + delete ctx; ++ delete buffer; + } + + static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) { +diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp +index 792e0569..5e233e8b 100644 +--- a/ggml/src/ggml-sycl/ggml-sycl.cpp ++++ b/ggml/src/ggml-sycl/ggml-sycl.cpp +@@ -311,6 +311,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { + ggml_sycl_set_device(ctx->device); + + delete ctx; ++ delete buffer; + } + catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ +@@ -720,6 +721,7 @@ struct ggml_backend_sycl_split_buffer_context { + static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; + delete ctx; ++ delete buffer; + } + + static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { +@@ -1053,6 +1055,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ + + static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_sycl_host_free(buffer->context); ++ delete buffer; + } + + static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +index abe3e790..1dad714b 100644 +--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp ++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +@@ -7914,6 +7914,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; + ggml_vk_destroy_buffer(ctx->dev_buffer); + delete ctx; ++ delete buffer; + } + + static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) { +@@ -8056,6 +8057,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe + static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); + ggml_vk_host_free(vk_instance.devices[0], buffer->context); ++ delete buffer; + } + + static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { diff --git a/llama/patches/0006-conditional-fattn.patch b/llama/patches/0006-conditional-fattn.patch index 63af1f5c4..97af82e20 100644 --- a/llama/patches/0006-conditional-fattn.patch +++ b/llama/patches/0006-conditional-fattn.patch @@ -8,10 +8,10 @@ Subject: [PATCH] conditional-fattn 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index b094929b..36165840 100644 +index dfff21a2..1b0d074b 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2282,9 +2282,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2284,9 +2284,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_ARGSORT: ggml_cuda_op_argsort(ctx, dst); break; diff --git a/llama/patches/0008-add-unpad-operator.patch b/llama/patches/0008-add-unpad-operator.patch index bfa82de2b..31e67a6e3 100644 --- a/llama/patches/0008-add-unpad-operator.patch +++ b/llama/patches/0008-add-unpad-operator.patch @@ -126,10 +126,10 @@ index 72325349..2f606d82 100644 case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 36165840..1adf08fa 100644 +index 1b0d074b..c7a957c8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2198,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2200,6 +2200,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; @@ -139,7 +139,7 @@ index 36165840..1adf08fa 100644 case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; -@@ -3197,6 +3200,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g +@@ -3199,6 +3202,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return ggml_is_contiguous(op->src[0]); case GGML_OP_UPSCALE: case GGML_OP_PAD: diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index 1ca40b2c4..65e150d63 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -541,6 +541,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) free(ctx->buffers); free(ctx); + delete buffer; } static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -1864,6 +1865,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_aligned_free(buffer->context, buffer->size); + delete buffer; +} + +static void ggml_backend_cpu_ptr_buffer_free_buffer(ggml_backend_buffer_t buffer) { + delete buffer; } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -1911,7 +1917,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = { }; static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = { - /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed + /* .free_buffer = */ ggml_backend_cpu_ptr_buffer_free_buffer, // ptr is not owned by the buffer but need to free the buffer itself /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 1adf08fad..c7a957c82 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -784,6 +784,7 @@ struct ggml_backend_cuda_split_buffer_context { static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; delete ctx; + delete buffer; } static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { @@ -1056,6 +1057,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { CUDA_CHECK(cudaFreeHost(buffer->context)); + delete buffer; } static void * ggml_cuda_host_malloc(size_t size) {