Mirror of https://github.com/ollama/ollama.git
* feat: Bump llama.cpp to df1b612

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(mtmd): Correctly encode text chunks during mtmd tokenization

  There can be text chunks that appear interspersed with the image
  embeddings that contain template delimiter tokens for some models.
  These need to be correctly translated to text tokens.

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* tests: Use MtmdChunk in image_test

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* style: Fix unnecessary conversion linting

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(ggml): Revert changes to ggml_hip.cpp

  These changes were done largely by our code assistant and are likely wrong.

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Revert changes in mem_nvml.cpp

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update sync point to 1deee0

  This brings in several more optimization commits and model support for
  EmbeddingGemma.

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update patches for 1deee0

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: sync for bump to 1deee0

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Bad patch updates with errant `+`

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp/ggml to 7049736

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: format-patches after latest bump

  Branch: LlamaCPPBump-GraniteDocling
  Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
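The mtmd fix above turns on an ordering detail: multimodal tokenization yields a sequence of chunks, and text chunks carrying template delimiter tokens can appear between image chunks, so dropping or mis-encoding them corrupts the prompt. The following is a minimal C++ sketch of that chunk walk, using hypothetical stand-in types (Chunk, ChunkKind, flattenPrompt) rather than the real MtmdChunk or llama.cpp mtmd APIs:

#include <cstdint>
#include <vector>

// Hypothetical stand-ins for the tokenizer output; the actual change
// exercises this path through Ollama's MtmdChunk wrapper in image_test.
enum class ChunkKind { Text, Image };

struct Chunk {
    ChunkKind kind;
    std::vector<int32_t> tokens;    // populated for Text chunks
    std::vector<float> embedding;   // populated for Image chunks
};

// Build the model input by walking every chunk in order. The bug class
// being fixed is skipping Text chunks that sit *between* image chunks.
std::vector<int32_t> flattenPrompt(const std::vector<Chunk> &chunks,
                                   std::vector<std::vector<float>> &imageEmbeds) {
    std::vector<int32_t> tokens;
    for (const Chunk &c : chunks) {
        if (c.kind == ChunkKind::Text) {
            // Template delimiter tokens live here; they must be kept
            // and translated to text tokens, not dropped.
            tokens.insert(tokens.end(), c.tokens.begin(), c.tokens.end());
        } else {
            imageEmbeds.push_back(c.embedding);
        }
    }
    return tokens;
}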
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Thu, 6 Jun 2024 23:55:47 -0700
Subject: [PATCH] ggml-backend: malloc and free using the same compiler

On Windows, the CUDA backend must be compiled with MSVC but generic
portions compiled with CGo use either GCC or Clang. Since
ggml_backend_buffer_t spans these two components, it can be allocated
and freed using different compilers. Specifically, it is malloced by
MSVC and freed by Clang, which can cause problems.

This moves freeing of the buffers into the backends to avoid the
problem.
---
 ggml/src/ggml-backend.cpp            | 9 +++++++--
 ggml/src/ggml-cann/ggml-cann.cpp     | 2 ++
 ggml/src/ggml-cuda/ggml-cuda.cu      | 3 +++
 ggml/src/ggml-metal/ggml-metal.cpp   | 2 ++
 ggml/src/ggml-opencl/ggml-opencl.cpp | 1 +
 ggml/src/ggml-rpc/ggml-rpc.cpp       | 1 +
 ggml/src/ggml-sycl/ggml-sycl.cpp     | 3 +++
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++
 8 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index ff9135fe..8ba86f82 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -113,7 +113,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
-    delete buffer;
 }
 
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
@@ -586,6 +585,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
 
     free(ctx->buffers);
     free(ctx);
+    delete buffer;
 }
 
 static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -2075,6 +2075,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     GGML_ASSERT(buffer);
     ggml_aligned_free(buffer->context, buffer->size);
+    delete buffer;
+}
+
+static void ggml_backend_cpu_ptr_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    delete buffer;
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -2127,7 +2132,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
 };
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
-    /* .free_buffer   = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .free_buffer   = */ ggml_backend_cpu_ptr_buffer_free_buffer, // ptr is not owned by the buffer but need to free the buffer itself
     /* .get_base      = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor   = */ NULL, // no initialization required
     /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index ad1adba6..7d44f74f 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -843,6 +843,7 @@ static void ggml_backend_cann_buffer_free_buffer(
     ggml_backend_cann_buffer_context* ctx =
         (ggml_backend_cann_buffer_context*)buffer->context;
     delete ctx;
+    delete buffer;
 }
 
 /**
@@ -1630,6 +1631,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
  */
 static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
     ACL_CHECK(aclrtFreeHost(buffer->context));
+    delete buffer;
 }
 
 /**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 856e9de2..c0b1e4c1 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -567,6 +567,7 @@ struct ggml_backend_cuda_buffer_context {
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     delete ctx;
+    delete buffer;
 }
 
 static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -822,6 +823,7 @@ struct ggml_backend_cuda_split_buffer_context {
 static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
+    delete buffer;
 }
 
 static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1103,6 +1105,7 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
 
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     CUDA_CHECK(cudaFreeHost(buffer->context));
+    delete buffer;
 }
 
 static void * ggml_cuda_host_malloc(size_t size) {
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 7afc881f..bf096227 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -25,6 +25,7 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
     GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
 
     ggml_metal_buffer_free(ctx);
+    delete buffer;
 }
 
 static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) {
@@ -99,6 +100,7 @@ static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t
     GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
 
     ggml_metal_buffer_free(ctx);
+    delete buffer;
 }
 
 static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 79d21487..38c75018 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3212,6 +3212,7 @@ struct ggml_backend_opencl_buffer_context {
 static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
     delete ctx;
+    delete buffer;
 }
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index aad48d62..a46c0f52 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -528,6 +528,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
     RPC_STATUS_ASSERT(status);
     delete ctx;
+    delete buffer;
 }
 
 static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 45b8c216..4ec9a592 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -334,6 +334,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_sycl_set_device(ctx->device);
 
     delete ctx;
+    delete buffer;
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -795,6 +796,7 @@ struct ggml_backend_sycl_split_buffer_context {
 static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
+    delete buffer;
 }
 
 static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1137,6 +1139,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
 
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
+    delete buffer;
 }
 
 static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 3cd89c71..ed83236f 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -11600,6 +11600,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     ggml_vk_destroy_buffer(ctx->dev_buffer);
     delete ctx;
+    delete buffer;
 }
 
 static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -11743,6 +11744,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
 static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
     ggml_vk_host_free(vk_instance.devices[0], buffer->context);
+    delete buffer;
 }
 
 static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
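Every hunk above applies the same two-line ownership rule: ggml_backend_buffer_free stops calling `delete buffer`, and each backend's free_buffer callback deletes the buffer it allocated, so `new` and `delete` always run in the same translation unit and therefore under the same compiler and C++ runtime. A minimal sketch of that pattern, with simplified stand-in types rather than the real ggml structs:

#include <cstdio>

struct Buffer;

struct BufferIface {
    // Implemented by each backend, compiled with the backend's compiler.
    void (*free_buffer)(Buffer *buffer);
};

struct Buffer {
    BufferIface iface;
    void *context;
};

// Generic code (in ggml, built by CGo's gcc/clang): it only dispatches
// to the backend; it no longer deletes the buffer itself.
void buffer_free(Buffer *buffer) {
    if (buffer && buffer->iface.free_buffer) {
        buffer->iface.free_buffer(buffer);
    }
}

// Backend code (e.g. built by MSVC for CUDA on Windows): the delete
// runs on the same heap, compiler, and CRT that performed the new.
static void backend_free_buffer(Buffer *buffer) {
    std::printf("backend releasing its resources\n");
    delete buffer;
}

Buffer *backend_alloc_buffer() {
    return new Buffer{{backend_free_buffer}, nullptr};
}

int main() {
    Buffer *b = backend_alloc_buffer();
    buffer_free(b);  // generic entry point; deletion happens in the backend
}

The same reasoning explains the one subtle case in the CPU backend: even when the wrapped pointer is not owned (ggml_backend_cpu_buffer_from_ptr_i), the buffer struct itself still was heap-allocated and must be deleted, hence the new ggml_backend_cpu_ptr_buffer_free_buffer rather than leaving free_buffer as NULL.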