llama.cpp bump (df1b612): Granite Docling / mamba2 optimizations / multimodal encoding fixes (#12552)

* feat: Bump llama.cpp to df1b612

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(mtmd): Correctly encode text chunks during mtmd tokenization

For some models, text chunks containing template delimiter tokens can appear
interspersed with the image embeddings. These chunks need to be correctly
translated to text tokens.
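
Roughly, the chunk walk this fix implies looks like the sketch below, written
against llama.cpp's mtmd C API; `encode_image_chunk` is a hypothetical
placeholder for the image path, and error handling is elided:

```cpp
#include <vector>
#include "mtmd.h"

// Hypothetical hook for the non-text (image) chunks.
void encode_image_chunk(const mtmd_input_chunk * chunk);

// Walk the chunks produced by mtmd_tokenize: text chunks (which can carry
// template delimiter tokens between image slots) are flattened into plain
// text tokens; image chunks are handed to the vision encoder.
std::vector<llama_token> flatten_chunks(const mtmd_input_chunks * chunks) {
    std::vector<llama_token> tokens;
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            size_t n_tokens = 0;
            const llama_token * toks = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
            tokens.insert(tokens.end(), toks, toks + n_tokens);
        } else {
            encode_image_chunk(chunk);
        }
    }
    return tokens;
}
```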

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* tests: Use MtmdChunk in image_test

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* style: Fix unnecessary-conversion lint warnings

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(ggml): Revert changes to ggml_hip.cpp

These changes were made largely by our code assistant and are likely wrong.

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Revert changes in mem_nvml.cpp

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update sync point to 1deee0

This brings in several more optimization commits, as well as model support
for EmbeddingGemma.

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Update patches for 1deee0

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Sync for bump to 1deee0

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Bad patch updates with errant `+`

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp/ggml to 7049736

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: format-patches after latest bump

Branch: LlamaCPPBump-GraniteDocling

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Author: Gabe Goodhart
Date: 2025-10-13 16:26:18 -06:00
Committed by: GitHub
Parent: e638f2acb6
Commit: 4987f13d34

99 changed files with 4580 additions and 2323 deletions


@@ -10,12 +10,12 @@ unused then it can be reset to free these data structures.
  ggml/include/ggml-backend.h      |  1 +
  ggml/src/ggml-backend-impl.h     |  4 ++++
  ggml/src/ggml-backend.cpp        |  8 ++++++++
- ggml/src/ggml-cuda/ggml-cuda.cu  | 17 +++++++++++++++--
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 16 +++++++++++++++-
  ggml/src/ggml-cuda/vendors/hip.h |  1 +
- 5 files changed, 29 insertions(+), 2 deletions(-)
+ 5 files changed, 29 insertions(+), 1 deletion(-)
 
 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index d4352663..0a2dae26 100644
+index 1ff53ed0..ba181d09 100644
 --- a/ggml/include/ggml-backend.h
 +++ b/ggml/include/ggml-backend.h
 @@ -178,6 +178,7 @@ extern "C" {
@@ -27,7 +27,7 @@ index d4352663..0a2dae26 100644
      GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
      GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
 diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
-index 869dc07d..4889df79 100644
+index 3c3f22fc..43c91d9f 100644
 --- a/ggml/src/ggml-backend-impl.h
 +++ b/ggml/src/ggml-backend-impl.h
 @@ -195,6 +195,10 @@ extern "C" {
@@ -61,7 +61,7 @@ index 6ef5eeaf..0b757af5 100644
      GGML_ASSERT(device);
      return device->iface.get_buffer_type(device);
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index d324bc68..531d6e27 100644
+index 811462c7..87c6c34a 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -107,6 +107,11 @@ int ggml_cuda_get_device() {
@@ -76,7 +76,7 @@ index d324bc68..531d6e27 100644
  static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
      ggml_cuda_set_device(device);
      cudaError_t err;
-@@ -3512,7 +3517,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
+@@ -3515,7 +3520,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
      props->id = ggml_backend_cuda_device_get_id(dev);
      props->type = ggml_backend_cuda_device_get_type(dev);
      props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
@@ -88,7 +88,7 @@ index d324bc68..531d6e27 100644
 
      bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
  #ifdef GGML_CUDA_NO_PEER_COPY
-@@ -3945,6 +3953,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
+@@ -3948,6 +3956,11 @@ static void ggml_backend_cuda_device_event_synchronize(ggml_backend_dev_t dev, g
      CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
  }
 
@@ -100,7 +100,7 @@ index d324bc68..531d6e27 100644
  static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
      /* .get_name = */ ggml_backend_cuda_device_get_name,
      /* .get_description = */ ggml_backend_cuda_device_get_description,
-@@ -3961,6 +3974,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
+@@ -3964,6 +3977,7 @@ static const ggml_backend_device_i ggml_backend_cuda_device_interface = {
      /* .event_new = */ ggml_backend_cuda_device_event_new,
      /* .event_free = */ ggml_backend_cuda_device_event_free,
      /* .event_synchronize = */ ggml_backend_cuda_device_event_synchronize,
@@ -108,19 +108,11 @@ index d324bc68..531d6e27 100644
  };
 
  // backend reg
-@@ -4076,7 +4090,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
-             dev_ctx->device = i;
-             dev_ctx->name = GGML_CUDA_NAME + std::to_string(i);
-
--            ggml_cuda_set_device(i);
-             cudaDeviceProp prop;
-             CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
-             dev_ctx->description = prop.name;
 diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
-index 37386afc..06f9e7c1 100644
+index 890c1036..1f06be80 100644
 --- a/ggml/src/ggml-cuda/vendors/hip.h
 +++ b/ggml/src/ggml-cuda/vendors/hip.h
-@@ -41,6 +41,7 @@
+@@ -45,6 +45,7 @@
  #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
  #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
  #define cudaDeviceProp hipDeviceProp_t
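
The patch above follows ggml's usual optional-vtable pattern: the device
interface gains a function pointer, and a public wrapper dispatches through
it, treating a missing hook as a no-op. A minimal standalone sketch of that
pattern follows; names like `reset` and `ggml_backend_dev_reset` are
assumptions here, and the structs are trimmed stand-ins for the real ones in
ggml-backend-impl.h:

```cpp
// Trimmed stand-ins for ggml's real backend-device types, only to show
// the dispatch pattern used by the patch above.
typedef struct ggml_backend_device * ggml_backend_dev_t;

struct ggml_backend_device_i {
    // ... get_name, get_props, event_synchronize, etc. ...
    // Optional: release cached per-device state so an unused device
    // holds no data structures (assumed name).
    void (*reset)(ggml_backend_dev_t dev);
};

struct ggml_backend_device {
    struct ggml_backend_device_i iface;
    void * context;
};

// Public entry point (assumed name): a backend that does not implement
// reset is treated as a no-op, as with other optional interface members.
void ggml_backend_dev_reset(ggml_backend_dev_t device) {
    if (device == nullptr || device->iface.reset == nullptr) {
        return;
    }
    device->iface.reset(device);
}
```

The `+` line added after `/* .event_synchronize = */` in the hunk above
appears to be exactly that kind of table registration for the CUDA device.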