From d0b32def60b413407ddf4b4b063ba105a1ef2f92 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 26 Jun 2025 21:49:35 -0700
Subject: [PATCH] skip quantizing per_layer_token_embd (#11207)

This tensor isn't compatible with CUDA when quantized to q4_K, so skip it.
---
 server/quantization.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/server/quantization.go b/server/quantization.go
index e57e8a4da..10175a351 100644
--- a/server/quantization.go
+++ b/server/quantization.go
@@ -231,6 +231,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
 	// do not quantize relative position bias (T5)
 	quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")
 
+	quantize = quantize && !strings.Contains(name, "per_layer_token_embd.weight")
+
 	newType := fsggml.TensorType(t.Kind)
 	if quantize {
 		// get more optimal quantization type based on the tensor shape, layer, etc.
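
Note: the hunk above adds one more name-based exclusion to the quantization filter. As a minimal standalone sketch of that pattern (not ollama's actual newType function or the fsggml API; the pickType helper and plain string type names below are illustrative only):

	package main

	import (
		"fmt"
		"strings"
	)

	// pickType mirrors the pattern in the patch: tensors whose names match a
	// block list are excluded from quantization and keep their original type.
	func pickType(name, origType, quantType string) string {
		quantize := true

		// do not quantize relative position bias (T5)
		quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")

		// per_layer_token_embd is not CUDA-compatible when quantized to q4_K
		quantize = quantize && !strings.Contains(name, "per_layer_token_embd.weight")

		if quantize {
			return quantType
		}
		return origType
	}

	func main() {
		fmt.Println(pickType("blk.0.attn_q.weight", "F16", "Q4_K"))         // Q4_K
		fmt.Println(pickType("per_layer_token_embd.weight", "F16", "Q4_K")) // F16 (skipped)
	}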