From fbe6ae285a23baddb14c5bbce26d4fcb837503e4 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Thu, 22 May 2025 10:48:08 -0700 Subject: [PATCH] server: improve tensor quantization fallback logic (#10806) Fall back to an alternative quantization type when a tensor's first dimension isn't divisible by the block size required for the originally requested quantization type. If no alternative applies, or the alternative type's block size doesn't divide that dimension either, the system ultimately falls back to F16 (half-precision floating point), which has a block size of 1 and can handle any tensor dimension. --- server/quantization.go | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/server/quantization.go b/server/quantization.go index adfc948ecf..e57e8a4dad 100644 --- a/server/quantization.go +++ b/server/quantization.go @@ -120,14 +120,30 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType if newType.IsQuantized() { nx := shape[0] - ny := uint64(1) - if len(shape) > 1 { - ny = shape[1] - } qk_k := newType.BlockSize() + + // Check if first dimension is divisible by block size if nx%qk_k != 0 { - slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String())) - newType = fsggml.TensorTypeF16 + // Store the original type for logging + originalType := newType + + // Select appropriate fallback based on original type + switch newType { + case fsggml.TensorTypeQ4_K: + newType = fsggml.TensorTypeQ5_0 + case fsggml.TensorTypeQ5_K: + newType = fsggml.TensorTypeQ5_1 + case fsggml.TensorTypeQ6_K: + newType = fsggml.TensorTypeQ8_0 + } + + // Final check - if still incompatible, fall back to F16 + if nx%newType.BlockSize() != 0 { + newType = fsggml.TensorTypeF16 + } + + slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s", + nx, qk_k, originalType.String(), newType.String())) } } return newType