mirror of https://github.com/ollama/ollama.git
server: improve tensor quantization fallback logic (#10806)
Fall back to alternative quantization types when a tensor's dimensions aren't divisible by the block size required by the originally requested quantization type. If the fallback type is also incompatible, the system ultimately falls back to F16 (half-precision floating point), which has a block size of 1 and can therefore handle any tensor dimension.
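For context: GGML quantized formats pack weights into fixed-size blocks along each row, so a row of nx values only quantizes cleanly when nx splits into whole blocks. Below is a minimal sketch of that constraint, assuming the standard GGML block sizes (256-element super-blocks for the K-quants, block size 1 for F16); the `canQuantize` helper is hypothetical, not ollama's API:

```go
package main

import "fmt"

// canQuantize reports whether a row of nx weights splits into whole
// blocks of the given size. Hypothetical helper for illustration.
func canQuantize(nx, blockSize uint64) bool {
	return nx%blockSize == 0
}

func main() {
	// Q4_K packs weights into 256-element super-blocks, so a
	// 1000-column tensor cannot be quantized to Q4_K directly.
	fmt.Println(canQuantize(4096, 256)) // true
	fmt.Println(canQuantize(1000, 256)) // false -> a fallback type is needed
	fmt.Println(canQuantize(1000, 1))   // true: F16's block size of 1 always divides
}
```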
```diff
@@ -120,14 +120,30 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 	if newType.IsQuantized() {
 		nx := shape[0]
 		ny := uint64(1)
 		if len(shape) > 1 {
 			ny = shape[1]
 		}
 		qk_k := newType.BlockSize()
+
+		// Check if first dimension is divisible by block size
 		if nx%qk_k != 0 {
-			slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s. Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String()))
-			newType = fsggml.TensorTypeF16
+			// Store the original type for logging
+			originalType := newType
+
+			// Select appropriate fallback based on original type
+			switch newType {
+			case fsggml.TensorTypeQ4_K:
+				newType = fsggml.TensorTypeQ5_0
+			case fsggml.TensorTypeQ5_K:
+				newType = fsggml.TensorTypeQ5_1
+			case fsggml.TensorTypeQ6_K:
+				newType = fsggml.TensorTypeQ8_0
+			}
+
+			// Final check - if still incompatible, fall back to F16
+			if nx%newType.BlockSize() != 0 {
+				newType = fsggml.TensorTypeF16
+			}
+
+			slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s",
+				nx, qk_k, originalType.String(), newType.String()))
 		}
 	}
 	return newType
```
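The fallback pairs are chosen so that a K-quant (256-element super-blocks) degrades to a legacy quant with 32-element blocks at a similar bit width, which accepts far more shapes before F16 becomes necessary. Here is a self-contained sketch of that selection logic under the standard GGML block sizes; the string type names and the `fallbackType` function are illustrative stand-ins, not ollama's fsggml API:

```go
package main

import "fmt"

// Illustrative block sizes following the usual GGML layout:
// 256-element super-blocks for K-quants, 32 for legacy quants,
// 1 for F16. Stand-in names, not ollama's fsggml types.
var blockSize = map[string]uint64{
	"Q4_K": 256, "Q5_K": 256, "Q6_K": 256,
	"Q5_0": 32, "Q5_1": 32, "Q8_0": 32,
	"F16": 1,
}

// fallbackType mirrors the commit's logic: keep the requested type
// if its block size divides nx, otherwise try its designated
// smaller-block fallback, otherwise return F16, whose block size
// of 1 divides any column count.
func fallbackType(requested string, nx uint64) string {
	if nx%blockSize[requested] == 0 {
		return requested
	}
	fallbacks := map[string]string{
		"Q4_K": "Q5_0",
		"Q5_K": "Q5_1",
		"Q6_K": "Q8_0",
	}
	if next, ok := fallbacks[requested]; ok && nx%blockSize[next] == 0 {
		return next
	}
	return "F16"
}

func main() {
	fmt.Println(fallbackType("Q4_K", 4096)) // Q4_K: 4096 is a multiple of 256
	fmt.Println(fallbackType("Q4_K", 1056)) // Q5_0: 1056 = 33*32, but not a multiple of 256
	fmt.Println(fallbackType("Q6_K", 1000)) // F16: 1000 is divisible by neither 256 nor 32
}
```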