mirror of
https://github.com/ollama/ollama.git
synced 2025-03-20 23:02:48 +01:00
156 lines
5.6 KiB
C
Vendored
156 lines
5.6 KiB
C
Vendored
/**
|
|
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
|
|
*
|
|
* MIT License
|
|
*
|
|
* Copyright (c) 2023-2024 The ggml authors
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
* in the Software without restriction, including without limitation the rights
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in all
|
|
* copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*/
|
|
|
|
#define GGML_COMMON_DECL_C
|
|
#include "ggml-common.h"
|
|
|
|
#include "ggml-aarch64.h"
|
|
#include "ggml-impl.h"
|
|
#include "ggml-quants.h"
|
|
#include <assert.h>
|
|
|
|
#define UNUSED GGML_UNUSED
|
|
|
|
// Build one block_q4_0x4 out of the 4 block_q4_0s starting at `in`.
//
// The 4 deltas are stored first (in[k].d -> out.d[k]). The quants are then
// copied in chunks of blck_size_interleave bytes, taking one chunk from each
// source block in round-robin order (chunk c comes from in[c % 4]), for a
// total of QK4_0 * 2 bytes written to out.qs. Every copied chunk is XORed with
// 0x88 in each byte, i.e. the high bit of every 4-bit value is toggled
// (presumably to re-bias the nibbles for the consuming kernels - confirm
// against the matching matmul code).
//
// blck_size_interleave must be 4 or 8; any other value hits GGML_ASSERT(false).
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x4 out;

    int blk = 0;
    while (blk < 4) {
        out.d[blk] = in[blk].d;
        ++blk;
    }

    // number of chunks emitted in total
    const int n_chunks = QK4_0 * 2 / blck_size_interleave;

    switch (blck_size_interleave) {
        case 8: {
            const uint64_t flip = 0x8888888888888888ULL;

            for (int c = 0; c < n_chunks; ++c) {
                const int which   = c % 4;
                const int from_at = (c / 4) * blck_size_interleave;
                const int to_at   = c * blck_size_interleave;

                // go through memcpy so that no unaligned 64-bit access is performed
                uint64_t chunk;
                memcpy(&chunk, &in[which].qs[from_at], sizeof(chunk));
                chunk ^= flip;
                memcpy(&out.qs[to_at], &chunk, sizeof(chunk));
            }
        } break;

        case 4: {
            const uint32_t flip = 0x88888888;

            for (int c = 0; c < n_chunks; ++c) {
                const int which   = c % 4;
                const int from_at = (c / 4) * blck_size_interleave;
                const int to_at   = c * blck_size_interleave;

                // same idea as the 8-byte case, with 32-bit chunks
                uint32_t chunk;
                memcpy(&chunk, &in[which].qs[from_at], sizeof(chunk));
                chunk ^= flip;
                memcpy(&out.qs[to_at], &chunk, sizeof(chunk));
            }
        } break;

        default:
            // unsupported chunk size
            GGML_ASSERT(false);
            break;
    }

    return out;
}
|
|
|
|
// interleave 8 block_q4_0s in blocks of blck_size_interleave
|
|
// returns an interleaved block_q4_0x8
|
|
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
|
|
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
|
|
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
|
|
block_q4_0x8 out;
|
|
|
|
for (int i = 0; i < 8; i++) {
|
|
out.d[i] = in[i].d;
|
|
}
|
|
|
|
const int end = QK4_0 * 4 / blck_size_interleave;
|
|
const uint64_t xor_mask = 0x8888888888888888ULL;
|
|
|
|
for (int i = 0; i < end; ++i) {
|
|
int src_id = i % 8;
|
|
int src_offset = (i / 8) * blck_size_interleave;
|
|
int dst_offset = i * blck_size_interleave;
|
|
|
|
uint64_t elems;
|
|
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
|
elems ^= xor_mask;
|
|
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
|
}
|
|
|
|
return out;
|
|
}
|
|
|
|
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
|
|
assert(n_per_row % QK4_0 == 0);
|
|
const int nb = n_per_row / QK4_0;
|
|
|
|
void * out_ptr = NULL;
|
|
if (nrows_interleaved == 8) {
|
|
out_ptr = (block_q4_0x8 *) dst;
|
|
}
|
|
else if (nrows_interleaved == 4) {
|
|
out_ptr = (block_q4_0x4 *) dst;
|
|
}
|
|
assert(nrows_interleaved <= 8);
|
|
block_q4_0 dst_tmp[8];
|
|
|
|
for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
|
|
|
|
for (int64_t x = 0; x < nb; x++) {
|
|
|
|
for (int i = 0; i < nrows_interleaved; i++ ) {
|
|
quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
|
|
}
|
|
|
|
if (nrows_interleaved == 8) {
|
|
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
|
|
out_ptr = (block_q4_0x8 *) out_ptr + 1;
|
|
}
|
|
else if (nrows_interleaved == 4) {
|
|
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
|
|
out_ptr = (block_q4_0x4 *) out_ptr + 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
|
|
}
|
|
|
|
// Quantize to Q4_0 with 4 rows interleaved in 4-byte chunks.
// quant_weights is accepted but not used.
// Returns whatever quantize_q4_0_nr_bl returns for these settings.
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    const int rows_interleaved = 4;
    const int interleave_bytes = 4;

    UNUSED(quant_weights);

    const size_t result = quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, rows_interleaved, interleave_bytes);
    return result;
}
|
|
|
|
// Quantize to Q4_0 with 4 rows interleaved in 8-byte chunks.
// quant_weights is accepted but not used.
// Returns whatever quantize_q4_0_nr_bl returns for these settings.
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    const int rows_interleaved = 4;
    const int interleave_bytes = 8;

    UNUSED(quant_weights);

    const size_t result = quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, rows_interleaved, interleave_bytes);
    return result;
}
|
|
|
|
// Quantize to Q4_0 with 8 rows interleaved in 8-byte chunks.
// quant_weights is accepted but not used.
// Returns whatever quantize_q4_0_nr_bl returns for these settings.
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    const int rows_interleaved = 8;
    const int interleave_bytes = 8;

    UNUSED(quant_weights);

    const size_t result = quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, rows_interleaved, interleave_bytes);
    return result;
}
|