wip: next ollama runner
implement llama and mllama model architectures in go using ggml (through cgo)
This commit is contained in:
parent 8d15a7a964
commit 2c5fb24855
9 .gitattributes vendored
@@ -7,5 +7,14 @@ llama/**/*.cuh linguist-vendored
llama/**/*.m linguist-vendored
llama/**/*.metal linguist-vendored

ml/backend/**/*.c linguist-vendored
ml/backend/**/*.h linguist-vendored
ml/backend/**/*.cpp linguist-vendored
ml/backend/**/*.hpp linguist-vendored
ml/backend/**/*.cu linguist-vendored
ml/backend/**/*.cuh linguist-vendored
ml/backend/**/*.m linguist-vendored
ml/backend/**/*.metal linguist-vendored

* text=auto
*.go text eol=lf
63 cache/cache.go vendored Normal file
@@ -0,0 +1,63 @@
package cache

import (
    "github.com/ollama/ollama/ml"
)

type Options struct {
    Position int
}

type Cache interface {
    Sub(i int) Cache
    Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
}

type Simple struct {
    DType    ml.DType
    Capacity int

    keys, values []ml.Tensor
}

func (c *Simple) Sub(i int) Cache {
    if i >= len(c.keys) {
        c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
        c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
    }

    return &Simple{
        keys:     c.keys[i : i+1],
        values:   c.values[i : i+1],
        Capacity: c.Capacity,
        DType:    c.DType,
    }
}

func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
    if c.keys[0] == nil || c.values[0] == nil {
        c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
        c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
    }

    ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
    ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))

    n := min(c.Capacity, int(key.Dim(2))+opts.Position)

    key = c.keys[0].View(ctx, 0,
        int(key.Dim(0)), int(key.Stride(1)),
        int(key.Dim(1)), int(key.Stride(2)),
        n,
    )

    value = c.values[0].View(ctx, 0,
        int(value.Dim(0)), int(value.Stride(1)),
        int(value.Dim(1)), int(value.Stride(2)),
        n,
    )

    // TODO shift context if necessary

    return key, value
}
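The Simple cache keeps one key tensor and one value tensor per layer: Sub(i) narrows the cache to layer i, and Put writes the current step's keys and values at opts.Position, returning views over everything cached so far. A minimal sketch of how a caller might wire this into an attention layer follows; the package name, helper name, and arguments are illustrative and not part of this commit.

package model

import (
    "github.com/ollama/ollama/cache"
    "github.com/ollama/ollama/ml"
)

// attendWithCache is a sketch: scope the cache to one layer, store this
// step's key/value tensors at position pos, and get back views over the
// full history for attention.
func attendWithCache(ctx ml.Context, c cache.Cache, layer int, key, value ml.Tensor, pos int) (ml.Tensor, ml.Tensor) {
    return c.Sub(layer).Put(ctx, key, value, cache.Options{Position: pos})
}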
305 fs/ggml/ggml.go Normal file
@@ -0,0 +1,305 @@
package ggml

import (
    "cmp"
    "encoding/binary"
    "errors"
    "fmt"
    "io"
    "log/slog"
    "strings"

    "github.com/ollama/ollama/fs/util/bufioutil"
)

type GGML struct {
    container
    model
}

type model interface {
    KV() KV
    Tensors() Tensors
}

type KV map[string]any

func (kv KV) Architecture() string {
    return cmp.Or(kv.String("general.architecture"), "unknown")
}

func (kv KV) FileType() fileType {
    if t := kv.Uint("general.file_type"); t > 0 {
        return fileType(t)
    }

    return fileTypeUnknown
}

func (kv KV) String(key string, defaultValue ...string) string {
    return keyValue(kv, key, append(defaultValue, "")...)
}

func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
    return keyValue(kv, key, append(defaultValue, 0)...)
}

func (kv KV) Float(key string, defaultValue ...float32) float32 {
    return keyValue(kv, key, append(defaultValue, 0)...)
}

func (kv KV) Strings(key string, defaultValue ...[]string) []string {
    r := keyValue(kv, key, &array{})
    s := make([]string, r.size)
    for i := range r.size {
        s[i] = r.values[i].(string)
    }

    return s
}

func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
    r := keyValue(kv, key, &array{})
    s := make([]uint32, r.size)
    for i := range r.size {
        s[i] = uint32(r.values[i].(int32))
    }

    return s
}

func keyValue[T string | uint32 | float32 | *array](kv KV, key string, defaultValue ...T) T {
    if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
        key = kv.Architecture() + "." + key
    }

    if val, ok := kv[key]; ok {
        return val.(T)
    }

    slog.Warn("key not found", "key", key, "default", defaultValue[0])
    return defaultValue[0]
}

type Tensors struct {
    Items  []*Tensor
    Offset uint64
}

func (ts Tensors) Layers() map[string]Layer {
    layers := make(map[string]Layer)
    for _, t := range ts.Items {
        parts := strings.Split(t.Name, ".")
        if parts[0] == "blk" {
            // join first and second part, e.g. blk.%d
            parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
        }

        if _, ok := layers[parts[0]]; !ok {
            layers[parts[0]] = make(Layer)
        }

        layers[parts[0]][strings.Join(parts[1:], ".")] = t
    }

    return layers
}

type Layer map[string]*Tensor

func (l Layer) size() (size uint64) {
    for _, t := range l {
        size += t.Size()
    }

    return size
}

type Tensor struct {
    Name   string `json:"name"`
    Kind   uint32 `json:"kind"`
    Offset uint64 `json:"-"`

    // Shape is the number of elements in each dimension
    Shape []uint64 `json:"shape"`

    io.WriterTo `json:"-"`
}

func (t Tensor) block() (n int) {
    if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
        return -1
    }

    return
}

func (t Tensor) blockSize() uint64 {
    switch t.Kind {
    case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16
        return 1
    case 2, 3, 4, 5, 6, 7, 8, 9, 20: // Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, IQ4_NL
        return 32
    default: // All others
        return 256
    }
}

func (t Tensor) typeSize() uint64 {
    blockSize := t.blockSize()

    switch t.Kind {
    case 0: // FP32
        return 4
    case 1: // FP16
        return 2
    case 2: // Q4_0
        return 2 + blockSize/2
    case 3: // Q4_1
        return 2 + 2 + blockSize/2
    case 6: // Q5_0
        return 2 + 4 + blockSize/2
    case 7: // Q5_1
        return 2 + 2 + 4 + blockSize/2
    case 8: // Q8_0
        return 2 + blockSize
    case 9: // Q8_1
        return 4 + 4 + blockSize
    case 10: // Q2_K
        return blockSize/16 + blockSize/4 + 2 + 2
    case 11: // Q3_K
        return blockSize/8 + blockSize/4 + 12 + 2
    case 12: // Q4_K
        return 2 + 2 + 12 + blockSize/2
    case 13: // Q5_K
        return 2 + 2 + 12 + blockSize/8 + blockSize/2
    case 14: // Q6_K
        return blockSize/2 + blockSize/4 + blockSize/16 + 2
    case 15: // Q8_K
        return 2 + blockSize + 2*blockSize/16
    case 16: // IQ2_XXS
        return 2 + 2*blockSize/8
    case 17: // IQ2_XS
        return 2 + 2*blockSize/8 + blockSize/32
    case 18: // IQ3_XXS
        return 2 + blockSize/4 + blockSize/8
    case 19: // IQ1_S
        return 2 + blockSize/8 + blockSize/16
    case 20: // IQ4_NL
        return 2 + blockSize/2
    case 21: // IQ3_S
        return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
    case 22: // IQ2_S
        return 2 + blockSize/4 + blockSize/16
    case 23: // IQ4_XS
        return 2 + 2 + blockSize/2 + blockSize/64
    case 24: // I8
        return 1
    case 25: // I16
        return 2
    case 26: // I32
        return 4
    case 27: // I64
        return 8
    case 28: // F64
        return 8
    case 29: // IQ1_M
        return blockSize/8 + blockSize/16 + blockSize/32
    default:
        return 0
    }
}

func (t Tensor) parameters() uint64 {
    var count uint64 = 1
    for _, n := range t.Shape {
        count *= n
    }
    return count
}

func (t Tensor) Size() uint64 {
    return t.parameters() * t.typeSize() / t.blockSize()
}

type container interface {
    Name() string
    Decode(io.ReadSeeker) (model, error)
}

const (
    // Magic constant for `ggml` files (unversioned).
    FILE_MAGIC_GGML = 0x67676d6c
    // Magic constant for `ggml` files (versioned, ggmf).
    FILE_MAGIC_GGMF = 0x67676d66
    // Magic constant for `ggml` files (versioned, ggjt).
    FILE_MAGIC_GGJT = 0x67676a74
    // Magic constant for `ggla` files (LoRA adapter).
    FILE_MAGIC_GGLA = 0x67676C61
    // Magic constant for `gguf` files (versioned, gguf)
    FILE_MAGIC_GGUF_LE = 0x46554747
    FILE_MAGIC_GGUF_BE = 0x47475546
)

var ErrUnsupportedFormat = errors.New("unsupported model format")

func DetectGGMLType(b []byte) string {
    switch binary.LittleEndian.Uint32(b[:4]) {
    case FILE_MAGIC_GGML:
        return "ggml"
    case FILE_MAGIC_GGMF:
        return "ggmf"
    case FILE_MAGIC_GGJT:
        return "ggjt"
    case FILE_MAGIC_GGLA:
        return "ggla"
    case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
        return "gguf"
    default:
        return ""
    }
}

// DecodeGGML decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
    if maxArraySize == 0 {
        maxArraySize = 1024
    }

    rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

    var magic uint32
    if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
        return nil, 0, err
    }

    var c container
    switch magic {
    case FILE_MAGIC_GGUF_LE:
        c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
    case FILE_MAGIC_GGUF_BE:
        c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
    default:
        return nil, 0, errors.New("invalid file magic")
    }

    model, err := c.Decode(rs)
    if err != nil {
        return nil, 0, err
    }

    offset, err := rs.Seek(0, io.SeekCurrent)
    if err != nil {
        return nil, 0, err
    }

    // final model type
    return &GGML{
        container: c,
        model:     model,
    }, offset, nil
}
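DecodeGGML checks the file magic, picks a GGUF container decoder, and returns the parsed metadata along with the offset where tensor data begins. A small usage sketch, assuming a GGUF file exists at an illustrative path:

package main

import (
    "fmt"
    "log"
    "os"

    "github.com/ollama/ollama/fs/ggml"
)

func main() {
    f, err := os.Open("model.gguf") // illustrative path
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    // maxArraySize 0 uses the default of 1024; pass -1 to collect all arrays.
    g, tensorOffset, err := ggml.DecodeGGML(f, 0)
    if err != nil {
        log.Fatal(err)
    }

    fmt.Println("architecture:", g.KV().Architecture(), "file type:", g.KV().FileType())
    fmt.Println("tensor data starts at byte", tensorOffset, "with", len(g.Tensors().Items), "tensors")
}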
661 fs/ggml/gguf.go Normal file
@@ -0,0 +1,661 @@
package ggml

import (
    "bytes"
    "cmp"
    "encoding/binary"
    "encoding/json"
    "fmt"
    "io"
    "log/slog"
    "maps"
    "slices"
    "strings"
)

type containerGGUF struct {
    ByteOrder binary.ByteOrder

    Version uint32

    V1 struct {
        NumTensor uint32
        NumKV     uint32
    }

    V2 struct {
        NumTensor uint64
        NumKV     uint64
    }

    V3 struct {
        NumTensor uint64
        NumKV     uint64
    }

    maxArraySize int
}

func (c *containerGGUF) canCollectArray(size int) bool {
    return c.maxArraySize < 0 || size <= c.maxArraySize
}

func (c *containerGGUF) Name() string {
    return "gguf"
}

func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
    if err := binary.Read(rs, c.ByteOrder, &c.Version); err != nil {
        return nil, err
    }

    var err error
    switch c.Version {
    case 1:
        err = binary.Read(rs, c.ByteOrder, &c.V1)
    case 2:
        err = binary.Read(rs, c.ByteOrder, &c.V2)
    default:
        err = binary.Read(rs, c.ByteOrder, &c.V3)
    }
    if err != nil {
        return nil, err
    }

    model := newGGUF(c)
    if err := model.Decode(rs); err != nil {
        return nil, err
    }

    return model, nil
}

const (
    ggufTypeUint8 uint32 = iota
    ggufTypeInt8
    ggufTypeUint16
    ggufTypeInt16
    ggufTypeUint32
    ggufTypeInt32
    ggufTypeFloat32
    ggufTypeBool
    ggufTypeString
    ggufTypeArray
    ggufTypeUint64
    ggufTypeInt64
    ggufTypeFloat64
)

type gguf struct {
    *containerGGUF

    kv      KV
    tensors []*Tensor

    parameters   uint64
    tensorOffset uint64

    scratch [16 << 10]byte
}

func newGGUF(container *containerGGUF) *gguf {
    return &gguf{
        containerGGUF: container,
        kv:            make(KV),
    }
}

func (llm *gguf) KV() KV {
    return llm.kv
}

func (llm *gguf) Tensors() Tensors {
    return Tensors{
        Items:  llm.tensors,
        Offset: llm.tensorOffset,
    }
}

func (llm *gguf) numTensor() uint64 {
    switch llm.Version {
    case 1:
        return uint64(llm.V1.NumTensor)
    case 2:
        return llm.V2.NumTensor
    default:
        return llm.V3.NumTensor
    }
}

func (llm *gguf) numKV() uint64 {
    switch llm.Version {
    case 1:
        return uint64(llm.V1.NumKV)
    case 2:
        return llm.V2.NumKV
    default:
        return llm.V3.NumKV
    }
}

func (llm *gguf) Decode(rs io.ReadSeeker) error {
    // decode key-values
    for i := 0; uint64(i) < llm.numKV(); i++ {
        k, err := readGGUFString(llm, rs)
        if err != nil {
            return err
        }

        t, err := readGGUF[uint32](llm, rs)
        if err != nil {
            return err
        }

        var v any
        switch t {
        case ggufTypeUint8:
            v, err = readGGUF[uint8](llm, rs)
        case ggufTypeInt8:
            v, err = readGGUF[int8](llm, rs)
        case ggufTypeUint16:
            v, err = readGGUF[uint16](llm, rs)
        case ggufTypeInt16:
            v, err = readGGUF[int16](llm, rs)
        case ggufTypeUint32:
            v, err = readGGUF[uint32](llm, rs)
        case ggufTypeInt32:
            v, err = readGGUF[int32](llm, rs)
        case ggufTypeUint64:
            v, err = readGGUF[uint64](llm, rs)
        case ggufTypeInt64:
            v, err = readGGUF[int64](llm, rs)
        case ggufTypeFloat32:
            v, err = readGGUF[float32](llm, rs)
        case ggufTypeFloat64:
            v, err = readGGUF[float64](llm, rs)
        case ggufTypeBool:
            v, err = readGGUF[bool](llm, rs)
        case ggufTypeString:
            v, err = readGGUFString(llm, rs)
        case ggufTypeArray:
            v, err = readGGUFArray(llm, rs)
        default:
            return fmt.Errorf("invalid type: %d", t)
        }

        if err != nil {
            return err
        }

        llm.kv[k] = v
    }

    // decode tensors
    for range llm.numTensor() {
        name, err := readGGUFString(llm, rs)
        if err != nil {
            return fmt.Errorf("failed to read tensor name: %w", err)
        }

        // dims is the number of dimensions in the tensor
        dims, err := readGGUF[uint32](llm, rs)
        if err != nil {
            return fmt.Errorf("failed to read tensor dimensions: %w", err)
        }

        shape := make([]uint64, dims)
        for i := 0; uint32(i) < dims; i++ {
            shape[i], err = readGGUF[uint64](llm, rs)
            if err != nil {
                return fmt.Errorf("failed to read tensor shape: %w", err)
            }
        }

        kind, err := readGGUF[uint32](llm, rs)
        if err != nil {
            return fmt.Errorf("failed to read tensor kind: %w", err)
        }

        offset, err := readGGUF[uint64](llm, rs)
        if err != nil {
            return fmt.Errorf("failed to read tensor offset: %w", err)
        }

        tensor := Tensor{
            Name:   name,
            Kind:   kind,
            Offset: offset,
            Shape:  shape[:],
        }

        llm.tensors = append(llm.tensors, &tensor)
        llm.parameters += tensor.parameters()
    }

    // patch KV with parameter count
    llm.kv["general.parameter_count"] = llm.parameters

    alignment, ok := llm.kv["general.alignment"].(uint32)
    if !ok {
        alignment = 32
    }

    offset, err := rs.Seek(0, io.SeekCurrent)
    if err != nil {
        return err
    }

    padding := ggufPadding(offset, int64(alignment))
    llm.tensorOffset = uint64(offset + padding)

    for _, tensor := range llm.tensors {
        offset, err := rs.Seek(0, io.SeekCurrent)
        if err != nil {
            return fmt.Errorf("failed to get current offset: %w", err)
        }

        padding := ggufPadding(offset, int64(alignment))
        if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
            return fmt.Errorf("failed to seek to init padding: %w", err)
        }

        if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
            return fmt.Errorf("failed to seek to tensor: %w", err)
        }
    }

    return nil
}

func readGGUF[T any](llm *gguf, r io.Reader) (T, error) {
    var t T
    err := binary.Read(r, llm.ByteOrder, &t)
    return t, err
}

func writeGGUF[V any](w io.Writer, t uint32, v V) error {
    if err := binary.Write(w, binary.LittleEndian, t); err != nil {
        return err
    }

    return binary.Write(w, binary.LittleEndian, v)
}

func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
    var length uint64
    if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
        return "", err
    }

    var b bytes.Buffer
    if _, err := io.CopyN(&b, r, int64(length)); err != nil {
        return "", err
    }

    // gguf v1 strings are null-terminated
    b.Truncate(b.Len() - 1)

    return b.String(), nil
}

func discardGGUFString(llm *gguf, r io.Reader) error {
    buf := llm.scratch[:8]
    _, err := io.ReadFull(r, buf)
    if err != nil {
        return err
    }

    size := int(llm.ByteOrder.Uint64(buf))
    for size > 0 {
        n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
        if err != nil {
            return err
        }
        size -= n
    }
    return nil
}

func readGGUFString(llm *gguf, r io.Reader) (string, error) {
    if llm.Version == 1 {
        return readGGUFV1String(llm, r)
    }

    buf := llm.scratch[:8]
    _, err := io.ReadFull(r, buf)
    if err != nil {
        return "", err
    }

    length := int(llm.ByteOrder.Uint64(buf))
    if length > len(llm.scratch) {
        buf = make([]byte, length)
    } else {
        buf = llm.scratch[:length]
    }
    clear(buf)

    _, err = io.ReadFull(r, buf)
    if err != nil {
        return "", err
    }
    return string(buf), nil
}

func writeGGUFString(w io.Writer, s string) error {
    if err := binary.Write(w, binary.LittleEndian, ggufTypeString); err != nil {
        return err
    }

    if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil {
        return err
    }

    _, err := io.Copy(w, strings.NewReader(s))
    return err
}

type array struct {
    size   int
    values []any
}

func (a *array) MarshalJSON() ([]byte, error) {
    return json.Marshal(a.values)
}

func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
    t, err := readGGUF[uint32](llm, r)
    if err != nil {
        return nil, err
    }

    n, err := readGGUF[uint32](llm, r)
    if err != nil {
        return nil, err
    }

    a := &array{size: int(n)}
    if llm.canCollectArray(int(n)) {
        a.values = make([]any, int(n))
    }

    for i := range n {
        var e any
        switch t {
        case ggufTypeUint8:
            e, err = readGGUF[uint8](llm, r)
        case ggufTypeInt8:
            e, err = readGGUF[int8](llm, r)
        case ggufTypeUint16:
            e, err = readGGUF[uint16](llm, r)
        case ggufTypeInt16:
            e, err = readGGUF[int16](llm, r)
        case ggufTypeUint32:
            e, err = readGGUF[uint32](llm, r)
        case ggufTypeInt32:
            e, err = readGGUF[int32](llm, r)
        case ggufTypeUint64:
            e, err = readGGUF[uint64](llm, r)
        case ggufTypeInt64:
            e, err = readGGUF[int64](llm, r)
        case ggufTypeFloat32:
            e, err = readGGUF[float32](llm, r)
        case ggufTypeFloat64:
            e, err = readGGUF[float64](llm, r)
        case ggufTypeBool:
            e, err = readGGUF[bool](llm, r)
        case ggufTypeString:
            e, err = readGGUFV1String(llm, r)
        default:
            return nil, fmt.Errorf("invalid array type: %d", t)
        }
        if err != nil {
            return nil, err
        }

        if a.values != nil {
            a.values[i] = e
        }
    }

    return a, nil
}

func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
    if llm.Version == 1 {
        return readGGUFV1Array(llm, r)
    }

    t, err := readGGUF[uint32](llm, r)
    if err != nil {
        return nil, err
    }

    n, err := readGGUF[uint64](llm, r)
    if err != nil {
        return nil, err
    }

    a := &array{size: int(n)}
    if llm.canCollectArray(int(n)) {
        a.values = make([]any, int(n))
    }

    for i := range n {
        var e any
        switch t {
        case ggufTypeUint8:
            e, err = readGGUF[uint8](llm, r)
        case ggufTypeInt8:
            e, err = readGGUF[int8](llm, r)
        case ggufTypeUint16:
            e, err = readGGUF[uint16](llm, r)
        case ggufTypeInt16:
            e, err = readGGUF[int16](llm, r)
        case ggufTypeUint32:
            e, err = readGGUF[uint32](llm, r)
        case ggufTypeInt32:
            e, err = readGGUF[int32](llm, r)
        case ggufTypeUint64:
            e, err = readGGUF[uint64](llm, r)
        case ggufTypeInt64:
            e, err = readGGUF[int64](llm, r)
        case ggufTypeFloat32:
            e, err = readGGUF[float32](llm, r)
        case ggufTypeFloat64:
            e, err = readGGUF[float64](llm, r)
        case ggufTypeBool:
            e, err = readGGUF[bool](llm, r)
        case ggufTypeString:
            if a.values != nil {
                e, err = readGGUFString(llm, r)
            } else {
                err = discardGGUFString(llm, r)
            }
        default:
            return nil, fmt.Errorf("invalid array type: %d", t)
        }
        if err != nil {
            return nil, err
        }

        if a.values != nil {
            a.values[i] = e
        }
    }

    return a, nil
}

// writeGGUFArray writes a slice s of type E to the writer with a gguf type of t
func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
    if err := binary.Write(w, binary.LittleEndian, ggufTypeArray); err != nil {
        return err
    }

    if err := binary.Write(w, binary.LittleEndian, t); err != nil {
        return err
    }

    if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil {
        return err
    }

    return binary.Write(w, binary.LittleEndian, s)
}

func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
    if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
        return err
    }

    if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
        return err
    }

    if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
        return err
    }

    if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
        return err
    }

    keys := slices.Collect(maps.Keys(kv))
    slices.Sort(keys)

    for _, key := range keys {
        if err := ggufWriteKV(ws, key, kv[key]); err != nil {
            return err
        }
    }

    slices.SortStableFunc(ts, func(a, b Tensor) int {
        if i, j := a.block(), b.block(); i < 0 && j > 0 {
            return 1
        } else if i > 0 && j < 0 {
            return -1
        } else {
            return cmp.Compare(i, j)
        }
    })

    var s uint64
    for _, t := range ts {
        t.Offset = s
        if err := ggufWriteTensorInfo(ws, t); err != nil {
            return err
        }
        s += t.Size()
    }

    var alignment int64 = 32
    for _, t := range ts {
        if err := ggufWriteTensor(ws, t, alignment); err != nil {
            return err
        }
    }

    return nil
}

func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
    slog.Debug(k, "type", fmt.Sprintf("%T", v))
    if err := binary.Write(ws, binary.LittleEndian, uint64(len(k))); err != nil {
        return err
    }

    if err := binary.Write(ws, binary.LittleEndian, []byte(k)); err != nil {
        return err
    }

    var err error
    switch v := v.(type) {
    case uint32:
        err = writeGGUF(ws, ggufTypeUint32, v)
    case float32:
        err = writeGGUF(ws, ggufTypeFloat32, v)
    case bool:
        err = writeGGUF(ws, ggufTypeBool, v)
    case string:
        err = writeGGUFString(ws, v)
    case []int32:
        err = writeGGUFArray(ws, ggufTypeInt32, v)
    case []uint32:
        err = writeGGUFArray(ws, ggufTypeUint32, v)
    case []float32:
        err = writeGGUFArray(ws, ggufTypeFloat32, v)
    case []string:
        if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil {
            return err
        }

        if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil {
            return err
        }

        if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil {
            return err
        }

        for _, e := range v {
            if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil {
                return err
            }

            if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil {
                return err
            }
        }
    default:
        return fmt.Errorf("improper type for '%s'", k)
    }

    return err
}

func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
    slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
    if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
        return err
    }

    if err := binary.Write(ws, binary.LittleEndian, []byte(t.Name)); err != nil {
        return err
    }

    if err := binary.Write(ws, binary.LittleEndian, uint32(len(t.Shape))); err != nil {
        return err
    }

    for i := range len(t.Shape) {
        if err := binary.Write(ws, binary.LittleEndian, t.Shape[len(t.Shape)-i-1]); err != nil {
            return err
        }
    }

    if err := binary.Write(ws, binary.LittleEndian, t.Kind); err != nil {
        return err
    }

    return binary.Write(ws, binary.LittleEndian, t.Offset)
}

func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
    offset, err := ws.Seek(0, io.SeekCurrent)
    if err != nil {
        return err
    }

    if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
        return err
    }

    _, err = t.WriteTo(ws)
    return err
}

func ggufPadding(offset, align int64) int64 {
    return (align - offset%align) % align
}
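ggufPadding returns how many zero bytes are needed to round an offset up to the next multiple of align, and zero when the offset is already aligned; both WriteGGUF and the tensor-offset bookkeeping in Decode rely on it. An illustrative test of that rule (the test file itself is not part of this commit):

package ggml

import "testing"

// TestGGUFPadding sketches the padding rule: pad to the next multiple of
// align, and add nothing when the offset is already aligned.
func TestGGUFPadding(t *testing.T) {
    if got := ggufPadding(100, 32); got != 28 {
        t.Errorf("ggufPadding(100, 32) = %d, want 28", got)
    }
    if got := ggufPadding(128, 32); got != 0 {
        t.Errorf("ggufPadding(128, 32) = %d, want 0", got)
    }
}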
180 fs/ggml/type.go Normal file
@@ -0,0 +1,180 @@
package ggml

import "fmt"

type fileType uint32

const (
    fileTypeF32 fileType = iota
    fileTypeF16
    fileTypeQ4_0
    fileTypeQ4_1
    fileTypeQ4_1_F16
    fileTypeQ4_2 // unused
    fileTypeQ4_3 // unused
    fileTypeQ8_0
    fileTypeQ5_0
    fileTypeQ5_1
    fileTypeQ2_K
    fileTypeQ3_K_S
    fileTypeQ3_K_M
    fileTypeQ3_K_L
    fileTypeQ4_K_S
    fileTypeQ4_K_M
    fileTypeQ5_K_S
    fileTypeQ5_K_M
    fileTypeQ6_K
    fileTypeIQ2_XXS
    fileTypeIQ2_XS
    fileTypeQ2_K_S
    fileTypeIQ3_XS
    fileTypeIQ3_XXS
    fileTypeIQ1_S
    fileTypeIQ4_NL
    fileTypeIQ3_S
    fileTypeIQ2_S
    fileTypeIQ4_XS
    fileTypeIQ2_M
    fileTypeIQ1_M
    fileTypeBF16

    fileTypeUnknown
)

func ParseFileType(s string) (fileType, error) {
    switch s {
    case "F32":
        return fileTypeF32, nil
    case "F16":
        return fileTypeF16, nil
    case "Q4_0":
        return fileTypeQ4_0, nil
    case "Q4_1":
        return fileTypeQ4_1, nil
    case "Q4_1_F16":
        return fileTypeQ4_1_F16, nil
    case "Q8_0":
        return fileTypeQ8_0, nil
    case "Q5_0":
        return fileTypeQ5_0, nil
    case "Q5_1":
        return fileTypeQ5_1, nil
    case "Q2_K":
        return fileTypeQ2_K, nil
    case "Q3_K_S":
        return fileTypeQ3_K_S, nil
    case "Q3_K_M":
        return fileTypeQ3_K_M, nil
    case "Q3_K_L":
        return fileTypeQ3_K_L, nil
    case "Q4_K_S":
        return fileTypeQ4_K_S, nil
    case "Q4_K_M":
        return fileTypeQ4_K_M, nil
    case "Q5_K_S":
        return fileTypeQ5_K_S, nil
    case "Q5_K_M":
        return fileTypeQ5_K_M, nil
    case "Q6_K":
        return fileTypeQ6_K, nil
    case "IQ2_XXS":
        return fileTypeIQ2_XXS, nil
    case "IQ2_XS":
        return fileTypeIQ2_XS, nil
    case "Q2_K_S":
        return fileTypeQ2_K_S, nil
    case "IQ3_XS":
        return fileTypeIQ3_XS, nil
    case "IQ3_XXS":
        return fileTypeIQ3_XXS, nil
    case "IQ1_S":
        return fileTypeIQ1_S, nil
    case "IQ4_NL":
        return fileTypeIQ4_NL, nil
    case "IQ3_S":
        return fileTypeIQ3_S, nil
    case "IQ2_S":
        return fileTypeIQ2_S, nil
    case "IQ4_XS":
        return fileTypeIQ4_XS, nil
    case "IQ2_M":
        return fileTypeIQ2_M, nil
    case "IQ1_M":
        return fileTypeIQ1_M, nil
    case "BF16":
        return fileTypeBF16, nil
    default:
        return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
    }
}

func (t fileType) String() string {
    switch t {
    case fileTypeF32:
        return "F32"
    case fileTypeF16:
        return "F16"
    case fileTypeQ4_0:
        return "Q4_0"
    case fileTypeQ4_1:
        return "Q4_1"
    case fileTypeQ4_1_F16:
        return "Q4_1_F16"
    case fileTypeQ8_0:
        return "Q8_0"
    case fileTypeQ5_0:
        return "Q5_0"
    case fileTypeQ5_1:
        return "Q5_1"
    case fileTypeQ2_K:
        return "Q2_K"
    case fileTypeQ3_K_S:
        return "Q3_K_S"
    case fileTypeQ3_K_M:
        return "Q3_K_M"
    case fileTypeQ3_K_L:
        return "Q3_K_L"
    case fileTypeQ4_K_S:
        return "Q4_K_S"
    case fileTypeQ4_K_M:
        return "Q4_K_M"
    case fileTypeQ5_K_S:
        return "Q5_K_S"
    case fileTypeQ5_K_M:
        return "Q5_K_M"
    case fileTypeQ6_K:
        return "Q6_K"
    case fileTypeIQ2_XXS:
        return "IQ2_XXS"
    case fileTypeIQ2_XS:
        return "IQ2_XS"
    case fileTypeQ2_K_S:
        return "Q2_K_S"
    case fileTypeIQ3_XS:
        return "IQ3_XS"
    case fileTypeIQ3_XXS:
        return "IQ3_XXS"
    case fileTypeIQ1_S:
        return "IQ1_S"
    case fileTypeIQ4_NL:
        return "IQ4_NL"
    case fileTypeIQ3_S:
        return "IQ3_S"
    case fileTypeIQ2_S:
        return "IQ2_S"
    case fileTypeIQ4_XS:
        return "IQ4_XS"
    case fileTypeIQ2_M:
        return "IQ2_M"
    case fileTypeIQ1_M:
        return "IQ1_M"
    case fileTypeBF16:
        return "BF16"
    default:
        return "unknown"
    }
}

func (t fileType) Value() uint32 {
    return uint32(t)
}
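ParseFileType and fileType.String are meant to round-trip for every supported quantization name. A short illustrative check, again not part of the commit:

package ggml

import "testing"

// TestFileTypeRoundTrip parses a few quantization names and checks that
// printing them back yields the same string.
func TestFileTypeRoundTrip(t *testing.T) {
    for _, name := range []string{"F16", "Q4_K_M", "Q8_0", "BF16"} {
        ft, err := ParseFileType(name)
        if err != nil {
            t.Fatal(err)
        }
        if ft.String() != name {
            t.Errorf("got %s, want %s", ft, name)
        }
    }
}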
4 go.mod
@@ -18,11 +18,14 @@ require (
require (
    github.com/agnivade/levenshtein v1.1.1
    github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
    github.com/dlclark/regexp2 v1.11.4
    github.com/emirpasic/gods/v2 v2.0.0-alpha
    github.com/google/go-cmp v0.6.0
    github.com/mattn/go-runewidth v0.0.14
    github.com/nlpodyssey/gopickle v0.3.0
    github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
    golang.org/x/image v0.22.0
    gonum.org/v1/gonum v0.15.0
)

require (
@@ -42,7 +45,6 @@ require (
    github.com/xtgo/set v1.0.0 // indirect
    go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
    golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
    gonum.org/v1/gonum v0.15.0 // indirect
    gorgonia.org/vecf32 v0.9.0 // indirect
    gorgonia.org/vecf64 v0.9.0 // indirect
)
4 go.sum
@@ -42,8 +42,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
@@ -9,7 +9,7 @@ import (
    "strings"
    "sync"

    "github.com/ollama/ollama/util/bufioutil"
    "github.com/ollama/ollama/fs/util/bufioutil"
)

type GGML struct {
191 ml/backend.go Normal file
@@ -0,0 +1,191 @@
package ml

import (
    "bytes"
    "encoding/binary"
    "fmt"
    "io"
    "strings"
)

type Config interface {
    Architecture() string
    String(string, ...string) string
    Uint(string, ...uint32) uint32
    Float(string, ...float32) float32

    Strings(string, ...[]string) []string
    Uints(string, ...[]uint32) []uint32
}

type Backend interface {
    Config() Config
    Get(name string) Tensor
    NewContext() Context
}

var backends = make(map[string]func(io.ReadSeeker) (Backend, error))

func RegisterBackend(name string, f func(io.ReadSeeker) (Backend, error)) {
    if _, ok := backends[name]; ok {
        panic("backend: backend already registered")
    }

    backends[name] = f
}

func NewBackend(r io.ReadSeeker) (Backend, error) {
    if backend, ok := backends["ggml"]; ok {
        return backend(r)
    }

    return nil, fmt.Errorf("unsupported backend")
}

type Context interface {
    Zeros(dtype DType, shape ...int) Tensor
    FromFloatSlice(s []float32, shape ...int) (Tensor, error)
    FromIntSlice(s []int32, shape ...int) (Tensor, error)

    Forward(Tensor)
    Compute(Tensor) Tensor
    Close() error
}

type Tensor interface {
    Dim(n int) int64
    Stride(n int) int64

    Shape() []int64
    DType() DType

    Bytes() []byte
    Floats() []float32

    Add(ctx Context, t2 Tensor) Tensor
    Mul(ctx Context, t2 Tensor) Tensor
    Mulmat(ctx Context, t2 Tensor) Tensor

    Softmax(ctx Context) Tensor
    Norm(ctx Context, eps float32) Tensor
    RMSNorm(ctx Context, eps float32) Tensor
    Scale(ctx Context, s float64) Tensor

    Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
    Rope(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor

    Tanh(ctx Context) Tensor
    GELU(ctx Context) Tensor
    SILU(ctx Context) Tensor

    Reshape(ctx Context, shape ...int64) Tensor
    View(ctx Context, offset int, shape ...int) Tensor
    Permute(ctx Context, shape ...int) Tensor
    Contiguous(ctx Context) Tensor

    Pad(ctx Context, shape ...int64) Tensor
    Unpad(ctx Context, shape ...int64) Tensor

    Stack(ctx Context, dim int, s ...Tensor) Tensor
    Concat(ctx Context, t2 Tensor, dim int) Tensor
    Rows(ctx Context, t2 Tensor) Tensor
    Copy(ctx Context, t2 Tensor) Tensor
}

type number interface {
    ~int | ~int8 | ~int16 | ~int32 | ~int64 |
        ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 |
        ~float32 | ~float64 |
        ~complex64 | ~complex128
}

func mul[T number](s ...T) T {
    p := T(1)
    for _, v := range s {
        p *= v
    }

    return p
}

type DumpOptions struct {
    // Items is the number of elements to print at the beginning and end of each dimension.
    Items int64

    // Precision is the number of decimal places to print. Applies to float32 and float64.
    Precision int
}

func Dump(t Tensor, opts ...DumpOptions) string {
    if len(opts) < 1 {
        opts = append(opts, DumpOptions{
            Items:     3,
            Precision: 4,
        })
    }

    switch t.DType() {
    case DTypeF32:
        return dump[[]float32](t, opts[0])
    case DTypeI32:
        return dump[[]int32](t, opts[0])
    default:
        return "<unsupported>"
    }
}

func dump[S ~[]E, E number](t Tensor, opts DumpOptions) string {
    bts := t.Bytes()
    if bts == nil {
        return "<nil>"
    }

    s := make(S, mul(t.Shape()...))
    if err := binary.Read(bytes.NewBuffer(t.Bytes()), binary.LittleEndian, &s); err != nil {
        panic(err)
    }

    shape := t.Shape()

    var sb strings.Builder
    var f func([]int64, int64)
    f = func(dims []int64, stride int64) {
        prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
        fmt.Fprint(&sb, "[")
        defer func() { fmt.Fprint(&sb, "]") }()
        for i := int64(0); i < dims[0]; i++ {
            if i >= opts.Items && i < dims[0]-opts.Items {
                fmt.Fprint(&sb, "..., ")
                // skip to next printable element
                skip := dims[0] - 2*opts.Items
                if len(dims) > 1 {
                    stride += mul(append(dims[1:], skip)...)
                    fmt.Fprint(&sb, strings.Repeat("\n", len(dims)-1), prefix)
                }
                i += skip - 1
            } else if len(dims) > 1 {
                f(dims[1:], stride)
                stride += mul(dims[1:]...)
                if i < dims[0]-1 {
                    fmt.Fprint(&sb, ",", strings.Repeat("\n", len(dims)-1), prefix)
                }
            } else {
                fmt.Fprint(&sb, s[stride+i])
                if i < dims[0]-1 {
                    fmt.Fprint(&sb, ", ")
                }
            }
        }
    }
    f(shape, 0)

    return sb.String()
}

type DType int

const (
    DTypeF32 DType = iota
    DTypeI32
    DTypeOther
)
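Dump prints a tensor with interior elements elided once a dimension exceeds Items on each edge, defaulting to 3 items and 4 decimal places. A hedged sketch of overriding those defaults for any ml.Tensor already produced by a Context; the helper package and function are hypothetical:

package debugutil

import (
    "fmt"

    "github.com/ollama/ollama/ml"
)

// printTensor renders t with more edge items and wider precision than the
// Dump defaults.
func printTensor(t ml.Tensor) {
    fmt.Println(ml.Dump(t, ml.DumpOptions{Items: 5, Precision: 6}))
}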
5 ml/backend/backend.go Normal file
@@ -0,0 +1,5 @@
package backend

import (
    _ "github.com/ollama/ollama/ml/backend/ggml"
)
470 ml/backend/ggml/backend.go Normal file
@@ -0,0 +1,470 @@
package ggml

// #cgo CPPFLAGS: -DNDEBUG
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-backend.h"
import "C"

import (
    "bytes"
    "fmt"
    "io"
    "log/slog"
    "unsafe"

    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/fs/ggml"
    "github.com/ollama/ollama/ml"
)

type Backend struct {
    c  *C.struct_ggml_context
    b  *C.struct_ggml_backend
    bb *C.struct_ggml_backend_buffer

    ggml.KV
    ggml.Tensors
}

func New(r io.ReadSeeker) (ml.Backend, error) {
    f, _, err := ggml.DecodeGGML(r, -1)
    if err != nil {
        return nil, err
    }

    slog.Info(
        "",
        "architecture", f.KV().Architecture(),
        "file_type", f.KV().FileType(),
        "name", f.KV().String("general.name"),
        "description", f.KV().String("general.description"),
        "num_tensors", len(f.Tensors().Items),
        "num_key_values", len(f.KV()),
    )

    c := C.ggml_init(C.struct_ggml_init_params{
        mem_size:   C.size_t(len(f.Tensors().Items)) * C.ggml_tensor_overhead(),
        mem_buffer: nil,
        no_alloc:   true,
    })

    for _, t := range f.Tensors().Items {
        func() {
            cname := C.CString(t.Name)
            defer C.free(unsafe.Pointer(cname))

            tt := C.ggml_new_tensor(c, t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0])))
            C.ggml_set_name(tt, cname)
        }()
    }

    b := newBackend()
    bb := C.ggml_backend_alloc_ctx_tensors(c, b)
    for _, t := range f.Tensors().Items {
        if _, err := r.Seek(int64(f.Tensors().Offset+t.Offset), io.SeekStart); err != nil {
            return nil, err
        }

        var b bytes.Buffer
        n, err := io.CopyN(&b, r, int64(t.Size()))
        if err != nil {
            return nil, err
        }

        if n != int64(t.Size()) {
            return nil, fmt.Errorf("expected %d bytes, got %d", t.Size(), n)
        }

        func() {
            cname := C.CString(t.Name)
            defer C.free(unsafe.Pointer(cname))

            cbytes := C.CBytes(b.Bytes())
            defer C.free(cbytes)

            C.ggml_backend_tensor_set(C.ggml_get_tensor(c, cname), cbytes, 0, C.size_t(n))
        }()
    }

    return &Backend{c, b, bb, f.KV(), f.Tensors()}, nil
}

func init() {
    ml.RegisterBackend("ggml", New)
}

func (b *Backend) Config() ml.Config {
    return b.KV
}

func (b *Backend) Get(name string) ml.Tensor {
    cname := C.CString(name)
    defer C.free(unsafe.Pointer(cname))
    if t := C.ggml_get_tensor(b.c, cname); t != nil {
        return &Tensor{t}
    }

    return nil
}

func (b *Backend) NewContext() ml.Context {
    n := max(8192, len(b.Tensors.Items)*5)
    bts := make([]byte, C.size_t(n)*C.ggml_tensor_overhead()+C.ggml_graph_overhead_custom(C.size_t(n), false))
    c := C.ggml_init(C.struct_ggml_init_params{
        mem_buffer: unsafe.Pointer(&bts[0]),
        mem_size:   C.size_t(len(bts)),
        no_alloc:   true,
    })
    return &Context{
        b: b.b,
        c: c,
        g: C.ggml_new_graph_custom(c, C.size_t(n), false),
    }
}

type Context struct {
    b *C.struct_ggml_backend
    c *C.struct_ggml_context
    g *C.struct_ggml_cgraph
}

func (c *Context) Forward(t ml.Tensor) {
    C.ggml_build_forward_expand(c.g, t.(*Tensor).t)
}

func (c *Context) Compute(t ml.Tensor) ml.Tensor {
    c.Forward(t)

    a := C.ggml_gallocr_new(C.ggml_backend_get_default_buffer_type(c.b))
    C.ggml_gallocr_alloc_graph(a, c.g)
    slog.Debug("compute graph memory", "require", format.HumanBytes2(uint64(C.ggml_gallocr_get_buffer_size(a, 0))))

    C.ggml_backend_graph_compute(c.b, c.g)
    return &Tensor{
        C.ggml_graph_node(c.g, C.ggml_graph_n_nodes(c.g)-1),
    }
}

func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
    if len(shape) < 1 || len(shape) > 4 {
        panic("unsupported number of dimensions")
    }

    for _, dim := range shape {
        if dim < 1 {
            panic("invalid shape")
        }
    }

    var t *C.struct_ggml_tensor
    switch dtype {
    case ml.DTypeF32:
        t = C.ggml_new_tensor(c.c, C.GGML_TYPE_F32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
    case ml.DTypeI32:
        t = C.ggml_new_tensor(c.c, C.GGML_TYPE_I32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
    default:
        panic("unsupported dtype")
    }

    b := C.ggml_backend_alloc_buffer(c.b, C.ggml_nbytes(t))
    C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
    C.ggml_set_f32(t, 0.)
    return &Tensor{t}
}

func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
    n := len(s)
    for _, v := range shape {
        n /= v
    }

    if n != 1 {
        return nil, fmt.Errorf("invalid shape %v for %d elements", shape, len(s))
    }

    t := C.ggml_new_tensor(ctx.c, dtype, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
    b := C.ggml_backend_alloc_buffer(ctx.b, C.ggml_nbytes(t))
    C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
    C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
    return &Tensor{t}, nil
}

func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
    return fromSlice(c, s, shape, C.GGML_TYPE_F32)
}

func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
    return fromSlice(c, s, shape, C.GGML_TYPE_I32)
}

func (c *Context) Close() error {
    C.ggml_free(c.c)
    return nil
}

type Tensor struct {
    t *C.struct_ggml_tensor
}

func (t *Tensor) LogValue() slog.Value {
    return slog.GroupValue(
        slog.String("name", C.GoString(C.ggml_get_name(t.t))),
        slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
        slog.Any("shape", t.Shape()),
    )
}

func (t *Tensor) Dim(n int) int64 {
    return int64(t.t.ne[n])
}

func (t *Tensor) Stride(n int) int64 {
    return int64(t.t.nb[n])
}

func (t *Tensor) Shape() []int64 {
    shape := make([]int64, C.ggml_n_dims(t.t))
    for i := range shape {
        shape[i] = t.Dim(i)
    }

    return shape
}

func (t *Tensor) Bytes() []byte {
    if bts := C.ggml_get_data(t.t); bts != nil {
        return C.GoBytes(bts, C.int(C.ggml_nbytes(t.t)))
    }

    return nil
}

func (t *Tensor) Floats() []float32 {
    if s := C.ggml_get_data_f32(t.t); s != nil {
        f32s := make([]float32, C.ggml_nelements(t.t))
        for i, v := range unsafe.Slice(s, C.ggml_nelements(t.t)) {
            f32s[i] = float32(v)
        }

        return f32s
    }

    return nil
}

func (t *Tensor) DType() ml.DType {
    switch t.t._type {
    case C.GGML_TYPE_F32:
        return ml.DTypeF32
    case C.GGML_TYPE_I32:
        return ml.DTypeI32
    default:
        return ml.DTypeOther
    }
}

func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
    return &Tensor{
        C.ggml_add(ctx.(*Context).c, t.t, t2.(*Tensor).t),
    }
}

func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
    if len(s) > 0 {
        return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
    }

    return t
}

func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
    return &Tensor{
        C.ggml_concat(ctx.(*Context).c, t.t, t2.(*Tensor).t, C.int(dim)),
    }
}

func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
    return &Tensor{
        C.ggml_cont(ctx.(*Context).c, t.t),
    }
}

func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
    return &Tensor{
        C.ggml_mul(ctx.(*Context).c, t.t, t2.(*Tensor).t),
    }
}

func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
    return &Tensor{
        C.ggml_mul_mat(ctx.(*Context).c, t.t, t2.(*Tensor).t),
    }
}

func (t *Tensor) Norm(ctx ml.Context, eps float32) ml.Tensor {
    return &Tensor{
        C.ggml_norm(ctx.(*Context).c, t.t, (C.float)(eps)),
    }
}

func (t *Tensor) RMSNorm(ctx ml.Context, eps float32) ml.Tensor {
    return &Tensor{
        C.ggml_rms_norm(ctx.(*Context).c, t.t, C.float(eps)),
    }
}

func (t *Tensor) Pad(ctx ml.Context, shape ...int64) ml.Tensor {
    if len(shape) != 4 {
        panic("expected 4 dimensions")
    }

    return &Tensor{
        C.ggml_pad(ctx.(*Context).c, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
    }
}

func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
    if len(shape) != 4 {
        panic("expected 4 dimensions")
    }

    return &Tensor{
        C.ggml_permute(ctx.(*Context).c, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
    }
}

func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
    return &Tensor{
        C.ggml_get_rows(ctx.(*Context).c, t.t, t2.(*Tensor).t),
    }
}

func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
    return &Tensor{
        C.ggml_cpy(ctx.(*Context).c, t.t, t2.(*Tensor).t),
    }
}

func (t *Tensor) Reshape(ctx ml.Context, shape ...int64) ml.Tensor {
    switch len(shape) {
    case 1:
        return &Tensor{
            C.ggml_reshape_1d(ctx.(*Context).c, t.t, C.int64_t(shape[0])),
        }
    case 2:
        return &Tensor{
            C.ggml_reshape_2d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
        }
    case 3:
        return &Tensor{
            C.ggml_reshape_3d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
        }
    case 4:
        return &Tensor{
            C.ggml_reshape_4d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
        }
    default:
        panic("unsupported number of dimensions")
    }
}

func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
    return &Tensor{
        C.ggml_scale(ctx.(*Context).c, t.t, (C.float)(s)),
    }
}

func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
    return &Tensor{
        C.ggml_soft_max(ctx.(*Context).c, t.t),
    }
}

func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
    return &Tensor{
        C.ggml_tanh_inplace(ctx.(*Context).c, t.t),
    }
}

func (t *Tensor) Unpad(ctx ml.Context, shape ...int64) ml.Tensor {
    if len(shape) != 4 {
        panic("expected 4 dimensions")
    }

    return &Tensor{
        C.ggml_unpad(ctx.(*Context).c, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
    }
}

func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
    switch len(shape) {
    case 1:
        return &Tensor{
            C.ggml_view_1d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.size_t(offset)),
        }
    case 3:
        return &Tensor{
            C.ggml_view_2d(ctx.(*Context).c, t.t,
                C.int64_t(shape[0]), C.int64_t(shape[2]),
                C.size_t(shape[1]),
                C.size_t(offset)),
        }
    case 5:
        return &Tensor{
            C.ggml_view_3d(ctx.(*Context).c, t.t,
                C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
                C.size_t(shape[1]), C.size_t(shape[3]),
                C.size_t(offset)),
        }
    case 7:
        return &Tensor{
            C.ggml_view_4d(ctx.(*Context).c, t.t,
                C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
                C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
                C.size_t(offset)),
        }
    default:
        panic("unsupported number of dimensions")
    }
}

const (
    ropeTypeNorm C.int = iota
)

func (t *Tensor) Rope(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
    return &Tensor{
        C.ggml_rope_ext(
            ctx.(*Context).c, t.t, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
            C.int(ropeDim),
            131072,       // YaRN n_ctx_train
            ropeTypeNorm, // ROPE_TYPE_NORM
            C.float(ropeBase),
            C.float(ropeScale),
            0.,  // YaRN ext_factor
            1.,  // YaRN attn_factor
            32., // YaRN beta_fast
            1.,  // YaRN beta_slow
        ),
    }
}

func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
    return &Tensor{
        C.ggml_gelu_inplace(ctx.(*Context).c, t.t),
    }
}

func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
    return &Tensor{
        C.ggml_silu_inplace(ctx.(*Context).c, t.t),
    }
}

func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
    return &Tensor{
        C.ggml_conv_2d(ctx.(*Context).c, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
    }
}
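Putting the pieces together: the ggml backend registers itself under the name "ggml" in init, so a blank import is enough for ml.NewBackend to find it. The sketch below is illustrative only; the model path is made up, and whether results can be read back this way end to end depends on backend buffer details not shown in this diff.

package main

import (
    "fmt"
    "log"
    "os"

    "github.com/ollama/ollama/ml"
    _ "github.com/ollama/ollama/ml/backend/ggml" // registers the "ggml" backend
)

func main() {
    f, err := os.Open("model.gguf") // illustrative path
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    b, err := ml.NewBackend(f)
    if err != nil {
        log.Fatal(err)
    }

    ctx := b.NewContext()
    defer ctx.Close()

    a, err := ctx.FromFloatSlice([]float32{1, 2, 3}, 3)
    if err != nil {
        log.Fatal(err)
    }
    c, err := ctx.FromFloatSlice([]float32{4, 5, 6}, 3)
    if err != nil {
        log.Fatal(err)
    }

    // Build the graph node, run it, and read the result back as floats.
    out := ctx.Compute(a.Add(ctx, c))
    fmt.Println(out.Floats()) // expected [5 7 9]
}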
8 ml/backend/ggml/backend_cpu.go Normal file
@@ -0,0 +1,8 @@
package ggml

// #include "ggml-backend.h"
import "C"

func newCPUBackend() *C.struct_ggml_backend {
    return C.ggml_backend_cpu_init()
}
13 ml/backend/ggml/backend_darwin.go Normal file
@@ -0,0 +1,13 @@
package ggml

//go:generate sh -c "echo \"// Code generated $(date). DO NOT EDIT.\n\" >ggml-metal-embed.metal"
//go:generate sh -c "sed -e '/#include \"ggml-common.h\"/r ggml-common.h' -e '/#include \"ggml-common.h\"/d' ggml-metal.metal >>ggml-metal-embed.metal"

// #cgo arm64 CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DGGML_METAL_NDEBUG
// #cgo arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #include "ggml-metal.h"
import "C"

func newBackend() *C.struct_ggml_backend {
    return C.ggml_backend_metal_init()
}
6 ml/backend/ggml/backend_debug.go Normal file
@@ -0,0 +1,6 @@
//go:build debug

package ggml

// #cgo CPPFLAGS: -DOLLAMA_DEBUG
import "C"
10 ml/backend/ggml/backend_linux.go Normal file
@@ -0,0 +1,10 @@
package ggml

// #cgo CPPFLAGS: -D_GNU_SOURCE
// #cgo LDFLAGS: -lm
// #include "ggml-backend.h"
import "C"

func newBackend() *C.struct_ggml_backend {
    return newCPUBackend()
}
10
ml/backend/ggml/backend_windows.go
Normal file
@ -0,0 +1,10 @@
package ggml

// #cgo CPPFLAGS: -D_WIN32_WINNT=0x602
// #cgo LDFLAGS: -lmsvcrt -static -static-libgcc -static-libstdc++
// #include "ggml-backend.h"
import "C"

func newBackend() *C.struct_ggml_backend {
    return newCPUBackend()
}
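Taken together, the platform files above give the package a single newBackend constructor that is selected at build time by GOOS: Metal on darwin/arm64, the plain CPU backend on linux and windows. A minimal sketch of how a caller in the same package might initialize it; the wrapper name below is an assumption for illustration and is not defined by this commit.

// initBackend is hypothetical; it simply defers to whichever newBackend
// the build selected, falling back to the CPU backend if that fails.
func initBackend() *C.struct_ggml_backend {
    if b := newBackend(); b != nil {
        return b
    }
    return newCPUBackend()
}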
3235
ml/backend/ggml/ggml-aarch64.c
vendored
Normal file
File diff suppressed because it is too large
65
ml/backend/ggml/ggml-aarch64.h
vendored
Normal file
@ -0,0 +1,65 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once

#define GGML_COMMON_DECL_C
#include "ggml-common.h"

#include "ggml.h"

// GGML internal header

#ifdef __cplusplus
extern "C" {
#endif

// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);

// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#ifdef __cplusplus
}
#endif
1068
ml/backend/ggml/ggml-alloc.c
vendored
Normal file
File diff suppressed because it is too large
102
ml/backend/ggml/ggml-alloc.h
vendored
Normal file
@ -0,0 +1,102 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;

// Tensor allocator
struct ggml_tallocr {
    ggml_backend_buffer_t buffer;
    void * base;
    size_t alignment;
    size_t offset;
};

GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

// Graph allocator
/*
  Example usage:
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());

    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
    ggml_gallocr_reserve(galloc, build_graph(max_batch));

    // allocate the graph
    struct ggml_cgraph * graph = build_graph(batch);
    ggml_gallocr_alloc_graph(galloc, graph);

    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    // evaluate the graph
    ggml_backend_graph_compute(backend, graph);
*/

// special tensor flags for use with the graph allocator:
//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
//   ggml_set_output(): output tensors are never freed and never overwritten

typedef struct ggml_gallocr * ggml_gallocr_t;

GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);

// pre-allocate buffers from a measure graph - does not allocate or modify the graph
//   call with a worst-case graph to avoid buffer reallocations
//   not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
//   returns false if the buffer allocation failed
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API bool ggml_gallocr_reserve_n(
    ggml_gallocr_t galloc,
    struct ggml_cgraph * graph,
    const int * node_buffer_ids,
    const int * leaf_buffer_ids);

// automatic reallocation if the topology changes when using a single buffer
//   returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

#ifdef __cplusplus
}
#endif
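For context, the example comment in the vendored header above shows the intended C flow for the graph allocator. A rough sketch of how the same flow could be driven from the Go side of this package through cgo; the function name and the choice of the CPU buffer type are assumptions for illustration, not part of this commit, and the file is assumed to include "ggml-backend.h" in its cgo preamble.

// allocGraph is illustrative only: it allocates the compute buffer for a
// graph with a single CPU-backed allocator and reports its size. The
// allocator (and its buffer) must stay alive while the graph is computed.
func allocGraph(graph *C.struct_ggml_cgraph) (C.ggml_gallocr_t, C.size_t) {
    galloc := C.ggml_gallocr_new(C.ggml_backend_cpu_buffer_type())

    // single-buffer use: alloc_graph reserves and allocates in one call
    if !C.ggml_gallocr_alloc_graph(galloc, graph) {
        C.ggml_gallocr_free(galloc)
        return nil, 0
    }
    return galloc, C.ggml_gallocr_get_buffer_size(galloc, 0)
}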
180
ml/backend/ggml/ggml-backend-impl.h
vendored
Normal file
180
ml/backend/ggml/ggml-backend-impl.h
vendored
Normal file
@ -0,0 +1,180 @@
|
||||
/**
|
||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// ggml-backend internal header
|
||||
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// Backend buffer
|
||||
//
|
||||
|
||||
// buffer type
|
||||
typedef void * ggml_backend_buffer_type_context_t;
|
||||
|
||||
struct ggml_backend_buffer_type_i {
|
||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
||||
// allocate a buffer of this type
|
||||
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||
// tensor alignment
|
||||
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
|
||||
// max buffer size that can be allocated
|
||||
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
|
||||
// data size needed to allocate the tensor, including padding
|
||||
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||
// check if tensor data is in host memory
|
||||
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer_type {
|
||||
struct ggml_backend_buffer_type_i iface;
|
||||
ggml_backend_buffer_type_context_t context;
|
||||
};
|
||||
|
||||
// buffer
|
||||
typedef void * ggml_backend_buffer_context_t;
|
||||
|
||||
struct ggml_backend_buffer_i {
|
||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
|
||||
void (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer);
|
||||
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
|
||||
void (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
void (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
||||
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer {
|
||||
struct ggml_backend_buffer_i iface;
|
||||
ggml_backend_buffer_type_t buft;
|
||||
ggml_backend_buffer_context_t context;
|
||||
size_t size;
|
||||
enum ggml_backend_buffer_usage usage;
|
||||
};
|
||||
|
||||
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
ggml_backend_buffer_type_t buft,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
ggml_backend_buffer_context_t context,
|
||||
size_t size);
|
||||
|
||||
// do not use directly, use ggml_backend_tensor_copy instead
|
||||
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// buffer that contains a collection of buffers
|
||||
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
||||
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
||||
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
|
||||
//
|
||||
// Backend
|
||||
//
|
||||
|
||||
typedef void * ggml_backend_context_t;
|
||||
|
||||
struct ggml_backend_i {
|
||||
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
|
||||
|
||||
void (*GGML_CALL free)(ggml_backend_t backend);
|
||||
|
||||
// buffer allocation
|
||||
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
|
||||
|
||||
// (optional) asynchronous tensor data access
|
||||
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// (optional) complete all pending operations
|
||||
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
||||
|
||||
// compute graph with a plan (not used currently)
|
||||
// create a new plan for a graph
|
||||
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
||||
void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
|
||||
// compute the graph with the plan
|
||||
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
||||
// compute graph without a plan (async)
|
||||
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
// check if the backend can compute an operation
|
||||
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// check if the backend can use tensors allocated in a buffer type
|
||||
bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
||||
|
||||
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
||||
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
||||
// even if the weight has to be copied from the CPU temporarily
|
||||
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// (optional) event synchronization
|
||||
// create a new event that can record events on this backend instance
|
||||
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
|
||||
void (*GGML_CALL event_free) (ggml_backend_event_t event);
|
||||
// record an event on the backend instance that created it
|
||||
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
||||
// wait for an event on on a different backend instance
|
||||
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
// block until an event is recorded
|
||||
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
||||
};
|
||||
|
||||
struct ggml_backend {
|
||||
ggml_guid_t guid;
|
||||
|
||||
struct ggml_backend_i iface;
|
||||
ggml_backend_context_t context;
|
||||
};
|
||||
|
||||
struct ggml_backend_event {
|
||||
ggml_backend_t backend;
|
||||
void * context;
|
||||
};
|
||||
|
||||
//
|
||||
// Backend registry
|
||||
//
|
||||
|
||||
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
|
||||
|
||||
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
2325
ml/backend/ggml/ggml-backend.c
vendored
Normal file
File diff suppressed because it is too large
267
ml/backend/ggml/ggml-backend.h
vendored
Normal file
267
ml/backend/ggml/ggml-backend.h
vendored
Normal file
@ -0,0 +1,267 @@
|
||||
/**
|
||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||
typedef struct ggml_backend_event * ggml_backend_event_t;
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
typedef void * ggml_backend_graph_plan_t;
|
||||
|
||||
//
|
||||
// Backend buffer
|
||||
//
|
||||
|
||||
// buffer type
|
||||
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
|
||||
// buffer
|
||||
enum ggml_backend_buffer_usage {
|
||||
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
|
||||
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
|
||||
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
|
||||
};
|
||||
|
||||
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
|
||||
|
||||
//
|
||||
// Backend
|
||||
//
|
||||
|
||||
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
|
||||
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
||||
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
|
||||
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
// "offset" refers to the offset of the tensor data for setting/getting data
|
||||
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
GGML_API GGML_CALL void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
|
||||
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
||||
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
||||
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// tensor copy between different backends
|
||||
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// asynchronous copy
|
||||
// the copy is performed after all the currently queued operations in backend_src
|
||||
// backend_dst will wait for the copy to complete before performing other operations
|
||||
// automatic fallback to sync copy if async is not supported
|
||||
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// events
|
||||
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
|
||||
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
|
||||
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
|
||||
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
// Create a backend buffer from an existing pointer
|
||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
#endif
|
||||
|
||||
//
|
||||
// Backend registry
|
||||
//
|
||||
|
||||
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
|
||||
|
||||
GGML_API size_t ggml_backend_reg_get_count(void);
|
||||
GGML_API size_t ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found
|
||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
|
||||
GGML_API const char * ggml_backend_reg_get_name(size_t i);
|
||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
|
||||
|
||||
//
|
||||
// Backend scheduler
|
||||
//
|
||||
|
||||
// The backend scheduler allows for multiple backends to be used together
|
||||
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
|
||||
// The backends are selected based on:
|
||||
// - the backend that supports the operation
|
||||
// - the location of the pre-allocated tensors (e.g. the weights)
|
||||
/*
|
||||
Example usage:
|
||||
|
||||
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
|
||||
// preferrably to run on the same backend as the buffer
|
||||
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
|
||||
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
|
||||
|
||||
// initialize buffers from a max size graph (optional)
|
||||
reserve_graph = build_graph(sched, max_batch_size);
|
||||
|
||||
// manually assign nodes to a backend (optional, should not be needed in most cases)
|
||||
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
|
||||
ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
|
||||
|
||||
ggml_backend_sched_reserve(sched, reserve_graph);
|
||||
|
||||
// compute
|
||||
graph = build_graph(sched);
|
||||
ggml_backend_sched_graph_compute(sched, graph);
|
||||
|
||||
// if there are graph inputs:
|
||||
ggml_backend_sched_reset(sched);
|
||||
ggml_backend_sched_alloc_graph(sched, graph);
|
||||
ggml_backend_tensor_set(input_tensor, ...);
|
||||
ggml_backend_sched_graph_compute(sched, graph);
|
||||
}
|
||||
*/
|
||||
|
||||
struct ggml_backend_sched;
|
||||
typedef struct ggml_backend_sched * ggml_backend_sched_t;
|
||||
|
||||
// when ask == true, the scheduler wants to know if the user wants to observe this node
|
||||
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
|
||||
//
|
||||
// when ask == false, the scheduler is passing the node tensor to the user for observation
|
||||
// if the user returns false, the scheduler will cancel the graph compute
|
||||
//
|
||||
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
||||
// Initialize a backend scheduler
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
|
||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||
|
||||
// Initialize backend buffers from a measure graph
|
||||
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
||||
|
||||
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
|
||||
|
||||
// Get the number of splits of the last graph
|
||||
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
||||
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
|
||||
|
||||
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||
|
||||
// Allocate and compute graph on the backend scheduler
|
||||
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
|
||||
|
||||
// Reset all assignments and allocators - must be called before changing the node backends
|
||||
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
||||
|
||||
// Set a callback to be called for each resulting node during graph compute
|
||||
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
|
||||
|
||||
//
|
||||
// Utils
|
||||
//
|
||||
|
||||
struct ggml_backend_graph_copy {
|
||||
ggml_backend_buffer_t buffer;
|
||||
struct ggml_context * ctx_allocated;
|
||||
struct ggml_context * ctx_unallocated;
|
||||
struct ggml_cgraph * graph;
|
||||
};
|
||||
|
||||
// Copy a graph to a different backend
|
||||
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
|
||||
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
|
||||
|
||||
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
|
||||
|
||||
// Compare the output of two backends
|
||||
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
|
||||
|
||||
// Tensor initialization
|
||||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
49
ml/backend/ggml/ggml-blas.h
vendored
Normal file
@ -0,0 +1,49 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "ggml.h"
#include "ggml-backend.h"


#ifdef __cplusplus
extern "C" {
#endif

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);

GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);

// number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);


#ifdef __cplusplus
}
#endif
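As with the other backends, the Go side would drive this header purely through cgo when the BLAS backend is compiled in. A hedged sketch (not part of this commit; the wrapper name is an assumption) of initializing the BLAS backend and configuring its conversion threads:

package ggml

// #include "ggml-blas.h"
import "C"

// newBLASBackend is illustrative: it initializes the BLAS backend and, if
// that succeeds, sets the number of threads used for float conversion
// (and, for OpenBLAS/BLIS, for the BLAS operations themselves).
func newBLASBackend(threads int) C.ggml_backend_t {
    b := C.ggml_backend_blas_init()
    if b != nil {
        C.ggml_backend_blas_set_n_threads(b, C.int(threads))
    }
    return b
}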
1879
ml/backend/ggml/ggml-common.h
vendored
Normal file
File diff suppressed because it is too large
640
ml/backend/ggml/ggml-cpu-impl.h
vendored
Normal file
640
ml/backend/ggml/ggml-cpu-impl.h
vendored
Normal file
@ -0,0 +1,640 @@
|
||||
/**
|
||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// GGML CPU internal header
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
//#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h> // memcpy
|
||||
#include <math.h> // fabsf
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
#define m512bh(p) p
|
||||
#define m512i(p) p
|
||||
|
||||
#else
|
||||
|
||||
#define m512bh(p) (__m512bh)(p)
|
||||
#define m512i(p) (__m512i)(p)
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
* The bfloat16 floating point format has the following structure:
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───┐
|
||||
* 0b0000000000000000 brain16
|
||||
*
|
||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||
* encoding and decoding numbers becomes relatively straightforward.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───────────────────┐
|
||||
* 0b00000000000000000000000000000000 IEEE binary32
|
||||
*
|
||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌─┴─┐┌─┴──────┐
|
||||
* 0b0000000000000000 IEEE binary16
|
||||
*
|
||||
* @see IEEE 754-2008
|
||||
*/
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This is binary identical with Google Brain float conversion.
|
||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
ggml_bf16_t h;
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.f = s;
|
||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __FMA__
|
||||
#define __FMA__
|
||||
#endif
|
||||
#ifndef __F16C__
|
||||
#define __F16C__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
#ifndef __SSSE3__
|
||||
#define __SSSE3__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#include <sys/prctl.h>
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
typedef uint16_t ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
||||
|
||||
#else
|
||||
|
||||
typedef __fp16 ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
// 32-bit ARM compatibility
|
||||
|
||||
// vaddlvq_s16
|
||||
// vpaddq_s16
|
||||
// vpaddq_s32
|
||||
// vaddvq_s32
|
||||
// vaddvq_f32
|
||||
// vmaxvq_f32
|
||||
// vcvtnq_s32_f32
|
||||
// vzip1_u8
|
||||
// vzip2_u8
|
||||
|
||||
inline static int32_t vaddlvq_s16(int16x8_t v) {
|
||||
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
|
||||
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
||||
return vcombine_s32(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
int32x4_t res;
|
||||
|
||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[0]; res[1] = b[0];
|
||||
res[2] = a[1]; res[3] = b[1];
|
||||
res[4] = a[2]; res[5] = b[2];
|
||||
res[6] = a[3]; res[7] = b[3];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[4]; res[1] = b[4];
|
||||
res[2] = a[5]; res[3] = b[5];
|
||||
res[4] = a[6]; res[5] = b[6];
|
||||
res[6] = a[7]; res[7] = b[7];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// vld1q_s16_x2
|
||||
// vld1q_u8_x2
|
||||
// vld1q_u8_x4
|
||||
// vld1q_s8_x2
|
||||
// vld1q_s8_x4
|
||||
// TODO: double-check these work correctly
|
||||
|
||||
typedef struct ggml_int16x8x2_t {
|
||||
int16x8_t val[2];
|
||||
} ggml_int16x8x2_t;
|
||||
|
||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
||||
ggml_int16x8x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s16(ptr + 0);
|
||||
res.val[1] = vld1q_s16(ptr + 8);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x2_t {
|
||||
uint8x16_t val[2];
|
||||
} ggml_uint8x16x2_t;
|
||||
|
||||
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
||||
ggml_uint8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x4_t {
|
||||
uint8x16_t val[4];
|
||||
} ggml_uint8x16x4_t;
|
||||
|
||||
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
||||
ggml_uint8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
res.val[2] = vld1q_u8(ptr + 32);
|
||||
res.val[3] = vld1q_u8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x2_t {
|
||||
int8x16_t val[2];
|
||||
} ggml_int8x16x2_t;
|
||||
|
||||
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
||||
ggml_int8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x4_t {
|
||||
int8x16_t val[4];
|
||||
} ggml_int8x16x4_t;
|
||||
|
||||
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
ggml_int8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
res.val[2] = vld1q_s8(ptr + 32);
|
||||
res.val[3] = vld1q_s8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
uint8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
#define ggml_uint8x16x2_t uint8x16x2_t
|
||||
#define ggml_uint8x16x4_t uint8x16x4_t
|
||||
#define ggml_int8x16x2_t int8x16x2_t
|
||||
#define ggml_int8x16x4_t int8x16x4_t
|
||||
|
||||
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
||||
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
||||
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif // !defined(__aarch64__)
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
#endif // defined(__ARM_NEON)
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
ggml_fp16_internal_t tmp;
|
||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||
return (float)tmp;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
ggml_fp16_t res;
|
||||
ggml_fp16_internal_t tmp = f;
|
||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#undef bool
|
||||
#define bool _Bool
|
||||
#else
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#if !defined(__riscv)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch64)
|
||||
#if defined(__loongarch_asx)
|
||||
#include <lasxintrin.h>
|
||||
#endif
|
||||
#if defined(__loongarch_sx)
|
||||
#include <lsxintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
|
||||
typedef union {
|
||||
int32_t i;
|
||||
float f;
|
||||
} ft_union;
|
||||
|
||||
/* float type data load instructions */
|
||||
static __m128 __lsx_vreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
|
||||
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||
#else
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
#endif
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
/* the inline asm below is about 12% faster than the lookup method */
|
||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
"frsp %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=f"(f):
|
||||
/* in */ "r"(h));
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=r"(r):
|
||||
/* in */ "f"(f));
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// FP16 <-> FP32
|
||||
// ref: https://github.com/Maratyszcza/FP16
|
||||
|
||||
static inline float fp32_from_bits(uint32_t w) {
|
||||
union {
|
||||
uint32_t as_bits;
|
||||
float as_value;
|
||||
} fp32;
|
||||
fp32.as_bits = w;
|
||||
return fp32.as_value;
|
||||
}
|
||||
|
||||
static inline uint32_t fp32_to_bits(float f) {
|
||||
union {
|
||||
float as_value;
|
||||
uint32_t as_bits;
|
||||
} fp32;
|
||||
fp32.as_value = f;
|
||||
    return fp32.as_bits;
}

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t two_w = w + w;

    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float exp_scale = 0x1.0p-112f;
#else
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float magic_bias = 0.5f;
    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t w = fp32_to_bits(f);
    const uint32_t shl1_w = w + w;
    const uint32_t sign = w & UINT32_C(0x80000000);
    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }

    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#endif // __F16C__

#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)

#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE

// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}

#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif

#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif

#ifdef __cplusplus
}
#endif
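For reference, the two routines above convert between IEEE 754 binary16 and binary32 using only integer bit manipulation. A binary16 value with sign bit s, 5-bit exponent e and 10-bit mantissa m encodes (-1)^s * 2^(e-15) * (1 + m/1024) for normal numbers and (-1)^s * 2^(-14) * (m/1024) for denormals; the exp_offset/exp_scale pair rescales the normal case into the binary32 exponent range, while magic_mask/magic_bias recovers denormals via a float subtraction, so the code never branches on the exponent field. The format description is standard; the mapping to these particular constants is as implemented in the vendored code.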
75
ml/backend/ggml/ggml-cuda.h
vendored
Normal file
@ -0,0 +1,75 @@
/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA)
#define GGML_CUDA_NAME "MUSA"
#define GGML_CUBLAS_NAME "muBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_CUDA_MAX_DEVICES 16

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);

GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

// device buffer
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();

GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
#ifdef __cplusplus
}
#endif
110
ml/backend/ggml/ggml-debug.c
vendored
Normal file
@ -0,0 +1,110 @@
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml-debug.h"
|
||||
|
||||
static int mul(int64_t *dims, int ndims) {
|
||||
int result = 1;
|
||||
for (int i = 0; i < ndims; i++) {
|
||||
result *= dims[i];
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void repeat(char c, int n) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
fprintf(stderr, "%c", c);
|
||||
}
|
||||
}
|
||||
|
||||
static void print_tensor(const void *tensor, void (*cb)(const void *, int),
|
||||
int shape,
|
||||
int64_t *dims, int ndims, int stride,
|
||||
int nitems, int pad) {
|
||||
fprintf(stderr, "[");
|
||||
for (int i = 0; i < dims[0]; i++) {
|
||||
if (i >= nitems && i < dims[0] - nitems) {
|
||||
fprintf(stderr, "... (%lld more), ", dims[0] - 2 * nitems);
|
||||
int skip = dims[0] - 2 * nitems;
|
||||
if (ndims > 1) {
|
||||
stride += mul(dims + 1, ndims - 1) * skip;
|
||||
repeat('\n', ndims - 1);
|
||||
repeat(' ', shape - ndims + 1 + pad);
|
||||
}
|
||||
i += skip - 1;
|
||||
} else if (ndims > 1) {
|
||||
print_tensor(tensor, cb, shape, dims + 1, ndims - 1, stride,
|
||||
nitems, pad);
|
||||
stride += mul(dims + 1, ndims - 1);
|
||||
if (i < dims[0] - 1) {
|
||||
fprintf(stderr, ", ");
|
||||
repeat('\n', ndims - 1);
|
||||
repeat(' ', shape - ndims + 1 + pad);
|
||||
}
|
||||
} else {
|
||||
cb(tensor, stride + i);
|
||||
if (i < dims[0] - 1) {
|
||||
fprintf(stderr, ", ");
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "]");
|
||||
}
|
||||
|
||||
static void print_tensor_f16(const void *tensor, int i) {
|
||||
fprintf(stderr, "%f", ggml_fp16_to_fp32(((const ggml_fp16_t *)tensor)[i]));
|
||||
}
|
||||
|
||||
static void print_tensor_f32(const void *tensor, int i) {
|
||||
fprintf(stderr, "%f", ((const float *)tensor)[i]);
|
||||
}
|
||||
|
||||
static void print_tensor_i32(const void *tensor, int i) {
|
||||
fprintf(stderr, "%d", ((const int32_t *)tensor)[i]);
|
||||
}
|
||||
|
||||
static void ggml_debug_tensor(const struct ggml_tensor *tensor, bool verbose, const char *prefix, int indent) {
|
||||
fprintf(stderr, "%s%s %s (%s): [%lld %lld %lld %lld]\n", prefix, tensor->name,
|
||||
ggml_op_name(tensor->op), ggml_type_name(tensor->type), tensor->ne[0],
|
||||
tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
||||
|
||||
if (!verbose) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < indent; i++) {
|
||||
fprintf(stderr, " ");
|
||||
}
|
||||
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_F16:
|
||||
print_tensor(ggml_get_data(tensor), print_tensor_f16, ggml_n_dims(tensor),
|
||||
(int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
print_tensor(ggml_get_data(tensor), print_tensor_f32, ggml_n_dims(tensor),
|
||||
(int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
|
||||
break;
|
||||
case GGML_TYPE_I32:
|
||||
print_tensor(ggml_get_data(tensor), print_tensor_i32, ggml_n_dims(tensor),
|
||||
(int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "<unsupported type>\n");
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
void ggml_debug(const struct ggml_tensor *tensor, bool verbose) {
|
||||
ggml_debug_tensor(tensor, verbose, ">>> ", 4);
|
||||
|
||||
if (tensor->src[0] != NULL) {
|
||||
ggml_debug_tensor(tensor->src[0], verbose, " ?? ", 4);
|
||||
}
|
||||
|
||||
if (tensor->src[1] != NULL) {
|
||||
ggml_debug_tensor(tensor->src[1], verbose, " ?? ", 4);
|
||||
}
|
||||
}
|
3
ml/backend/ggml/ggml-debug.h
vendored
Normal file
@ -0,0 +1,3 @@
#include "ggml.h"

void ggml_debug(const struct ggml_tensor *tensor, bool verbose);
212
ml/backend/ggml/ggml-impl.h
vendored
Normal file
@ -0,0 +1,212 @@
|
||||
/**
|
||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// static_assert should be a #define, but if it's not,
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
// if C99 - static_assert is noop
|
||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
||||
#ifndef __cplusplus
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// bitset
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
||||
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
|
||||
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
|
||||
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
|
||||
|
||||
static size_t ggml_bitset_size(size_t n) {
|
||||
return (n + BITSET_MASK) >> BITSET_SHR;
|
||||
}
|
||||
|
||||
static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
|
||||
return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
|
||||
}
|
||||
|
||||
static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
|
||||
bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
|
||||
}
|
||||
|
||||
static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
|
||||
bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
|
||||
}
|
||||
|
||||
// hash set
|
||||
|
||||
#define GGML_HASHSET_FULL ((size_t)-1)
|
||||
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
||||
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
|
||||
|
||||
// returns the minimum size for a hash set that can hold min_sz elements
|
||||
size_t ggml_hash_size(size_t min_sz);
|
||||
|
||||
// remove all elements from the hash set
|
||||
void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
|
||||
|
||||
// returns true if key is in the hash set
|
||||
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
||||
|
||||
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
||||
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
||||
|
||||
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
|
||||
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
||||
|
||||
// return index, asserts if table is full
|
||||
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
||||
|
||||
// hash function for ggml_tensor
|
||||
static inline size_t ggml_hash(const struct ggml_tensor * p) {
|
||||
// the last 4 bits are always zero due to alignment
|
||||
return (size_t)(uintptr_t)p >> 4;
|
||||
}
|
||||
|
||||
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
||||
size_t h = ggml_hash(key) % hash_set->size;
|
||||
|
||||
// linear probing
|
||||
size_t i = h;
|
||||
while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
|
||||
i = (i + 1) % hash_set->size;
|
||||
if (i == h) {
|
||||
// visited all hash table entries -> not found
|
||||
return GGML_HASHSET_FULL;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
||||
size_t i = ggml_hash_find(hash_set, key);
|
||||
return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
|
||||
}
|
||||
|
||||
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
||||
size_t h = ggml_hash(key) % hash_set->size;
|
||||
|
||||
// linear probing
|
||||
size_t i = h;
|
||||
do {
|
||||
if (!ggml_bitset_get(hash_set->used, i)) {
|
||||
ggml_bitset_set(hash_set->used, i);
|
||||
hash_set->keys[i] = key;
|
||||
return i;
|
||||
}
|
||||
if (hash_set->keys[i] == key) {
|
||||
return GGML_HASHSET_ALREADY_EXISTS;
|
||||
}
|
||||
i = (i + 1) % hash_set->size;
|
||||
} while (i != h);
|
||||
|
||||
// visited all hash table entries -> not found
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
||||
size_t h = ggml_hash(key) % hash_set->size;
|
||||
|
||||
// linear probing
|
||||
size_t i = h;
|
||||
do {
|
||||
if (!ggml_bitset_get(hash_set->used, i)) {
|
||||
ggml_bitset_set(hash_set->used, i);
|
||||
hash_set->keys[i] = key;
|
||||
return i;
|
||||
}
|
||||
if (hash_set->keys[i] == key) {
|
||||
return i;
|
||||
}
|
||||
i = (i + 1) % hash_set->size;
|
||||
} while (i != h);
|
||||
|
||||
// visited all hash table entries -> not found
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
// computation graph
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
8325
ml/backend/ggml/ggml-metal-embed.metal
vendored
Normal file
File diff suppressed because it is too large
88
ml/backend/ggml/ggml-metal.h
vendored
Normal file
@ -0,0 +1,88 @@
|
||||
/**
|
||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
// An interface allowing to compute ggml_cgraph with Metal
|
||||
//
|
||||
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
|
||||
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
|
||||
//
|
||||
// How it works?
|
||||
//
|
||||
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
|
||||
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
|
||||
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
|
||||
//
|
||||
// You only need to make sure that all memory buffers that you used during the graph creation
|
||||
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
|
||||
// used during the graph evaluation to determine the arguments of the compute kernels.
|
||||
//
|
||||
// Synchronization between device and host memory (for example for input and output tensors)
|
||||
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
struct ggml_tensor;
|
||||
struct ggml_cgraph;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// backend API
|
||||
// user-code should use only these functions
|
||||
//
|
||||
|
||||
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
||||
|
||||
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
|
||||
// helper to check if the device supports a specific family
|
||||
// ideally, the user code should be doing these checks
|
||||
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
||||
|
||||
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
||||
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
6445
ml/backend/ggml/ggml-metal.metal
vendored
Normal file
File diff suppressed because it is too large
3669
ml/backend/ggml/ggml-metal_darwin_arm64.m
vendored
Normal file
File diff suppressed because it is too large
6
ml/backend/ggml/ggml-metal_darwin_arm64.s
Normal file
@ -0,0 +1,6 @@
.section __DATA, __ggml_metallib
.globl _ggml_metallib_start
_ggml_metallib_start:
.incbin "ggml-metal-embed.metal"
.globl _ggml_metallib_end
_ggml_metallib_end:
15778
ml/backend/ggml/ggml-quants.c
vendored
Normal file
File diff suppressed because it is too large
173
ml/backend/ggml/ggml-quants.h
vendored
Normal file
@ -0,0 +1,173 @@
|
||||
/**
|
||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
void iq2xs_init_impl(enum ggml_type type);
|
||||
void iq2xs_free_impl(enum ggml_type type);
|
||||
void iq3xs_init_impl(int grid_size);
|
||||
void iq3xs_free_impl(int grid_size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
23355
ml/backend/ggml/ggml.c
vendored
Normal file
File diff suppressed because it is too large
2595
ml/backend/ggml/ggml.h
vendored
Normal file
File diff suppressed because it is too large
14
ml/backend/ggml/sgemm.h
vendored
Normal file
@ -0,0 +1,14 @@
#pragma once
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif

bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
                     const void *, int64_t, void *, int64_t, int, int,
                     int, int, int);

#ifdef __cplusplus
}
#endif
11
ml/nn/convolution.go
Normal file
@ -0,0 +1,11 @@
package nn

import "github.com/ollama/ollama/ml"

type Conv2D struct {
	Weight ml.Tensor `ggml:"weight"`
}

func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
	return m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
}
11
ml/nn/embedding.go
Normal file
@ -0,0 +1,11 @@
package nn

import "github.com/ollama/ollama/ml"

type Embedding struct {
	Weight ml.Tensor `ggml:"weight"`
}

func (m *Embedding) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
	return m.Weight.Rows(ctx, hiddenState)
}
17
ml/nn/linear.go
Normal file
@ -0,0 +1,17 @@
package nn

import "github.com/ollama/ollama/ml"

type Linear struct {
	Weight ml.Tensor `ggml:"weight"`
	Bias   ml.Tensor `ggml:"bias"`
}

func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
	t = m.Weight.Mulmat(ctx, t)
	if m.Bias != nil {
		t = t.Add(ctx, m.Bias)
	}

	return t
}
33
ml/nn/normalization.go
Normal file
@ -0,0 +1,33 @@
package nn

import (
	"github.com/ollama/ollama/ml"
)

type LayerNorm struct {
	Weight ml.Tensor `ggml:"weight"`
	Bias   ml.Tensor `ggml:"bias"`
}

func (m *LayerNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
	t = t.Norm(ctx, eps).Mul(ctx, m.Weight)
	if m.Bias != nil {
		t = t.Add(ctx, m.Bias)
	}

	return t
}

type RMSNorm struct {
	Weight ml.Tensor `ggml:"weight"`
	Bias   ml.Tensor `ggml:"bias"`
}

func (m *RMSNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
	t = t.RMSNorm(ctx, eps).Mul(ctx, m.Weight)
	if m.Bias != nil {
		t = t.Add(ctx, m.Bias)
	}

	return t
}
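The two modules above differ only in whether the mean is removed before scaling: LayerNorm standardizes each vector to zero mean and unit variance before applying the learned weight and bias, while RMSNorm only divides by the root mean square. A minimal scalar sketch of the RMSNorm step, not part of the diff, assuming the Tensor RMSNorm op follows the standard formula x / sqrt(mean(x^2) + eps):

package main

import (
	"fmt"
	"math"
)

// rmsNorm mirrors what RMSNorm.Forward is assumed to compute per row:
// divide by the root mean square (plus eps), then scale by the learned weight.
func rmsNorm(x, weight []float32, eps float32) []float32 {
	var sumsq float64
	for _, v := range x {
		sumsq += float64(v) * float64(v)
	}
	scale := 1.0 / math.Sqrt(sumsq/float64(len(x))+float64(eps))

	out := make([]float32, len(x))
	for i, v := range x {
		out[i] = float32(float64(v)*scale) * weight[i]
	}
	return out
}

func main() {
	x := []float32{1, 2, 3, 4}
	w := []float32{1, 1, 1, 1}
	fmt.Println(rmsNorm(x, w, 1e-6))
}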
154
model/cmd/main.go
Normal file
@ -0,0 +1,154 @@
package main

import (
	"errors"
	"flag"
	"fmt"
	"image"
	"io"
	"log/slog"
	"os"
	"path/filepath"

	"github.com/ollama/ollama/cache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	_ "github.com/ollama/ollama/model/llama"
	_ "github.com/ollama/ollama/model/mllama"
	"github.com/ollama/ollama/sample"
)

var args struct {
	n     int
	debug bool
	image string
	cache bool
}

func temp() error {
	flag.IntVar(&args.n, "n", 10, "number of samples")
	flag.BoolVar(&args.debug, "debug", false, "enable debug logging")
	flag.StringVar(&args.image, "image", "", "path to image file")
	flag.BoolVar(&args.cache, "cache", false, "enable KV cache")

	flag.Parse()

	if len(flag.Args()) != 1 {
		return fmt.Errorf("usage: %s path/to/file <prompt\n", filepath.Base(os.Args[0]))
	}

	level := slog.LevelInfo
	if args.debug {
		level = slog.LevelDebug
	}

	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
		Level:     level,
		AddSource: true,
		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
			if attr.Key == slog.SourceKey {
				source := attr.Value.Any().(*slog.Source)
				source.File = filepath.Base(source.File)
			}

			return attr
		},
	})))

	m, err := model.New(flag.Arg(0))
	if err != nil {
		return err
	}

	prompt, err := io.ReadAll(os.Stdin)
	if err != nil {
		return err
	}

	inputIDs, err := m.(model.TextProcessor).Encode(string(prompt))
	if err != nil {
		return err
	}

	var opts []model.OptionsFunc
	if args.cache {
		opts = append(opts, model.WithCache(&cache.Simple{
			Capacity: 2048,
			DType:    ml.DTypeF32,
		}))
	}

	if args.image != "" {
		if err := func() error {
			f, err := os.Open(args.image)
			if err != nil {
				return err
			}
			defer f.Close()

			img, _, err := image.Decode(f)
			if err != nil {
				return err
			}

			opts = append(opts, model.WithImage(img))
			return nil
		}(); err != nil {
			return err
		}
	}

	var offset int
	for range args.n {
		logit, err := model.Forward(m, append(opts, model.WithInputIDs(inputIDs), model.WithOffset(offset))...)
		if err != nil {
			return err
		}

		f32s := logit.Floats()
		f64s := make([]float64, len(f32s))
		for i, f32 := range f32s {
			f64s[i] = float64(f32)
		}

		// do sampling
		f64s, err = sample.Sample(f64s, sample.Greedy())
		if err != nil {
			return err
		}

		var outputIDs []int32
		for _, f64 := range f64s {
			if !m.(model.TextProcessor).Is(uint32(f64), model.SpecialEOS) {
				outputIDs = append(outputIDs, int32(f64))
			}
		}

		if len(outputIDs) == 0 {
			break
		}

		s, err := m.(model.TextProcessor).Decode(outputIDs)
		if errors.Is(err, io.EOF) {
			break
		} else if err != nil {
			return err
		}

		fmt.Print(s)

		inputIDs = append(inputIDs, outputIDs...)
		if args.cache {
			offset = len(inputIDs) - 1
		}
	}

	return nil
}

func main() {
	if err := temp(); err != nil {
		fmt.Println("err", err)
		os.Exit(1)
	}
}
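The loop above reads a prompt from stdin, runs the forward pass, converts the final logits row to float64 and hands it to sample.Sample with sample.Greedy, then decodes and prints the chosen tokens until EOS or the -n limit. Greedy sampling reduces to an argmax over the logits; a standalone sketch, not part of the diff and assuming sample.Greedy simply picks the highest-logit token id:

package main

import "fmt"

// argmax is what greedy sampling reduces to when no temperature or
// top-k/top-p filtering is applied: pick the token with the largest logit.
func argmax(logits []float64) int {
	best := 0
	for i, v := range logits {
		if v > logits[best] {
			best = i
		}
	}
	return best
}

func main() {
	logits := []float64{-1.2, 3.4, 0.7}
	fmt.Println(argmax(logits)) // prints 1
}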
147
model/llama/model.go
Normal file
@ -0,0 +1,147 @@
package llama

import (
	"math"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/model"
)

type Options struct {
	RopeFactors ml.Tensor `ggml:"rope_freqs.weight"`
	hiddenSize, numHeads, numKVHeads int64
	eps, ropeBase, ropeScale float32
	ropeDim uint32
}

type Model struct {
	model.Base

	TextProcessor

	TokenEmbedding *nn.Embedding `ggml:"token_embd"`
	Layers         []Layer       `ggml:"blk"`
	OutputNorm     *nn.RMSNorm   `ggml:"output_norm"`
	Output         *nn.Linear    `ggml:"output"`

	*Options
}

func New(c ml.Config) (model.Model, error) {
	return &Model{
		TextProcessor: newTextProcessor(c),
		Layers:        make([]Layer, c.Uint("block_count")),
		Options: &Options{
			hiddenSize: int64(c.Uint("embedding_length")),
			numHeads:   int64(c.Uint("attention.head_count")),
			numKVHeads: int64(c.Uint("attention.head_count_kv")),
			eps:        c.Float("attention.layer_norm_rms_epsilon"),
			ropeBase:   c.Float("rope.freq_base"),
			ropeScale:  c.Float("rope.freq_scale", 1),
			ropeDim:    c.Uint("rope.dimension_count"),
		},
	}, nil
}

type SelfAttention struct {
	Query  *nn.Linear `ggml:"attn_q"`
	Key    *nn.Linear `ggml:"attn_k"`
	Value  *nn.Linear `ggml:"attn_v"`
	Output *nn.Linear `ggml:"attn_output"`
}

func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
	batchSize := hiddenState.Dim(1)
	headDim := opts.hiddenSize / opts.numHeads

	q := sa.Query.Forward(ctx, hiddenState)
	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
	q = q.Rope(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

	k := sa.Key.Forward(ctx, hiddenState)
	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
	k = k.Rope(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)

	v := sa.Value.Forward(ctx, hiddenState)
	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

	k, v = cache.Put(ctx, k, v, cache.Options)

	q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	kq := k.Mulmat(ctx, q)
	kq = kq.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
	kq = kq.Softmax(ctx)

	kqv := v.Mulmat(ctx, kq)
	kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)

	return sa.Output.Forward(ctx, kqv)
}

type MLP struct {
	Up   *nn.Linear `ggml:"ffn_up"`
	Down *nn.Linear `ggml:"ffn_down"`
	Gate *nn.Linear `ggml:"ffn_gate"`
}

func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
	return mlp.Down.Forward(ctx, hiddenState)
}

type Layer struct {
	AttentionNorm *nn.RMSNorm `ggml:"attn_norm"`
	SelfAttention *SelfAttention
	MLPNorm       *nn.RMSNorm `ggml:"ffn_norm"`
	MLP           *MLP
}

func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
	residual := hiddenState

	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
	return hiddenState.Add(ctx, residual)
}

func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
	inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
	if err != nil {
		return nil, err
	}

	positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
	if err != nil {
		return nil, err
	}

	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)

	for i, layer := range m.Layers {
		hiddenState = layer.Forward(ctx, hiddenState, positions, opts.Cache.Sub(i), m.Options)
	}

	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
	hiddenState = m.Output.Forward(ctx, hiddenState)

	outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
	if err != nil {
		return nil, err
	}

	return hiddenState.Rows(ctx, outputs), nil
}

func init() {
	model.Register("llama", New)
}
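SelfAttention.Forward above is the usual attention pipeline expressed in tensor ops: project to Q, K and V, reshape into heads (with separate head counts for K and V), apply rotary position embeddings, update the KV cache, compute softmax(K^T Q / sqrt(headDim)) weighted over V, and project back out. A single-query, single-head sketch of that scaled dot-product step on plain slices, not part of the diff and purely illustrative:

package main

import (
	"fmt"
	"math"
)

// attend computes one query attending over a sequence of key/value vectors:
// a softmax(q.k_i / sqrt(d))-weighted sum of the v_i. This is the scalar
// analogue of the Mulmat/Scale/Softmax/Mulmat chain in SelfAttention.Forward.
func attend(q []float64, keys, values [][]float64) []float64 {
	d := float64(len(q))
	scores := make([]float64, len(keys))
	maxScore := math.Inf(-1)
	for i, k := range keys {
		var dot float64
		for j := range q {
			dot += q[j] * k[j]
		}
		scores[i] = dot / math.Sqrt(d)
		if scores[i] > maxScore {
			maxScore = scores[i]
		}
	}

	// softmax over the scores, max-subtracted for numerical stability
	var sum float64
	for i := range scores {
		scores[i] = math.Exp(scores[i] - maxScore)
		sum += scores[i]
	}

	out := make([]float64, len(values[0]))
	for i, v := range values {
		w := scores[i] / sum
		for j := range v {
			out[j] += w * v[j]
		}
	}
	return out
}

func main() {
	q := []float64{1, 0}
	keys := [][]float64{{1, 0}, {0, 1}}
	values := [][]float64{{1, 2}, {3, 4}}
	fmt.Println(attend(q, keys, values))
}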
25
model/llama/process_text.go
Normal file
@ -0,0 +1,25 @@
package llama

import (
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
)

type TextProcessor struct {
	model.BytePairEncoding
}

func newTextProcessor(c ml.Config) TextProcessor {
	return TextProcessor{
		BytePairEncoding: model.BytePairEncoding{
			Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
			Vocabulary: &model.Vocabulary{
				Values: c.Strings("tokenizer.ggml.tokens"),
				Types:  c.Uints("tokenizer.ggml.token_type"),
				Merges: c.Strings("tokenizer.ggml.merges"),
				BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
				EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
			},
		},
	}
}
90
model/mllama/model.go
Normal file
@ -0,0 +1,90 @@
package mllama

import (
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/model"
)

type Model struct {
	model.Base

	*VisionModel `ggml:"v,vision"`
	*TextModel

	Projector *nn.Linear `ggml:"mm.0"`

	ImageProcessor
	TextProcessor
}

func New(c ml.Config) (model.Model, error) {
	return &Model{
		ImageProcessor: newImageProcessor(c),
		VisionModel:    newVisionModel(c),
		TextProcessor:  newTextProcessor(c),
		TextModel:      newTextModel(c),
	}, nil
}

func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
	var crossAttentionStates ml.Tensor
	if opts.Images != nil {
		f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(opts.Images[0])
		if err != nil {
			return nil, err
		}

		pixelValues, err := ctx.FromFloatSlice(f32s,
			m.ImageProcessor.imageSize,
			m.ImageProcessor.imageSize,
			m.ImageProcessor.numChannels,
			m.ImageProcessor.maxNumTiles,
		)
		if err != nil {
			return nil, err
		}

		aspectRatio, err := ctx.FromIntSlice([]int32{int32(aspectRatioID)}, 1)
		if err != nil {
			return nil, err
		}

		positions := make([]int32, 1601)
		for i := range positions {
			positions[i] = int32(i)
		}

		positionIDs, err := ctx.FromIntSlice(positions, len(positions))
		if err != nil {
			return nil, err
		}

		crossAttentionStates = m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
		crossAttentionStates = m.Projector.Forward(ctx, crossAttentionStates)
	}

	inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
	if err != nil {
		return nil, err
	}

	positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
	if err != nil {
		return nil, err
	}

	// TODO: attention mask, cross attention mask
	hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, opts.Cache)

	outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
	if err != nil {
		return nil, err
	}

	return hiddenState.Rows(ctx, outputs), nil
}

func init() {
	model.Register("mllama", New)
}
225
model/mllama/model_text.go
Normal file
@ -0,0 +1,225 @@
|
||||
package mllama
|
||||
|
||||
import (
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
)
|
||||
|
||||
type TextSelfAttention struct {
|
||||
Query *nn.Linear `ggml:"attn_q"`
|
||||
Key *nn.Linear `ggml:"attn_k"`
|
||||
Value *nn.Linear `ggml:"attn_v"`
|
||||
Output *nn.Linear `ggml:"attn_output"`
|
||||
}
|
||||
|
||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
batchSize := hiddenState.Dim(1)
|
||||
headDim := opts.hiddenSize / opts.numHeads
|
||||
|
||||
query := sa.Query.Forward(ctx, hiddenState)
|
||||
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
|
||||
query = query.Rope(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
key := sa.Key.Forward(ctx, hiddenState)
|
||||
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
key = key.Rope(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
|
||||
|
||||
value := sa.Value.Forward(ctx, hiddenState)
|
||||
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
key, value = cache.Put(ctx, key, value, cache.Options)
|
||||
|
||||
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
scores := key.Mulmat(ctx, query)
|
||||
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
|
||||
|
||||
if mask != nil {
|
||||
scores = scores.Add(ctx, mask)
|
||||
}
|
||||
|
||||
	scores = scores.Softmax(ctx)

	attention := value.Mulmat(ctx, scores)
	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)

	return sa.Output.Forward(ctx, attention)
}

type TextMLP struct {
	Up *nn.Linear `ggml:"ffn_up"`
	Down *nn.Linear `ggml:"ffn_down"`
	Gate *nn.Linear `ggml:"ffn_gate"`
}

func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextModelOptions) ml.Tensor {
	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
	return mlp.Down.Forward(ctx, hiddenState)
}

type TextSelfAttentionDecoderLayer struct {
	AttentionNorm *nn.RMSNorm `ggml:"attn_norm"`
	SelfAttention *TextSelfAttention

	MLPNorm *nn.RMSNorm `ggml:"ffn_norm"`
	MLP *TextMLP
}

func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
	residual := hiddenState

	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
	return hiddenState.Add(ctx, residual)
}

type TextCrossAttention struct {
	QueryNorm *nn.RMSNorm `ggml:"cross_attn_q_norm"`
	Query *nn.Linear `ggml:"cross_attn_q_proj"`
	KeyNorm *nn.RMSNorm `ggml:"cross_attn_k_norm"`
	Key *nn.Linear `ggml:"cross_attn_k_proj"`
	Value *nn.Linear `ggml:"cross_attn_v_proj"`
	Output *nn.Linear `ggml:"cross_attn_o_proj"`
}

func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
	batchSize := hiddenState.Dim(1)
	headDim := opts.hiddenSize / opts.numHeads
	numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)

	query := ca.Query.Forward(ctx, hiddenState)
	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
	query = ca.QueryNorm.Forward(ctx, query, opts.eps)

	key := ca.Key.Forward(ctx, crossAttentionStates)
	key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
	key = ca.KeyNorm.Forward(ctx, key, opts.eps)

	value := ca.Value.Forward(ctx, crossAttentionStates)
	value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)

	// TODO cache key, value

	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	scores := key.Mulmat(ctx, query)
	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
	scores = scores.Softmax(ctx)

	attention := value.Mulmat(ctx, scores)
	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)

	return ca.Output.Forward(ctx, attention)
}

type TextCrossAttentionDecoderLayer struct {
	AttentionNorm *nn.RMSNorm `ggml:"attn_norm"`
	CrossAttention *TextCrossAttention
	AttentionGate ml.Tensor `ggml:"cross_attn_attn_gate"`

	MLPNorm *nn.RMSNorm `ggml:"ffn_norm"`
	MLP *TextMLP
	MLPGate ml.Tensor `ggml:"cross_attn_mlp_gate"`
}

func (d TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
	residual := hiddenState

	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, opts)
	hiddenState = hiddenState.Mul(ctx, d.AttentionGate.Tanh(ctx))
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
	hiddenState = hiddenState.Mul(ctx, d.MLPGate.Tanh(ctx))
	return hiddenState.Add(ctx, residual)
}

type TextDecoderLayer interface {
	Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor
}

type TextDecoder struct {
	Layers []TextDecoderLayer
}

func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
	for i, layer := range d.Layers {
		if !slices.Contains(opts.crossAttentionLayers, uint32(i)) || crossAttentionStates != nil {
			hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache.Sub(i), opts)
		}
	}

	return hiddenState
}

type TextModelOptions struct {
	RopeFactors ml.Tensor `ggml:"rope_freqs.weight"`

	hiddenSize, numHeads, numKVHeads int64
	eps, ropeBase, ropeScale float32
	ropeDim uint32

	crossAttentionLayers []uint32
}

type TextModel struct {
	TokenEmbedding *nn.Embedding `ggml:"token_embd"`
	Transformer *TextDecoder `ggml:"blk"`
	OutputNorm *nn.RMSNorm `ggml:"output_norm"`
	Output *nn.Linear `ggml:"output"`

	*TextModelOptions
}

func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache) ml.Tensor {
	hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
	return m.Output.Forward(ctx, hiddenState)
}

func newTextModel(c ml.Config) *TextModel {
	var decoderLayers []TextDecoderLayer
	for i := range c.Uint("block_count") {
		var textDecoderLayer TextDecoderLayer
		if slices.Contains(c.Uints("attention.cross_attention_layers"), i) {
			textDecoderLayer = &TextCrossAttentionDecoderLayer{}
		} else {
			textDecoderLayer = &TextSelfAttentionDecoderLayer{}
		}

		decoderLayers = append(decoderLayers, textDecoderLayer)
	}

	return &TextModel{
		Transformer: &TextDecoder{Layers: decoderLayers},
		TextModelOptions: &TextModelOptions{
			hiddenSize: int64(c.Uint("embedding_length")),
			numHeads: int64(c.Uint("attention.head_count")),
			numKVHeads: int64(c.Uint("attention.head_count_kv")),
			eps: c.Float("attention.layer_norm_rms_epsilon"),
			ropeBase: c.Float("rope.freq_base"),
			ropeScale: c.Float("rope.freq_scale", 1),
			ropeDim: c.Uint("rope.dimension_count"),
			crossAttentionLayers: c.Uints("attention.cross_attention_layers"),
		},
	}
}
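A minimal sketch of how the layer dispatch in TextDecoder.Forward behaves, assuming a hypothetical 8-layer model whose attention.cross_attention_layers is [3, 7] (the indices here are made up for illustration; the real list is read from the GGUF): self-attention layers always run, while cross-attention layers are skipped whenever no vision states are supplied, e.g. for a text-only prompt.

package main

import (
	"fmt"
	"slices"
)

func main() {
	crossAttentionLayers := []uint32{3, 7} // hypothetical config value
	for i := range 8 {
		isCross := slices.Contains(crossAttentionLayers, uint32(i))
		// a cross-attention layer only runs when crossAttentionStates != nil
		fmt.Printf("layer %d: cross=%v runsWithoutImage=%v\n", i, isCross, !isCross)
	}
}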
234
model/mllama/model_vision.go
Normal file
@ -0,0 +1,234 @@
package mllama

import (
	"math"
	"slices"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

var batchSize int64 = 1

type VisionSelfAttention struct {
	Query *nn.Linear `ggml:"attn_q"`
	Key *nn.Linear `ggml:"attn_k"`
	Value *nn.Linear `ggml:"attn_v"`
	Output *nn.Linear `ggml:"attn_out"`

	Gate ml.Tensor `ggml:"attn_gate"`
}

func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	headDim := opts.hiddenSize / opts.numHeads

	query := sa.Query.Forward(ctx, hiddenState)
	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

	key := sa.Key.Forward(ctx, hiddenState)
	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

	value := sa.Value.Forward(ctx, hiddenState)
	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	scores := key.Mulmat(ctx, query)
	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
	scores = scores.Softmax(ctx)

	attention := value.Mulmat(ctx, scores)
	attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)

	hiddenState = sa.Output.Forward(ctx, attention)
	if sa.Gate != nil {
		hiddenState = hiddenState.Mul(ctx, sa.Gate)
	}

	return hiddenState
}

type VisionMLP struct {
	Down *nn.Linear `ggml:"ffn_down"`
	Up *nn.Linear `ggml:"ffn_up"`

	Gate ml.Tensor `ggml:"ffn_gate"`
}

func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
	hiddenState = mlp.Up.Forward(ctx, hiddenState)
	if mlp.Gate != nil {
		hiddenState = hiddenState.Mul(ctx, mlp.Gate)
	}

	return hiddenState
}

type VisionEncoderLayer struct {
	AttentionNorm *nn.LayerNorm `ggml:"ln1"`
	SelfAttention *VisionSelfAttention

	MLPNorm *nn.LayerNorm `ggml:"ln2"`
	MLP *VisionMLP
}

func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	residual := hiddenState

	// self attention
	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
	hiddenState = hiddenState.Add(ctx, residual)
	residual = hiddenState

	// feed forward
	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
	return hiddenState.Add(ctx, residual)
}

type VisionEncoder struct {
	Layers []VisionEncoderLayer
}

func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermediateLayersIndices []uint32, opts *VisionModelOptions) (ml.Tensor, []ml.Tensor) {
	var intermediateHiddenStates []ml.Tensor
	for i, layer := range e.Layers {
		if slices.Contains(intermediateLayersIndices, uint32(i)) {
			intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int64{1}, hiddenState.Shape()...)...))
		}

		hiddenState = layer.Forward(ctx, hiddenState, opts)
	}

	return hiddenState, intermediateHiddenStates
}

type PrecomputedAspectRatioEmbedding struct {
	Embedding *nn.Embedding
	Gate ml.Tensor `ggml:"gate"`
}

func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
	if e.Gate != nil {
		embeddings = embeddings.Mul(ctx, e.Gate)
	}

	return hiddenState.Add(ctx, embeddings)
}

type PrecomputedPositionEmbedding struct {
	PositionEmbedding *nn.Embedding `ggml:"position_embd"`
	PositionEmbeddingGate ml.Tensor `ggml:"position_embd.gate"`

	TilePositionEmbedding *nn.Embedding `ggml:"tile_position_embd"`
	TilePositionEmbeddingGate ml.Tensor `ggml:"tile_position_embd.gate"`
}

func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int64, opts *VisionModelOptions) ml.Tensor {
	positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
	if e.PositionEmbeddingGate != nil {
		positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
	}

	hiddenState = hiddenState.Add(ctx, positionEmbedding)

	tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
	if e.TilePositionEmbeddingGate != nil {
		tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
	}

	return hiddenState.Add(ctx, tilePositionEmbedding)
}

type VisionModelOptions struct {
	hiddenSize, numHeads, numTiles int64
	imageSize, patchSize int
	eps float32

	intermediateLayersIndices []uint32
}

type VisionModel struct {
	PatchEmbeddings *nn.Conv2D `ggml:"patch_embd"`

	PreTilePositionEmbedding *PrecomputedAspectRatioEmbedding `ggml:"pre_tile_position_embd"`
	PostTilePositionEmbedding *PrecomputedAspectRatioEmbedding `ggml:"post_tile_position_embd"`
	PositionEmbedding *PrecomputedPositionEmbedding

	PreLayerNorm *nn.LayerNorm `ggml:"pre_ln"`
	PostLayerNorm *nn.LayerNorm `ggml:"post_ln"`
	ClassEmbedding ml.Tensor `ggml:"class_embd"`

	Transformer *VisionEncoder `ggml:"blk"`
	GlobalTransformer *VisionEncoder `ggml:"global.blk"`

	*VisionModelOptions
}

func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRatioIDs ml.Tensor) ml.Tensor {
	numPatches := int64((m.imageSize / m.patchSize) * (m.imageSize / m.patchSize))
	numPositions := numPatches
	if m.ClassEmbedding != nil {
		numPositions++
	}

	hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
	hiddenState = m.ClassEmbedding.Stack(ctx, 2, slices.Repeat([]ml.Tensor{m.ClassEmbedding}, int(m.numTiles)-1)...).Concat(ctx, hiddenState, 1)

	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
	hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)

	numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
	hiddenState = hiddenState.Pad(ctx, 0, numPaddingPatches, 0, 0)

	hiddenState = hiddenState.Reshape(ctx, hiddenState.Dim(0), hiddenState.Dim(1)*hiddenState.Dim(2), batchSize)
	hiddenState, intermediateHiddenStates := m.Transformer.Forward(ctx, hiddenState, m.intermediateLayersIndices, m.VisionModelOptions)

	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)

	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)

	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
	hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)

	hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
	hiddenStates = hiddenStates.Reshape(ctx, int64(len(intermediateHiddenStates))*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
	hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)

	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
	hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
	return hiddenState.Concat(ctx, hiddenStates, 0)
}

func newVisionModel(c ml.Config) *VisionModel {
	return &VisionModel{
		Transformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count"))},
		GlobalTransformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.global.block_count"))},

		VisionModelOptions: &VisionModelOptions{
			hiddenSize: int64(c.Uint("vision.embedding_length")),
			numHeads: int64(c.Uint("vision.attention.head_count")),
			numTiles: int64(c.Uint("vision.max_num_tiles")),

			imageSize: int(c.Uint("vision.image_size")),
			patchSize: int(c.Uint("vision.patch_size")),

			eps: c.Float("vision.attention.layer_norm_epsilon"),

			intermediateLayersIndices: c.Uints("vision.intermediate_layers_indices"),
		},
	}
}
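A worked sketch of the patch-count arithmetic in VisionModel.Forward above. The concrete numbers are an assumption for illustration only (vision.image_size 560 and vision.patch_size 14 are plausible for this model family, but the real values are read from the GGUF at runtime); the padding expression is copied verbatim from the forward pass.

package main

import "fmt"

func main() {
	// hypothetical values standing in for vision.image_size and vision.patch_size
	imageSize, patchSize := 560, 14
	numPatches := (imageSize / patchSize) * (imageSize / patchSize) // 40 * 40 = 1600
	numPositions := numPatches + 1                                  // +1 for the class embedding
	numPaddingPatches := 8 - (numPositions%8)%8                     // same expression as in Forward
	fmt.Println(numPatches, numPositions, numPositions+numPaddingPatches) // 1600 1601 1608
}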
240
model/mllama/process_image.go
Normal file
@ -0,0 +1,240 @@
package mllama

import (
	"image"
	"image/color"
	"math"
	"slices"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/ml"
)

type ImageProcessor struct {
	imageSize, numChannels, maxNumTiles int
}

func newImageProcessor(c ml.Config) ImageProcessor {
	return ImageProcessor{
		imageSize: int(c.Uint("vision.image_size")),
		numChannels: int(c.Uint("vision.num_channels")),
		maxNumTiles: int(c.Uint("vision.max_num_tiles")),
	}
}

func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
	ratios := []image.Point{}

	for w := range maxTiles {
		for h := range maxTiles {
			if (w+1)*(h+1) <= maxTiles {
				ratios = append(ratios, image.Point{w + 1, h + 1})
			}
		}
	}

	return ratios
}

func (p *ImageProcessor) clip(a, a_min, a_max int) int {
	if a < a_min {
		return a_min
	} else if a > a_max {
		return a_max
	}

	return a
}

func (p *ImageProcessor) getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
	targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)

	scaleWidth := float64(targetWidth) / float64(imageSize.X)
	scaleHeight := float64(targetHeight) / float64(imageSize.Y)

	var w, h int

	if scaleWidth < scaleHeight {
		w = targetWidth
		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
	} else {
		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
		h = targetHeight
	}

	return image.Point{w, h}
}

func (p *ImageProcessor) getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
	possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
	possibleCanvasSizes := []image.Point{}
	for _, pta := range possibleTileArrangements {
		possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
	}

	scales := []float64{}

	for _, pcs := range possibleCanvasSizes {
		scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
		scaleWidth := float64(pcs.X) / float64(imageSize.X)

		if scaleWidth > scaleHeight {
			scales = append(scales, scaleHeight)
		} else {
			scales = append(scales, scaleWidth)
		}
	}

	var minUpscale float64
	var maxDownscale float64
	var upscale bool

	for _, s := range scales {
		if s > 1.0 {
			upscale = true
			if minUpscale == 0 {
				minUpscale = s
			} else {
				minUpscale = math.Min(minUpscale, s)
			}
		} else {
			maxDownscale = math.Max(maxDownscale, s)
		}
	}

	selectedScale := maxDownscale
	if upscale {
		selectedScale = minUpscale
	}

	var selectedCanvas image.Point
	for n, pcs := range possibleCanvasSizes {
		if scales[n] == selectedScale {
			// choose the smallest possible canvas
			if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
				selectedCanvas = pcs
			} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
				selectedCanvas = pcs
			}
		}
	}
	return selectedCanvas
}

func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
	b := img.Bounds()
	width := b.Max.X - b.Min.X
	height := b.Max.Y - b.Min.Y
	tileHeight := height / numTilesSize.Y
	tileWidth := width / numTilesSize.X

	images := []image.Image{}

	for h := range numTilesSize.Y {
		for w := range numTilesSize.X {
			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
			images = append(images, img.(interface {
				SubImage(image.Rectangle) image.Image
			}).SubImage(rect))
		}
	}

	return images
}

// remove the "alpha" channel by drawing over a prefilled image
//
//nolint:unused
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
	dst := image.NewRGBA(img.Bounds())

	white := color.RGBA{255, 255, 255, 255}
	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)

	return dst
}

func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
	b := img.Bounds()
	tileSize := outputSize.Y

	canvasSize := p.getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
	newSize := p.getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)

	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))

	// scaling choices:
	//   NearestNeighbor fast, blocky output
	//   ApproxBiLinear fast, medium quality
	//   BiLinear slow, high quality
	//   CatmullRom very slow, very high quality
	draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)

	return dst, aspectRatio
}

func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}

	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
	draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)

	return dst
}

func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
	subImages := p.splitToTiles(img, aspectRatio)

	var pixelVals []float32

	for _, subImg := range subImages {
		bounds := subImg.Bounds()
		var rVals, gVals, bVals []float32
		for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
			for x := bounds.Min.X; x < bounds.Max.X; x++ {
				c := subImg.At(x, y)
				r, g, b, _ := c.RGBA()
				rVal := float32(r>>8) / 255.0
				gVal := float32(g>>8) / 255.0
				bVal := float32(b>>8) / 255.0

				rVal = (rVal - mean[0]) / std[0]
				gVal = (gVal - mean[1]) / std[1]
				bVal = (bVal - mean[2]) / std[2]

				rVals = append(rVals, rVal)
				gVals = append(gVals, gVal)
				bVals = append(bVals, bVal)
			}
		}
		pixelVals = append(pixelVals, rVals...)
		pixelVals = append(pixelVals, gVals...)
		pixelVals = append(pixelVals, bVals...)
	}

	return pixelVals
}

func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
	outputSize := image.Point{p.imageSize, p.imageSize}

	// clip values
	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
	std := [3]float32{0.26862954, 0.26130258, 0.27577711}

	newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
	newImage = p.pad(newImage, outputSize, aspectRatio)

	data := p.pack(newImage, aspectRatio, mean, std)
	aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
	return data, aspectRatioIndex, nil
}
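A rough usage sketch for ProcessImage, written as if it lived in a test inside the mllama package (the struct fields are unexported). The file name and the processor values are hypothetical stand-ins for what newImageProcessor would read from a real GGUF config.

package mllama

import (
	"fmt"
	"image"
	_ "image/jpeg"
	"os"
)

func exampleProcessImage() error {
	f, err := os.Open("example.jpg") // hypothetical input file
	if err != nil {
		return err
	}
	defer f.Close()

	img, _, err := image.Decode(f)
	if err != nil {
		return err
	}

	// assumed values; normally p := newImageProcessor(c) with a loaded config
	p := ImageProcessor{imageSize: 560, numChannels: 3, maxNumTiles: 4}
	pixels, aspectRatioID, err := p.ProcessImage(img)
	if err != nil {
		return err
	}

	fmt.Println(len(pixels), aspectRatioID)
	return nil
}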
25
model/mllama/process_text.go
Normal file
@ -0,0 +1,25 @@
package mllama

import (
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
)

type TextProcessor struct {
	model.BytePairEncoding
}

func newTextProcessor(c ml.Config) TextProcessor {
	return TextProcessor{
		BytePairEncoding: model.BytePairEncoding{
			Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
			Vocabulary: &model.Vocabulary{
				Values: c.Strings("tokenizer.ggml.tokens"),
				Types: c.Uints("tokenizer.ggml.token_type"),
				Merges: c.Strings("tokenizer.ggml.merges"),
				BOS: c.Uint("tokenizer.ggml.bos_token_id"),
				EOS: c.Uint("tokenizer.ggml.eos_token_id"),
			},
		},
	}
}
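A small sketch of what the default pretokenizer pattern above does before any BPE merges run: it slices the input into word-like pieces, digit runs of at most three characters, punctuation runs, and whitespace. The sample string and the use of regexp2.MustCompile are illustrative assumptions; the pattern and compile options are the ones shown in newTextProcessor.

package main

import (
	"fmt"

	"github.com/dlclark/regexp2"
)

func main() {
	pattern := `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
	re := regexp2.MustCompile(pattern, regexp2.Unicode|regexp2.RE2)

	s := "Hello, world! It's 2025"
	for m, _ := re.FindStringMatch(s); m != nil; m, _ = re.FindNextMatch(m) {
		fmt.Printf("%q\n", m.String())
	}
}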
82
model/mllama/process_text_test.go
Normal file
@ -0,0 +1,82 @@
package mllama

import (
	"encoding/json"
	"errors"
	"os"
	"path/filepath"
	"strconv"
	"testing"

	"github.com/google/go-cmp/cmp"

	"github.com/ollama/ollama/model"
)

func TestProcessText(t *testing.T) {
	ours, err := model.New(filepath.Join("testdata", "model.bin"))
	if errors.Is(err, os.ErrNotExist) {
		t.Skip("no model.bin")
	} else if err != nil {
		t.Fatal(err)
	}

	t.Run("decode", func(t *testing.T) {
		f, err := os.Open(filepath.Join("testdata", "theirs.json"))
		if errors.Is(err, os.ErrNotExist) {
			t.Skip("no theirs.json")
		} else if err != nil {
			t.Fatal(err)
		}
		defer f.Close()

		var theirs [][]byte
		if err := json.NewDecoder(f).Decode(&theirs); err != nil {
			t.Fatal(err)
		}

		for id := range theirs {
			ids := []int32{int32(id)}
			s, err := ours.(model.TextProcessor).Decode(ids)
			if err != nil {
				t.Fatal(err)
			}

			if diff := cmp.Diff(string(theirs[id]), s); diff != "" {
				t.Errorf("%d no match (-theirs +ours):\n%s", id, diff)
			}
		}
	})

	t.Run("encode", func(t *testing.T) {
		f, err := os.Open(filepath.Join("..", "testdata", "inputs.json"))
		if errors.Is(err, os.ErrNotExist) {
			t.Skip("no inputs.json")
		} else if err != nil {
			t.Fatal(err)
		}
		defer f.Close()

		var inputs []struct {
			Values []byte `json:"base64"`
			IDs []int32 `json:"ids"`
		}

		if err := json.NewDecoder(f).Decode(&inputs); err != nil {
			t.Fatal(err)
		}

		for i, input := range inputs {
			t.Run(strconv.Itoa(i), func(t *testing.T) {
				ids, err := ours.(model.TextProcessor).Encode(string(input.Values))
				if err != nil {
					t.Fatal(err)
				}

				if diff := cmp.Diff(input.IDs, ids); diff != "" {
					t.Errorf("%s: no match (-theirs +ours):\n%s", input.Values, diff)
				}
			})
		}
	})
}
1
model/mllama/testdata/model.bin
vendored
Symbolic link
@ -0,0 +1 @@
/Users/michaelyang/git/ollama/library/nltpt/Llama-3.2-11B-Vision-Instruct/merged.gguf
1
model/mllama/testdata/theirs.json
vendored
Normal file
File diff suppressed because one or more lines are too long
228
model/model.go
Normal file
@ -0,0 +1,228 @@
package model

import (
	"fmt"
	"image"
	_ "image/jpeg"
	_ "image/png"
	"log/slog"
	"os"
	"reflect"
	"strconv"
	"strings"

	_ "golang.org/x/image/bmp"
	_ "golang.org/x/image/tiff"
	_ "golang.org/x/image/webp"

	"github.com/ollama/ollama/cache"
	"github.com/ollama/ollama/ml"
	_ "github.com/ollama/ollama/ml/backend"
)

type Cache struct {
	cache.Cache
	cache.Options
}

func (c Cache) Sub(i int) Cache {
	if c.Cache != nil {
		return Cache{
			Cache: c.Cache.Sub(i),
			Options: c.Options,
		}
	}

	return c
}

func (c Cache) Put(ctx ml.Context, key, value ml.Tensor, opts cache.Options) (ml.Tensor, ml.Tensor) {
	if c.Cache != nil {
		return c.Cache.Put(ctx, key, value, opts)
	}

	return key, value
}

type Options struct {
	inputs []int32

	Offset int

	Images []image.Image

	Cache
}

func (opts Options) Inputs() []int32 {
	return opts.inputs[opts.Offset:]
}

func (opts Options) Positions() []int32 {
	positions := make([]int32, len(opts.inputs)-opts.Offset)
	for i := range positions {
		positions[i] = int32(opts.Offset + i)
	}

	return positions
}

type OptionsFunc func(Model, *Options)

func WithInputIDs(ids []int32) OptionsFunc {
	return func(m Model, opts *Options) {
		opts.inputs = ids
	}
}

func WithOffset(offset int) OptionsFunc {
	return func(m Model, opts *Options) {
		opts.Offset = offset
		opts.Cache.Position = offset
	}
}

func WithImage(img image.Image) OptionsFunc {
	return func(m Model, opts *Options) {
		opts.Images = append(opts.Images, img)
	}
}

func WithCache(c cache.Cache) OptionsFunc {
	return func(m Model, opts *Options) {
		opts.Cache = Cache{
			Cache: c,
			Options: cache.Options{
				Position: opts.Offset,
			},
		}
	}
}

type Base struct {
	b ml.Backend
}

func (m *Base) Backend() ml.Backend {
	return m.b
}

func (m *Base) SetBackend(b ml.Backend) {
	m.b = b
}

type Model interface {
	Forward(ml.Context, Options) (ml.Tensor, error)

	Backend() ml.Backend
	SetBackend(ml.Backend)
}

var models = make(map[string]func(ml.Config) (Model, error))

func Register(name string, f func(ml.Config) (Model, error)) {
	if _, ok := models[name]; ok {
		panic("model: model already registered")
	}

	models[name] = f
}

func New(s string) (Model, error) {
	r, err := os.Open(s)
	if err != nil {
		return nil, err
	}
	defer r.Close()

	b, err := ml.NewBackend(r)
	if err != nil {
		return nil, err
	}

	arch := b.Config().Architecture()
	f, ok := models[arch]
	if !ok {
		return nil, fmt.Errorf("unsupported model architecture %q", arch)
	}

	m, err := f(b.Config())
	if err != nil {
		return nil, err
	}

	if err := loadTensors(b, m); err != nil {
		return nil, err
	}

	m.SetBackend(b)
	return m, nil
}

var mlTensorType = reflect.TypeOf((*ml.Tensor)(nil)).Elem()

func loadTensors(b ml.Backend, m any, tensorPath ...string) error {
	t := reflect.TypeOf(m)
	v := reflect.ValueOf(m)

	if t.Kind() == reflect.Pointer {
		t = t.Elem()
		v = v.Elem()
	}

	if t.Kind() == reflect.Interface {
		return loadTensors(b, v.Interface(), tensorPath...)
	}

	for i := range t.NumField() {
		f := v.Field(i)
		fullTensorPath := tensorPath
		if tag := t.Field(i).Tag.Get("ggml"); tag != "" {
			tensorName, _, _ := strings.Cut(tag, ",")
			fullTensorPath = append(tensorPath, tensorName)
		}

		if !f.CanSet() {
			continue
		}

		if f.Kind() == reflect.Ptr && f.IsNil() {
			f.Set(reflect.New(f.Type().Elem()))
		} else if f.Kind() == reflect.Interface && f.IsNil() && f.Type().Implements(mlTensorType) {
			if tensor := b.Get(strings.Join(fullTensorPath, ".")); tensor != nil {
				f.Set(reflect.ValueOf(tensor))
				slog.Debug("loaded tensor", "kind", f.Elem().Type(), "", f.Interface())
			}
		}

		if r := reflect.Indirect(f); r.Kind() == reflect.Struct {
			if err := loadTensors(b, f.Interface(), fullTensorPath...); err != nil {
				return err
			}
		} else if r.Kind() == reflect.Slice {
			for i := range r.Len() {
				if err := loadTensors(b, f.Index(i).Addr().Interface(), append(fullTensorPath, strconv.Itoa(i))...); err != nil {
					return err
				}
			}
		}
	}

	return nil
}

func Forward(m Model, optsFuncs ...OptionsFunc) (ml.Tensor, error) {
	var opts Options
	for _, optsFunc := range optsFuncs {
		optsFunc(m, &opts)
	}

	ctx := m.Backend().NewContext()
	t, err := m.Forward(ctx, opts)
	if err != nil {
		return nil, err
	}
	defer ctx.Close()

	return ctx.Compute(t), nil
}

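A minimal caller sketch for the package above. The file path, token IDs, cache capacity, import path of the mllama package, and the ml.DTypeF32 constant name are all assumptions for illustration; only model.New, model.Forward, the option functions, and cache.Simple are taken from the code shown here.

package main

import (
	"fmt"

	"github.com/ollama/ollama/cache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model"
	_ "github.com/ollama/ollama/model/mllama" // assumed: registers the architecture via model.Register
)

func main() {
	m, err := model.New("model.gguf") // hypothetical path
	if err != nil {
		panic(err)
	}

	logits, err := model.Forward(m,
		model.WithInputIDs([]int32{1, 2, 3}), // token IDs produced by a TextProcessor
		model.WithCache(&cache.Simple{Capacity: 2048, DType: ml.DTypeF32}), // DType constant name assumed
	)
	if err != nil {
		panic(err)
	}

	fmt.Println(logits.Shape())
}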
311
model/process_text.go
Normal file
@ -0,0 +1,311 @@
package model

import (
	"cmp"
	"log/slog"
	"strings"
	"sync"

	"github.com/dlclark/regexp2"
	heap "github.com/emirpasic/gods/v2/trees/binaryheap"
)

type Special int32

const (
	SpecialBOS Special = iota
	SpecialEOS
)

type TextProcessor interface {
	Encode(string) ([]int32, error)
	Decode([]int32) (string, error)
	Is(uint32, Special) bool
}

type Vocabulary struct {
	Values []string
	Types []uint32
	Scores []uint32
	Merges []string

	BOS, EOS uint32

	specialOnce sync.Once
	special []string

	valuesOnce sync.Once
	values map[string]int32

	mergeOnce sync.Once
	merge map[string]int32
}

func (v *Vocabulary) Is(id uint32, special Special) bool {
	switch special {
	case SpecialBOS:
		return id == v.BOS
	case SpecialEOS:
		return id == v.EOS
	default:
		return false
	}
}

func (v *Vocabulary) Encode(s string) int32 {
	v.valuesOnce.Do(func() {
		v.values = make(map[string]int32, len(v.Values))
		for i, value := range v.Values {
			v.values[value] = int32(i)
		}
	})

	if id, ok := v.values[s]; ok {
		return id
	}

	return -1
}

func (v *Vocabulary) Decode(id int32) string {
	return v.Values[id]
}

func (v *Vocabulary) SpecialVocabulary() []string {
	v.specialOnce.Do(func() {
		for i := range v.Values {
			if v.Types[i] == 3 {
				v.special = append(v.special, v.Values[i])
			}
		}
	})

	return v.special
}

func (v *Vocabulary) Merge(left, right string) int {
	v.mergeOnce.Do(func() {
		v.merge = make(map[string]int32, len(v.Merges))
		for i, merge := range v.Merges {
			v.merge[merge] = int32(i)
		}
	})

	if id, ok := v.merge[left+" "+right]; ok {
		return int(id)
	}

	return -1
}

type BytePairEncoding struct {
	Pretokenizer string

	*Vocabulary
}

func (bpe BytePairEncoding) split(s string) ([]string, error) {
	re, err := regexp2.Compile(bpe.Pretokenizer, regexp2.Unicode|regexp2.RE2)
	if err != nil {
		return nil, err
	}

	var matches []string
	for m, _ := re.FindStringMatch(s); m != nil; m, _ = re.FindNextMatch(m) {
		matches = append(matches, m.String())
	}

	return matches, nil
}

// fragment is a string fragment and its corresponding token IDs
type fragment struct {
	value string
	ids []int32
}

// pair is a pair of runes and its rank
type pair struct {
	a, b int
	rank int
	value string
}

type merge struct {
	p, n int
	runes []rune
}

func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
	fragments := []fragment{{value: s}}
	for _, special := range bpe.Vocabulary.SpecialVocabulary() {
		// TODO: process special tokens concurrently
		id := bpe.Vocabulary.Encode(special)
		for i := 0; i < len(fragments); i++ {
			frag := fragments[i]
			if len(frag.ids) > 0 {
				continue
			}

			var middle []fragment
			switch i := strings.Index(frag.value, special); {
			case i < 0:
				middle = append(middle, frag)
			case i > 0:
				middle = append(middle, fragment{value: frag.value[:i]})
				fallthrough
			default:
				middle = append(middle, fragment{value: special, ids: []int32{id}})
				if rest := frag.value[i+len(special):]; rest != "" {
					middle = append(middle, fragment{value: rest})
				}
			}

			fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
		}
	}

	ids := make([]int32, 0, len(fragments))
	for _, frag := range fragments {
		if len(frag.ids) > 0 {
			ids = append(ids, frag.ids...)
			slog.Debug("encoded", "text", frag.value, "ids", frag.ids, "special", true)
			continue
		}

		// split fragment using pretokenizer
		splits, err := bpe.split(frag.value)
		if err != nil {
			return nil, err
		}

		for _, split := range splits {
			// TODO: process splits concurrently
			var sb strings.Builder
			for _, b := range []byte(split) {
				r := rune(b)
				switch {
				case r == 0x00ad:
					r = 0x0143
				case r <= 0x0020:
					r = r + 0x0100
				case r >= 0x007e && r <= 0x00a0:
					r = r + 0x00a2
				}

				sb.WriteRune(r)
			}

			if id := bpe.Vocabulary.Encode(sb.String()); id >= 0 {
				ids = append(ids, id)
				slog.Debug("encoded", "text", sb.String(), "ids", []int32{id})
				continue
			}

			runes := []rune(sb.String())
			merges := make([]merge, len(runes))
			for i := range runes {
				merges[i] = merge{
					p: i - 1,
					n: i + 1,
					runes: []rune{runes[i]},
				}
			}

			pairwise := func(a, b int) *pair {
				if a < 0 || b >= len(runes) {
					return nil
				}

				left, right := string(merges[a].runes), string(merges[b].runes)
				rank := bpe.Vocabulary.Merge(left, right)
				if rank < 0 {
					return nil
				}

				return &pair{
					a: a,
					b: b,
					rank: rank,
					value: left + right,
				}
			}

			pairs := heap.NewWith(func(i, j *pair) int {
				return cmp.Compare(i.rank, j.rank)
			})

			for i := range len(runes) - 1 {
				if pair := pairwise(i, i+1); pair != nil {
					pairs.Push(pair)
				}
			}

			for !pairs.Empty() {
				pair, _ := pairs.Pop()

				left, right := merges[pair.a], merges[pair.b]
				if len(left.runes) <= 0 || len(right.runes) <= 0 ||
					string(left.runes)+string(right.runes) != pair.value {
					continue
				}

				merges[pair.a].runes = append(left.runes, right.runes...)
				merges[pair.b].runes = nil

				merges[pair.a].n = right.n
				if right.n < len(merges) {
					merges[right.n].p = pair.a
				}

				if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
					pairs.Push(pair)
				}

				if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
					pairs.Push(pair)
				}
			}

			for _, merge := range merges {
				if len(merge.runes) > 0 {
					// TODO: handle the edge case where the rune isn't in the vocabulary
					if id := bpe.Vocabulary.Encode(string(merge.runes)); id >= 0 {
						ids = append(ids, id)
						slog.Debug("encoded", "text", string(merge.runes), "ids", []int32{id})
					}
				}
			}
		}
	}

	return ids, nil
}

func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
	var sb strings.Builder
	for _, id := range ids {
		for _, r := range bpe.Vocabulary.Decode(id) {
			switch {
			case r == 0x0100:
				// this produces 0x00 aka NULL
				continue
			case r == 0x0143:
				r = 0x00ad
			case r > 0x0100 && r <= 0x0120:
				r = r - 0x0100
			case r > 0x0120 && r <= 0x0142:
				r = r - 0x00a2
			}

			// NOTE: not using WriteRune here because it writes the UTF-8
			// encoding of the rune which is _not_ what we want
			if err := sb.WriteByte(byte(r)); err != nil {
				return "", err
			}
		}
	}

	slog.Debug("decoded", "ids", ids, "text", sb.String())
	return sb.String(), nil
}
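A small sketch of the byte-to-rune remapping used by Encode and reversed by Decode above: whitespace and control bytes are shifted into printable code points so every byte has a single visible rune in the vocabulary (the familiar GPT-2-style byte-level mapping, e.g. a space becomes 'Ġ' and a newline becomes 'Ċ'). The helper name and the sample input are illustrative only; the switch cases are copied from Encode.

package main

import "fmt"

// toVocabRune mirrors the per-byte switch in BytePairEncoding.Encode.
func toVocabRune(b byte) rune {
	r := rune(b)
	switch {
	case r == 0x00ad:
		r = 0x0143
	case r <= 0x0020:
		r = r + 0x0100
	case r >= 0x007e && r <= 0x00a0:
		r = r + 0x00a2
	}
	return r
}

func main() {
	for _, b := range []byte(" Hello\n") {
		fmt.Printf("%#02x -> %q\n", b, toVocabRune(b))
	}
}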
588
model/testdata/inputs.json
vendored
Normal file
@ -0,0 +1,588 @@
[
|
||||
{
|
||||
"base64": "aWVkIDQgwr0gbW9udGhz",
|
||||
"ids": [
|
||||
1142,
|
||||
220,
|
||||
19,
|
||||
220,
|
||||
27154,
|
||||
4038
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "RsO8aHJlcg==",
|
||||
"ids": [
|
||||
37,
|
||||
51853,
|
||||
261
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "",
|
||||
"ids": []
|
||||
},
|
||||
{
|
||||
"base64": "IA==",
|
||||
"ids": [
|
||||
220
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICA=",
|
||||
"ids": [
|
||||
256
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICAg",
|
||||
"ids": [
|
||||
262
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CQ==",
|
||||
"ids": [
|
||||
197
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "Cg==",
|
||||
"ids": [
|
||||
198
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "Cgo=",
|
||||
"ids": [
|
||||
271
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CgoK",
|
||||
"ids": [
|
||||
1432
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CQo=",
|
||||
"ids": [
|
||||
1602
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8gd29ybGQ=",
|
||||
"ids": [
|
||||
9906,
|
||||
1917
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxvIHdvcmxk",
|
||||
"ids": [
|
||||
22691,
|
||||
1917
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8gV29ybGQ=",
|
||||
"ids": [
|
||||
9906,
|
||||
4435
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxvIFdvcmxk",
|
||||
"ids": [
|
||||
22691,
|
||||
4435
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxvIFdvcmxkIQ==",
|
||||
"ids": [
|
||||
22691,
|
||||
4435,
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8sIHdvcmxkIQ==",
|
||||
"ids": [
|
||||
9906,
|
||||
11,
|
||||
1917,
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxvLCB3b3JsZCE=",
|
||||
"ids": [
|
||||
22691,
|
||||
11,
|
||||
1917,
|
||||
0
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IHRoaXMgaXMg8J+mmS5jcHA=",
|
||||
"ids": [
|
||||
420,
|
||||
374,
|
||||
11410,
|
||||
99,
|
||||
247,
|
||||
13,
|
||||
11055
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "dzA0OCA3dHVpamsgZHNkZmh1",
|
||||
"ids": [
|
||||
86,
|
||||
23904,
|
||||
220,
|
||||
22,
|
||||
83,
|
||||
2005,
|
||||
42908,
|
||||
11729,
|
||||
3013,
|
||||
17156
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "0L3QtdGJ0L4g0L3QsCDQkdGK0LvQs9Cw0YDRgdC60Lg=",
|
||||
"ids": [
|
||||
79862,
|
||||
102118,
|
||||
13373,
|
||||
64571,
|
||||
34694,
|
||||
3114,
|
||||
112203,
|
||||
80112
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "4Z6A4Z624Z6T4Z+L4Z6P4Z+C4Z6W4Z634Z6f4Z+B4Z6f4Z6i4Z624Z6F4Z6B4Z6b4Z6F4Z+B4Z6J",
|
||||
"ids": [
|
||||
21549,
|
||||
222,
|
||||
98629,
|
||||
241,
|
||||
45358,
|
||||
233,
|
||||
21549,
|
||||
237,
|
||||
45358,
|
||||
224,
|
||||
21549,
|
||||
244,
|
||||
21549,
|
||||
115,
|
||||
21549,
|
||||
253,
|
||||
45358,
|
||||
223,
|
||||
21549,
|
||||
253,
|
||||
21549,
|
||||
95,
|
||||
98629,
|
||||
227,
|
||||
21549,
|
||||
223,
|
||||
21549,
|
||||
249,
|
||||
21549,
|
||||
227,
|
||||
45358,
|
||||
223,
|
||||
21549,
|
||||
231
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "8J+agCAobm9ybWFsKSDwn5i24oCN8J+Mq++4jyAobXVsdGlwbGUgZW1vamlzIGNvbmNhdGVuYXRlZCkg4pyFIChvbmx5IGVtb2ppIHRoYXQgaGFzIGl0cyBvd24gdG9rZW4p",
|
||||
"ids": [
|
||||
9468,
|
||||
248,
|
||||
222,
|
||||
320,
|
||||
8416,
|
||||
8,
|
||||
27623,
|
||||
114,
|
||||
102470,
|
||||
9468,
|
||||
234,
|
||||
104,
|
||||
31643,
|
||||
320,
|
||||
36773,
|
||||
100166,
|
||||
98634,
|
||||
8,
|
||||
26602,
|
||||
227,
|
||||
320,
|
||||
3323,
|
||||
43465,
|
||||
430,
|
||||
706,
|
||||
1202,
|
||||
1866,
|
||||
4037,
|
||||
8
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8=",
|
||||
"ids": [
|
||||
9906
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IEhlbGxv",
|
||||
"ids": [
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICBIZWxsbw==",
|
||||
"ids": [
|
||||
220,
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICAgSGVsbG8=",
|
||||
"ids": [
|
||||
256,
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICAgIEhlbGxv",
|
||||
"ids": [
|
||||
262,
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICAgIEhlbGxvCiAgICBIZWxsbw==",
|
||||
"ids": [
|
||||
262,
|
||||
22691,
|
||||
198,
|
||||
262,
|
||||
22691
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ICg=",
|
||||
"ids": [
|
||||
320
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CiA9",
|
||||
"ids": [
|
||||
198,
|
||||
284
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "JyBlcmE=",
|
||||
"ids": [
|
||||
6,
|
||||
11639
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "SGVsbG8sIHknYWxsISBIb3cgYXJlIHlvdSDwn5iBID/miJHmg7PlnKhhcHBsZeW3peS9nDEzMTQxNTHlpKnvvZ4=",
|
||||
"ids": [
|
||||
9906,
|
||||
11,
|
||||
379,
|
||||
65948,
|
||||
0,
|
||||
2650,
|
||||
527,
|
||||
499,
|
||||
27623,
|
||||
223,
|
||||
949,
|
||||
37046,
|
||||
101067,
|
||||
19000,
|
||||
23182,
|
||||
102301,
|
||||
9263,
|
||||
18136,
|
||||
16,
|
||||
36827,
|
||||
21909
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "ISEhISEh",
|
||||
"ids": [
|
||||
17523,
|
||||
3001
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "Mw==",
|
||||
"ids": [
|
||||
18
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzM=",
|
||||
"ids": [
|
||||
1644
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMz",
|
||||
"ids": [
|
||||
8765
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMw==",
|
||||
"ids": [
|
||||
8765,
|
||||
18
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzM=",
|
||||
"ids": [
|
||||
8765,
|
||||
1644
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzMz",
|
||||
"ids": [
|
||||
8765,
|
||||
8765
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzMzMw==",
|
||||
"ids": [
|
||||
8765,
|
||||
8765,
|
||||
18
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzMzMzM=",
|
||||
"ids": [
|
||||
8765,
|
||||
8765,
|
||||
1644
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "MzMzMzMzMzMz",
|
||||
"ids": [
|
||||
8765,
|
||||
8765,
|
||||
8765
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "Q+G7rWEgVmnhu4d0",
|
||||
"ids": [
|
||||
34,
|
||||
91163,
|
||||
11655,
|
||||
26298,
|
||||
83
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "IGRpc2NhcmRz",
|
||||
"ids": [
|
||||
2624,
|
||||
2402
|
||||
]
|
||||
},
|
||||
{
|
||||
"base64": "CiAKCiAKCgogCSAJCSAJCiAgCiAgIAogICAgCiAgICAgCvCfmoAgKG5vcm1hbCkg8J+YtuKAjfCfjKvvuI8gKG11bHRpcGxlIGVtb2ppcyBjb25jYXRlbmF0ZWQpIOKchSDwn6aZ8J+mmSAzIDMzIDMzMyAzMzMzIDMzMzMzIDMzMzMzMyAzMzMzMzMzIDMzMzMzMzMzIDMuMyAzLi4zIDMuLi4zIOGegOGetuGek+Gfi+Gej+GfguGeluGet+Gen+GfgeGen+GeouGetuGehfCfmIEgP+aIkeaDs+WcqGFwcGxl5bel5L2cMTMxNDE1MeWkqe+9niAtLS0tLS09PT09PT09INC90LXRidC+INC90LAg0JHRitC70LPQsNGA0YHQutC4ICcnJycnJ2BgYGBgYGAiIiIiLi4uLi4uISEhISEhPz8/Pz8/IEkndmUgYmVlbiAndG9sZCBoZSdzIHRoZXJlLCAnUkUgeW91IHN1cmU/ICdNIG5vdCBzdXJlIEknbGwgbWFrZSBpdCwgJ0QgeW91IGxpa2Ugc29tZSB0ZWE/IFdlJ1ZlIGEnbEw=",
|
||||
"ids": [
|
||||
198,
|
||||
4815,
|
||||
15073,
|
||||
66597,
|
||||
8004,
|
||||
1602,
|
||||
2355,
|
||||
79772,
|
||||
11187,
|
||||
9468,
|
||||
248,
|
||||
222,
|
||||
320,
|
||||
8416,
|
||||
8,
|
||||
27623,
|
||||
114,
|
||||
102470,
|
||||
9468,
|
||||
234,
|
||||
104,
|
||||
31643,
|
||||
320,
|
||||
36773,
|
||||
100166,
|
||||
98634,
|
||||
8,
|
||||
26602,
|
||||
227,
|
||||
11410,
|
||||
99,
|
||||
247,
|
||||
9468,
|
||||
99,
|
||||
247,
|
||||
220,
|
||||
18,
|
||||
220,
|
||||
1644,
|
||||
220,
|
||||
8765,
|
||||
220,
|
||||
8765,
|
||||
18,
|
||||
220,
|
||||
8765,
|
||||
1644,
|
||||
220,
|
||||
8765,
|
||||
8765,
|
||||
220,
|
||||
8765,
|
||||
8765,
|
||||
18,
|
||||
220,
|
||||
8765,
|
||||
8765,
|
||||
1644,
|
||||
220,
|
||||
18,
|
||||
13,
|
||||
18,
|
||||
220,
|
||||
18,
|
||||
497,
|
||||
18,
|
||||
220,
|
||||
18,
|
||||
1131,
|
||||
18,
|
||||
220,
|
||||
21549,
|
||||
222,
|
||||
98629,
|
||||
241,
|
||||
45358,
|
||||
233,
|
||||
21549,
|
||||
237,
|
||||
45358,
|
||||
224,
|
||||
21549,
|
||||
244,
|
||||
21549,
|
||||
115,
|
||||
21549,
|
||||
253,
|
||||
45358,
|
||||
223,
|
||||
21549,
|
||||
253,
|
||||
21549,
|
||||
95,
|
||||
98629,
|
||||
227,
|
||||
76460,
|
||||
223,
|
||||
949,
|
||||
37046,
|
||||
101067,
|
||||
19000,
|
||||
23182,
|
||||
102301,
|
||||
9263,
|
||||
18136,
|
||||
16,
|
||||
36827,
|
||||
21909,
|
||||
56560,
|
||||
54337,
|
||||
19175,
|
||||
102118,
|
||||
13373,
|
||||
64571,
|
||||
34694,
|
||||
3114,
|
||||
112203,
|
||||
80112,
|
||||
3436,
|
||||
106451,
|
||||
14196,
|
||||
14196,
|
||||
74694,
|
||||
3089,
|
||||
3089,
|
||||
29249,
|
||||
17523,
|
||||
3001,
|
||||
27708,
|
||||
7801,
|
||||
358,
|
||||
3077,
|
||||
1027,
|
||||
364,
|
||||
83,
|
||||
820,
|
||||
568,
|
||||
596,
|
||||
1070,
|
||||
11,
|
||||
364,
|
||||
793,
|
||||
499,
|
||||
2771,
|
||||
30,
|
||||
364,
|
||||
44,
|
||||
539,
|
||||
2771,
|
||||
358,
|
||||
3358,
|
||||
1304,
|
||||
433,
|
||||
11,
|
||||
364,
|
||||
35,
|
||||
499,
|
||||
1093,
|
||||
1063,
|
||||
15600,
|
||||
30,
|
||||
1226,
|
||||
6,
|
||||
43712,
|
||||
264,
|
||||
64966,
|
||||
43
|
||||
]
|
||||
}
|
||||
]
|
13
sample/greedy.go
Normal file
@ -0,0 +1,13 @@
package sample

import "gonum.org/v1/gonum/floats"

type greedy struct{}

func Greedy() Sampler {
	return greedy{}
}

func (s greedy) Sample(t []float64) ([]float64, error) {
	return []float64{float64(floats.MaxIdx(t))}, nil
}
74
sample/sample.go
Normal file
@ -0,0 +1,74 @@
package sample

import (
	"slices"

	"gonum.org/v1/gonum/floats"
	"gonum.org/v1/gonum/stat/sampleuv"
)

type Sampler interface {
	Sample([]float64) ([]float64, error)
}

type Temperature float64

func (s Temperature) Sample(t []float64) ([]float64, error) {
	floats.Div(t, slices.Repeat([]float64{float64(s)}, len(t)))
	return t, nil
}

type softmax struct{}

func Softmax() Sampler {
	return softmax{}
}

func (softmax) Sample(t []float64) ([]float64, error) {
	return t, nil
}

type TopK int

func (s TopK) Sample(t []float64) ([]float64, error) {
	return t, nil
}

type TopP float32

func (s TopP) Sample(t []float64) ([]float64, error) {
	return t, nil
}

type MinP float32

func (s MinP) Sample(t []float64) ([]float64, error) {
	return t, nil
}

type weighed struct{}

func Weighed() Sampler {
	return weighed{}
}

func (s weighed) Sample(t []float64) ([]float64, error) {
	w := sampleuv.NewWeighted(t, nil)
	if v, ok := w.Take(); ok {
		return []float64{float64(v)}, nil
	}

	return t, nil
}

func Sample(floats []float64, samplers ...Sampler) ([]float64, error) {
	var err error
	for _, sampler := range samplers {
		floats, err = sampler.Sample(floats)
		if err != nil {
			return nil, err
		}
	}

	return floats, nil
}
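A minimal sketch of composing the samplers defined above: transforms run left to right over the logits, and a terminal sampler such as Greedy collapses the slice to a single token index. The logit values and the import path are assumptions for illustration.

package main

import (
	"fmt"

	"github.com/ollama/ollama/sample"
)

func main() {
	// hypothetical logits for a 5-token vocabulary
	logits := []float64{0.1, 2.3, -1.0, 0.7, 1.9}

	out, err := sample.Sample(logits, sample.Temperature(0.8), sample.Greedy())
	if err != nil {
		panic(err)
	}

	fmt.Println(out) // index of the highest logit after temperature scaling, here [1]
}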