wip: next ollama runner

implement llama and mllama model architectures in go using ggml (through
cgo)
Michael Yang 2024-12-02 11:45:23 -08:00
parent 8d15a7a964
commit 2c5fb24855
62 changed files with 75107 additions and 2 deletions
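A rough sketch (not part of this commit) of how the pieces below are meant to compose: fs/ggml decodes GGUF metadata, the cgo-backed ggml backend registered under ml/backend loads the weights, and a forward pass is built through the ml.Context graph API. The model path and tensor name are assumptions for illustration.
package main

import (
	"os"

	"github.com/ollama/ollama/ml"
	_ "github.com/ollama/ollama/ml/backend" // registers the ggml backend
)

func main() {
	f, err := os.Open("model.gguf") // hypothetical model path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	b, err := ml.NewBackend(f) // decodes the GGUF and uploads tensors via cgo
	if err != nil {
		panic(err)
	}

	ctx := b.NewContext()
	defer ctx.Close()

	// look up a weight by its GGUF name (hypothetical) and multiply it by an input
	w := b.Get("output.weight")
	x, err := ctx.FromFloatSlice(make([]float32, int(w.Dim(0))), int(w.Dim(0)))
	if err != nil {
		panic(err)
	}
	logits := ctx.Compute(w.Mulmat(ctx, x))
	_ = logits.Floats()
}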

9
.gitattributes vendored

@ -7,5 +7,14 @@ llama/**/*.cuh linguist-vendored
llama/**/*.m linguist-vendored
llama/**/*.metal linguist-vendored
ml/backend/**/*.c linguist-vendored
ml/backend/**/*.h linguist-vendored
ml/backend/**/*.cpp linguist-vendored
ml/backend/**/*.hpp linguist-vendored
ml/backend/**/*.cu linguist-vendored
ml/backend/**/*.cuh linguist-vendored
ml/backend/**/*.m linguist-vendored
ml/backend/**/*.metal linguist-vendored
* text=auto
*.go text eol=lf

63
cache/cache.go vendored Normal file

@ -0,0 +1,63 @@
package cache
import (
"github.com/ollama/ollama/ml"
)
type Options struct {
Position int
}
type Cache interface {
Sub(i int) Cache
Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
}
type Simple struct {
DType ml.DType
Capacity int
keys, values []ml.Tensor
}
func (c *Simple) Sub(i int) Cache {
if i >= len(c.keys) {
c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
}
return &Simple{
keys: c.keys[i : i+1],
values: c.values[i : i+1],
Capacity: c.Capacity,
DType: c.DType,
}
}
func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
if c.keys[0] == nil || c.values[0] == nil {
c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
}
ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
n := min(c.Capacity, int(key.Dim(2))+opts.Position)
key = c.keys[0].View(ctx, 0,
int(key.Dim(0)), int(key.Stride(1)),
int(key.Dim(1)), int(key.Stride(2)),
n,
)
value = c.values[0].View(ctx, 0,
int(value.Dim(0)), int(value.Stride(1)),
int(value.Dim(1)), int(value.Stride(2)),
n,
)
// TODO shift context if necessary
return key, value
}
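A minimal sketch (not part of this commit) of how a model layer might drive the cache above: one Sub slot per layer and one Put per forward step. The helper name is made up for illustration.
package example

import (
	"github.com/ollama/ollama/cache"
	"github.com/ollama/ollama/ml"
)

// attendWithCache is a hypothetical helper showing the intended call pattern.
// The returned key/value tensors view every cached position up to and
// including the current step and can be fed to attention as usual.
func attendWithCache(ctx ml.Context, c cache.Cache, layer, position int, key, value ml.Tensor) (ml.Tensor, ml.Tensor) {
	return c.Sub(layer).Put(ctx, key, value, cache.Options{Position: position})
}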

305
fs/ggml/ggml.go Normal file

@ -0,0 +1,305 @@
package ggml
import (
"cmp"
"encoding/binary"
"errors"
"fmt"
"io"
"log/slog"
"strings"
"github.com/ollama/ollama/fs/util/bufioutil"
)
type GGML struct {
container
model
}
type model interface {
KV() KV
Tensors() Tensors
}
type KV map[string]any
func (kv KV) Architecture() string {
return cmp.Or(kv.String("general.architecture"), "unknown")
}
func (kv KV) FileType() fileType {
if t := kv.Uint("general.file_type"); t > 0 {
return fileType(t)
}
return fileTypeUnknown
}
func (kv KV) String(key string, defaultValue ...string) string {
return keyValue(kv, key, append(defaultValue, "")...)
}
func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
return keyValue(kv, key, append(defaultValue, 0)...)
}
func (kv KV) Float(key string, defaultValue ...float32) float32 {
return keyValue(kv, key, append(defaultValue, 0)...)
}
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
r := keyValue(kv, key, &array{})
s := make([]string, r.size)
for i := range r.size {
s[i] = r.values[i].(string)
}
return s
}
func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
r := keyValue(kv, key, &array{})
s := make([]uint32, r.size)
for i := range r.size {
s[i] = uint32(r.values[i].(int32))
}
return s
}
func keyValue[T string | uint32 | float32 | *array](kv KV, key string, defaultValue ...T) T {
if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
key = kv.Architecture() + "." + key
}
if val, ok := kv[key]; ok {
return val.(T)
}
slog.Warn("key not found", "key", key, "default", defaultValue[0])
return defaultValue[0]
}
type Tensors struct {
Items []*Tensor
Offset uint64
}
func (ts Tensors) Layers() map[string]Layer {
layers := make(map[string]Layer)
for _, t := range ts.Items {
parts := strings.Split(t.Name, ".")
if parts[0] == "blk" {
// join first and second part, e.g. blk.%d
parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
}
if _, ok := layers[parts[0]]; !ok {
layers[parts[0]] = make(Layer)
}
layers[parts[0]][strings.Join(parts[1:], ".")] = t
}
return layers
}
type Layer map[string]*Tensor
func (l Layer) size() (size uint64) {
for _, t := range l {
size += t.Size()
}
return size
}
type Tensor struct {
Name string `json:"name"`
Kind uint32 `json:"kind"`
Offset uint64 `json:"-"`
// Shape is the number of elements in each dimension
Shape []uint64 `json:"shape"`
io.WriterTo `json:"-"`
}
func (t Tensor) block() (n int) {
if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
return -1
}
return
}
func (t Tensor) blockSize() uint64 {
switch t.Kind {
case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16
return 1
case 2, 3, 4, 5, 6, 7, 8, 9, 20: // Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, IQ4_NL
return 32
default: // All others
return 256
}
}
func (t Tensor) typeSize() uint64 {
blockSize := t.blockSize()
switch t.Kind {
case 0: // FP32
return 4
case 1: // FP16
return 2
case 2: // Q4_0
return 2 + blockSize/2
case 3: // Q4_1
return 2 + 2 + blockSize/2
case 6: // Q5_0
return 2 + 4 + blockSize/2
case 7: // Q5_1
return 2 + 2 + 4 + blockSize/2
case 8: // Q8_0
return 2 + blockSize
case 9: // Q8_1
return 4 + 4 + blockSize
case 10: // Q2_K
return blockSize/16 + blockSize/4 + 2 + 2
case 11: // Q3_K
return blockSize/8 + blockSize/4 + 12 + 2
case 12: // Q4_K
return 2 + 2 + 12 + blockSize/2
case 13: // Q5_K
return 2 + 2 + 12 + blockSize/8 + blockSize/2
case 14: // Q6_K
return blockSize/2 + blockSize/4 + blockSize/16 + 2
case 15: // Q8_K
return 2 + blockSize + 2*blockSize/16
case 16: // IQ2_XXS
return 2 + 2*blockSize/8
case 17: // IQ2_XS
return 2 + 2*blockSize/8 + blockSize/32
case 18: // IQ3_XXS
return 2 + blockSize/4 + blockSize/8
case 19: // IQ1_S
return 2 + blockSize/8 + blockSize/16
case 20: // IQ4_NL
return 2 + blockSize/2
case 21: // IQ3_S
return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
case 22: // IQ2_S
return 2 + blockSize/4 + blockSize/16
case 23: // IQ4_XS
return 2 + 2 + blockSize/2 + blockSize/64
case 24: // I8
return 1
case 25: // I16
return 2
case 26: // I32
return 4
case 27: // I64
return 8
case 28: // F64
return 8
case 29: // IQ1_M
return blockSize/8 + blockSize/16 + blockSize/32
default:
return 0
}
}
func (t Tensor) parameters() uint64 {
var count uint64 = 1
for _, n := range t.Shape {
count *= n
}
return count
}
func (t Tensor) Size() uint64 {
return t.parameters() * t.typeSize() / t.blockSize()
}
type container interface {
Name() string
Decode(io.ReadSeeker) (model, error)
}
const (
// Magic constant for `ggml` files (unversioned).
FILE_MAGIC_GGML = 0x67676d6c
// Magic constant for `ggml` files (versioned, ggmf).
FILE_MAGIC_GGMF = 0x67676d66
// Magic constant for `ggml` files (versioned, ggjt).
FILE_MAGIC_GGJT = 0x67676a74
// Magic constant for `ggla` files (LoRA adapter).
FILE_MAGIC_GGLA = 0x67676C61
// Magic constant for `gguf` files (versioned, gguf)
FILE_MAGIC_GGUF_LE = 0x46554747
FILE_MAGIC_GGUF_BE = 0x47475546
)
var ErrUnsupportedFormat = errors.New("unsupported model format")
func DetectGGMLType(b []byte) string {
switch binary.LittleEndian.Uint32(b[:4]) {
case FILE_MAGIC_GGML:
return "ggml"
case FILE_MAGIC_GGMF:
return "ggmf"
case FILE_MAGIC_GGJT:
return "ggjt"
case FILE_MAGIC_GGLA:
return "ggla"
case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
return "gguf"
default:
return ""
}
}
// DecodeGGML decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// maxArraySize is negative, all arrays are collected.
func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
if maxArraySize == 0 {
maxArraySize = 1024
}
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err
}
var c container
switch magic {
case FILE_MAGIC_GGUF_LE:
c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
default:
return nil, 0, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
if err != nil {
return nil, 0, err
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return nil, 0, err
}
// final model type
return &GGML{
container: c,
model: model,
}, offset, nil
}
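A minimal sketch (not part of this commit) of decoding a model's metadata with DecodeGGML; the file path is hypothetical.
package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	f, err := os.Open("model.gguf") // hypothetical path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// maxArraySize of 0 collects arrays up to the default 1024 entries
	g, tensorDataOffset, err := ggml.DecodeGGML(f, 0)
	if err != nil {
		panic(err)
	}

	fmt.Println(g.KV().Architecture(), g.KV().FileType())
	fmt.Println("tensor data starts at", tensorDataOffset)
	for name, layer := range g.Tensors().Layers() {
		fmt.Println(name, len(layer), "tensors")
	}
}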

661
fs/ggml/gguf.go Normal file

@ -0,0 +1,661 @@
package ggml
import (
"bytes"
"cmp"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"log/slog"
"maps"
"slices"
"strings"
)
type containerGGUF struct {
ByteOrder binary.ByteOrder
Version uint32
V1 struct {
NumTensor uint32
NumKV uint32
}
V2 struct {
NumTensor uint64
NumKV uint64
}
V3 struct {
NumTensor uint64
NumKV uint64
}
maxArraySize int
}
func (c *containerGGUF) canCollectArray(size int) bool {
return c.maxArraySize < 0 || size <= c.maxArraySize
}
func (c *containerGGUF) Name() string {
return "gguf"
}
func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
if err := binary.Read(rs, c.ByteOrder, &c.Version); err != nil {
return nil, err
}
var err error
switch c.Version {
case 1:
err = binary.Read(rs, c.ByteOrder, &c.V1)
case 2:
err = binary.Read(rs, c.ByteOrder, &c.V2)
default:
err = binary.Read(rs, c.ByteOrder, &c.V3)
}
if err != nil {
return nil, err
}
model := newGGUF(c)
if err := model.Decode(rs); err != nil {
return nil, err
}
return model, nil
}
const (
ggufTypeUint8 uint32 = iota
ggufTypeInt8
ggufTypeUint16
ggufTypeInt16
ggufTypeUint32
ggufTypeInt32
ggufTypeFloat32
ggufTypeBool
ggufTypeString
ggufTypeArray
ggufTypeUint64
ggufTypeInt64
ggufTypeFloat64
)
type gguf struct {
*containerGGUF
kv KV
tensors []*Tensor
parameters uint64
tensorOffset uint64
scratch [16 << 10]byte
}
func newGGUF(container *containerGGUF) *gguf {
return &gguf{
containerGGUF: container,
kv: make(KV),
}
}
func (llm *gguf) KV() KV {
return llm.kv
}
func (llm *gguf) Tensors() Tensors {
return Tensors{
Items: llm.tensors,
Offset: llm.tensorOffset,
}
}
func (llm *gguf) numTensor() uint64 {
switch llm.Version {
case 1:
return uint64(llm.V1.NumTensor)
case 2:
return llm.V2.NumTensor
default:
return llm.V3.NumTensor
}
}
func (llm *gguf) numKV() uint64 {
switch llm.Version {
case 1:
return uint64(llm.V1.NumKV)
case 2:
return llm.V2.NumKV
default:
return llm.V3.NumKV
}
}
func (llm *gguf) Decode(rs io.ReadSeeker) error {
// decode key-values
for i := 0; uint64(i) < llm.numKV(); i++ {
k, err := readGGUFString(llm, rs)
if err != nil {
return err
}
t, err := readGGUF[uint32](llm, rs)
if err != nil {
return err
}
var v any
switch t {
case ggufTypeUint8:
v, err = readGGUF[uint8](llm, rs)
case ggufTypeInt8:
v, err = readGGUF[int8](llm, rs)
case ggufTypeUint16:
v, err = readGGUF[uint16](llm, rs)
case ggufTypeInt16:
v, err = readGGUF[int16](llm, rs)
case ggufTypeUint32:
v, err = readGGUF[uint32](llm, rs)
case ggufTypeInt32:
v, err = readGGUF[int32](llm, rs)
case ggufTypeUint64:
v, err = readGGUF[uint64](llm, rs)
case ggufTypeInt64:
v, err = readGGUF[int64](llm, rs)
case ggufTypeFloat32:
v, err = readGGUF[float32](llm, rs)
case ggufTypeFloat64:
v, err = readGGUF[float64](llm, rs)
case ggufTypeBool:
v, err = readGGUF[bool](llm, rs)
case ggufTypeString:
v, err = readGGUFString(llm, rs)
case ggufTypeArray:
v, err = readGGUFArray(llm, rs)
default:
return fmt.Errorf("invalid type: %d", t)
}
if err != nil {
return err
}
llm.kv[k] = v
}
// decode tensors
for range llm.numTensor() {
name, err := readGGUFString(llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor name: %w", err)
}
// dims is the number of dimensions in the tensor
dims, err := readGGUF[uint32](llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor dimensions: %w", err)
}
shape := make([]uint64, dims)
for i := 0; uint32(i) < dims; i++ {
shape[i], err = readGGUF[uint64](llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor shape: %w", err)
}
}
kind, err := readGGUF[uint32](llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor kind: %w", err)
}
offset, err := readGGUF[uint64](llm, rs)
if err != nil {
return fmt.Errorf("failed to read tensor offset: %w", err)
}
tensor := Tensor{
Name: name,
Kind: kind,
Offset: offset,
Shape: shape[:],
}
llm.tensors = append(llm.tensors, &tensor)
llm.parameters += tensor.parameters()
}
// patch KV with parameter count
llm.kv["general.parameter_count"] = llm.parameters
alignment, ok := llm.kv["general.alignment"].(uint32)
if !ok {
alignment = 32
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
padding := ggufPadding(offset, int64(alignment))
llm.tensorOffset = uint64(offset + padding)
for _, tensor := range llm.tensors {
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return fmt.Errorf("failed to get current offset: %w", err)
}
padding := ggufPadding(offset, int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return fmt.Errorf("failed to seek to init padding: %w", err)
}
if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
return fmt.Errorf("failed to seek to tensor: %w", err)
}
}
return nil
}
func readGGUF[T any](llm *gguf, r io.Reader) (T, error) {
var t T
err := binary.Read(r, llm.ByteOrder, &t)
return t, err
}
func writeGGUF[V any](w io.Writer, t uint32, v V) error {
if err := binary.Write(w, binary.LittleEndian, t); err != nil {
return err
}
return binary.Write(w, binary.LittleEndian, v)
}
func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
var length uint64
if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
return "", err
}
var b bytes.Buffer
if _, err := io.CopyN(&b, r, int64(length)); err != nil {
return "", err
}
// gguf v1 strings are null-terminated
b.Truncate(b.Len() - 1)
return b.String(), nil
}
func discardGGUFString(llm *gguf, r io.Reader) error {
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return err
}
size := int(llm.ByteOrder.Uint64(buf))
for size > 0 {
n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
if err != nil {
return err
}
size -= n
}
return nil
}
func readGGUFString(llm *gguf, r io.Reader) (string, error) {
if llm.Version == 1 {
return readGGUFV1String(llm, r)
}
buf := llm.scratch[:8]
_, err := io.ReadFull(r, buf)
if err != nil {
return "", err
}
length := int(llm.ByteOrder.Uint64(buf))
if length > len(llm.scratch) {
buf = make([]byte, length)
} else {
buf = llm.scratch[:length]
}
clear(buf)
_, err = io.ReadFull(r, buf)
if err != nil {
return "", err
}
return string(buf), nil
}
func writeGGUFString(w io.Writer, s string) error {
if err := binary.Write(w, binary.LittleEndian, ggufTypeString); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil {
return err
}
_, err := io.Copy(w, strings.NewReader(s))
return err
}
type array struct {
size int
values []any
}
func (a *array) MarshalJSON() ([]byte, error) {
return json.Marshal(a.values)
}
func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
}
n, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
}
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, int(n))
}
for i := range n {
var e any
switch t {
case ggufTypeUint8:
e, err = readGGUF[uint8](llm, r)
case ggufTypeInt8:
e, err = readGGUF[int8](llm, r)
case ggufTypeUint16:
e, err = readGGUF[uint16](llm, r)
case ggufTypeInt16:
e, err = readGGUF[int16](llm, r)
case ggufTypeUint32:
e, err = readGGUF[uint32](llm, r)
case ggufTypeInt32:
e, err = readGGUF[int32](llm, r)
case ggufTypeUint64:
e, err = readGGUF[uint64](llm, r)
case ggufTypeInt64:
e, err = readGGUF[int64](llm, r)
case ggufTypeFloat32:
e, err = readGGUF[float32](llm, r)
case ggufTypeFloat64:
e, err = readGGUF[float64](llm, r)
case ggufTypeBool:
e, err = readGGUF[bool](llm, r)
case ggufTypeString:
e, err = readGGUFV1String(llm, r)
default:
return nil, fmt.Errorf("invalid array type: %d", t)
}
if err != nil {
return nil, err
}
if a.values != nil {
a.values[i] = e
}
}
return a, nil
}
func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
if llm.Version == 1 {
return readGGUFV1Array(llm, r)
}
t, err := readGGUF[uint32](llm, r)
if err != nil {
return nil, err
}
n, err := readGGUF[uint64](llm, r)
if err != nil {
return nil, err
}
a := &array{size: int(n)}
if llm.canCollectArray(int(n)) {
a.values = make([]any, int(n))
}
for i := range n {
var e any
switch t {
case ggufTypeUint8:
e, err = readGGUF[uint8](llm, r)
case ggufTypeInt8:
e, err = readGGUF[int8](llm, r)
case ggufTypeUint16:
e, err = readGGUF[uint16](llm, r)
case ggufTypeInt16:
e, err = readGGUF[int16](llm, r)
case ggufTypeUint32:
e, err = readGGUF[uint32](llm, r)
case ggufTypeInt32:
e, err = readGGUF[int32](llm, r)
case ggufTypeUint64:
e, err = readGGUF[uint64](llm, r)
case ggufTypeInt64:
e, err = readGGUF[int64](llm, r)
case ggufTypeFloat32:
e, err = readGGUF[float32](llm, r)
case ggufTypeFloat64:
e, err = readGGUF[float64](llm, r)
case ggufTypeBool:
e, err = readGGUF[bool](llm, r)
case ggufTypeString:
if a.values != nil {
e, err = readGGUFString(llm, r)
} else {
err = discardGGUFString(llm, r)
}
default:
return nil, fmt.Errorf("invalid array type: %d", t)
}
if err != nil {
return nil, err
}
if a.values != nil {
a.values[i] = e
}
}
return a, nil
}
// writeGGUFArray writes a slice s of type E to the writer w with a gguf type of t
func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
if err := binary.Write(w, binary.LittleEndian, ggufTypeArray); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, t); err != nil {
return err
}
if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil {
return err
}
return binary.Write(w, binary.LittleEndian, s)
}
func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
return err
}
keys := slices.Collect(maps.Keys(kv))
slices.Sort(keys)
for _, key := range keys {
if err := ggufWriteKV(ws, key, kv[key]); err != nil {
return err
}
}
slices.SortStableFunc(ts, func(a, b Tensor) int {
if i, j := a.block(), b.block(); i < 0 && j >= 0 {
return 1
} else if i >= 0 && j < 0 {
return -1
} else {
return cmp.Compare(i, j)
}
})
var s uint64
for _, t := range ts {
t.Offset = s
if err := ggufWriteTensorInfo(ws, t); err != nil {
return err
}
s += t.Size()
}
var alignment int64 = 32
for _, t := range ts {
if err := ggufWriteTensor(ws, t, alignment); err != nil {
return err
}
}
return nil
}
func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
slog.Debug(k, "type", fmt.Sprintf("%T", v))
if err := binary.Write(ws, binary.LittleEndian, uint64(len(k))); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, []byte(k)); err != nil {
return err
}
var err error
switch v := v.(type) {
case uint32:
err = writeGGUF(ws, ggufTypeUint32, v)
case float32:
err = writeGGUF(ws, ggufTypeFloat32, v)
case bool:
err = writeGGUF(ws, ggufTypeBool, v)
case string:
err = writeGGUFString(ws, v)
case []int32:
err = writeGGUFArray(ws, ggufTypeInt32, v)
case []uint32:
err = writeGGUFArray(ws, ggufTypeUint32, v)
case []float32:
err = writeGGUFArray(ws, ggufTypeFloat32, v)
case []string:
if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil {
return err
}
for _, e := range v {
if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil {
return err
}
}
default:
return fmt.Errorf("improper type for '%s'", k)
}
return err
}
func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, []byte(t.Name)); err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, uint32(len(t.Shape))); err != nil {
return err
}
for i := range len(t.Shape) {
if err := binary.Write(ws, binary.LittleEndian, t.Shape[len(t.Shape)-i-1]); err != nil {
return err
}
}
if err := binary.Write(ws, binary.LittleEndian, t.Kind); err != nil {
return err
}
return binary.Write(ws, binary.LittleEndian, t.Offset)
}
func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
offset, err := ws.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
return err
}
_, err = t.WriteTo(ws)
return err
}
func ggufPadding(offset, align int64) int64 {
return (align - offset%align) % align
}
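A minimal sketch (not part of this commit) of writing a metadata-only GGUF file with WriteGGUF; the keys shown are assumptions for illustration and tensors are omitted for brevity.
package main

import (
	"os"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	f, err := os.Create("tiny.gguf") // hypothetical output path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	kv := ggml.KV{
		"general.architecture":  "llama",
		"llama.block_count":     uint32(1),
		"tokenizer.ggml.tokens": []string{"<s>", "</s>"},
	}
	// writes the GGUF v3 header, sorted key-values, and (here) zero tensors
	if err := ggml.WriteGGUF(f, kv, nil); err != nil {
		panic(err)
	}
}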

180
fs/ggml/type.go Normal file

@ -0,0 +1,180 @@
package ggml
import "fmt"
type fileType uint32
const (
fileTypeF32 fileType = iota
fileTypeF16
fileTypeQ4_0
fileTypeQ4_1
fileTypeQ4_1_F16
fileTypeQ4_2 // unused
fileTypeQ4_3 // unused
fileTypeQ8_0
fileTypeQ5_0
fileTypeQ5_1
fileTypeQ2_K
fileTypeQ3_K_S
fileTypeQ3_K_M
fileTypeQ3_K_L
fileTypeQ4_K_S
fileTypeQ4_K_M
fileTypeQ5_K_S
fileTypeQ5_K_M
fileTypeQ6_K
fileTypeIQ2_XXS
fileTypeIQ2_XS
fileTypeQ2_K_S
fileTypeIQ3_XS
fileTypeIQ3_XXS
fileTypeIQ1_S
fileTypeIQ4_NL
fileTypeIQ3_S
fileTypeIQ2_S
fileTypeIQ4_XS
fileTypeIQ2_M
fileTypeIQ1_M
fileTypeBF16
fileTypeUnknown
)
func ParseFileType(s string) (fileType, error) {
switch s {
case "F32":
return fileTypeF32, nil
case "F16":
return fileTypeF16, nil
case "Q4_0":
return fileTypeQ4_0, nil
case "Q4_1":
return fileTypeQ4_1, nil
case "Q4_1_F16":
return fileTypeQ4_1_F16, nil
case "Q8_0":
return fileTypeQ8_0, nil
case "Q5_0":
return fileTypeQ5_0, nil
case "Q5_1":
return fileTypeQ5_1, nil
case "Q2_K":
return fileTypeQ2_K, nil
case "Q3_K_S":
return fileTypeQ3_K_S, nil
case "Q3_K_M":
return fileTypeQ3_K_M, nil
case "Q3_K_L":
return fileTypeQ3_K_L, nil
case "Q4_K_S":
return fileTypeQ4_K_S, nil
case "Q4_K_M":
return fileTypeQ4_K_M, nil
case "Q5_K_S":
return fileTypeQ5_K_S, nil
case "Q5_K_M":
return fileTypeQ5_K_M, nil
case "Q6_K":
return fileTypeQ6_K, nil
case "IQ2_XXS":
return fileTypeIQ2_XXS, nil
case "IQ2_XS":
return fileTypeIQ2_XS, nil
case "Q2_K_S":
return fileTypeQ2_K_S, nil
case "IQ3_XS":
return fileTypeIQ3_XS, nil
case "IQ3_XXS":
return fileTypeIQ3_XXS, nil
case "IQ1_S":
return fileTypeIQ1_S, nil
case "IQ4_NL":
return fileTypeIQ4_NL, nil
case "IQ3_S":
return fileTypeIQ3_S, nil
case "IQ2_S":
return fileTypeIQ2_S, nil
case "IQ4_XS":
return fileTypeIQ4_XS, nil
case "IQ2_M":
return fileTypeIQ2_M, nil
case "IQ1_M":
return fileTypeIQ1_M, nil
case "BF16":
return fileTypeBF16, nil
default:
return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
}
}
func (t fileType) String() string {
switch t {
case fileTypeF32:
return "F32"
case fileTypeF16:
return "F16"
case fileTypeQ4_0:
return "Q4_0"
case fileTypeQ4_1:
return "Q4_1"
case fileTypeQ4_1_F16:
return "Q4_1_F16"
case fileTypeQ8_0:
return "Q8_0"
case fileTypeQ5_0:
return "Q5_0"
case fileTypeQ5_1:
return "Q5_1"
case fileTypeQ2_K:
return "Q2_K"
case fileTypeQ3_K_S:
return "Q3_K_S"
case fileTypeQ3_K_M:
return "Q3_K_M"
case fileTypeQ3_K_L:
return "Q3_K_L"
case fileTypeQ4_K_S:
return "Q4_K_S"
case fileTypeQ4_K_M:
return "Q4_K_M"
case fileTypeQ5_K_S:
return "Q5_K_S"
case fileTypeQ5_K_M:
return "Q5_K_M"
case fileTypeQ6_K:
return "Q6_K"
case fileTypeIQ2_XXS:
return "IQ2_XXS"
case fileTypeIQ2_XS:
return "IQ2_XS"
case fileTypeQ2_K_S:
return "Q2_K_S"
case fileTypeIQ3_XS:
return "IQ3_XS"
case fileTypeIQ3_XXS:
return "IQ3_XXS"
case fileTypeIQ1_S:
return "IQ1_S"
case fileTypeIQ4_NL:
return "IQ4_NL"
case fileTypeIQ3_S:
return "IQ3_S"
case fileTypeIQ2_S:
return "IQ2_S"
case fileTypeIQ4_XS:
return "IQ4_XS"
case fileTypeIQ2_M:
return "IQ2_M"
case fileTypeIQ1_M:
return "IQ1_M"
case fileTypeBF16:
return "BF16"
default:
return "unknown"
}
}
func (t fileType) Value() uint32 {
return uint32(t)
}

4
go.mod

@ -18,11 +18,14 @@ require (
require (
github.com/agnivade/levenshtein v1.1.1
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/dlclark/regexp2 v1.11.4
github.com/emirpasic/gods/v2 v2.0.0-alpha
github.com/google/go-cmp v0.6.0
github.com/mattn/go-runewidth v0.0.14
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
golang.org/x/image v0.22.0
gonum.org/v1/gonum v0.15.0
)
require (
@ -42,7 +45,6 @@ require (
github.com/xtgo/set v1.0.0 // indirect
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
gonum.org/v1/gonum v0.15.0 // indirect
gorgonia.org/vecf32 v0.9.0 // indirect
gorgonia.org/vecf64 v0.9.0 // indirect
)

4
go.sum

@ -42,8 +42,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g=
github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc=
github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ=
github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=


@ -9,7 +9,7 @@ import (
"strings"
"sync"
"github.com/ollama/ollama/util/bufioutil"
"github.com/ollama/ollama/fs/util/bufioutil"
)
type GGML struct {

191
ml/backend.go Normal file

@ -0,0 +1,191 @@
package ml
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"strings"
)
type Config interface {
Architecture() string
String(string, ...string) string
Uint(string, ...uint32) uint32
Float(string, ...float32) float32
Strings(string, ...[]string) []string
Uints(string, ...[]uint32) []uint32
}
type Backend interface {
Config() Config
Get(name string) Tensor
NewContext() Context
}
var backends = make(map[string]func(io.ReadSeeker) (Backend, error))
func RegisterBackend(name string, f func(io.ReadSeeker) (Backend, error)) {
if _, ok := backends[name]; ok {
panic("backend: backend already registered")
}
backends[name] = f
}
func NewBackend(r io.ReadSeeker) (Backend, error) {
if backend, ok := backends["ggml"]; ok {
return backend(r)
}
return nil, fmt.Errorf("unsupported backend")
}
type Context interface {
Zeros(dtype DType, shape ...int) Tensor
FromFloatSlice(s []float32, shape ...int) (Tensor, error)
FromIntSlice(s []int32, shape ...int) (Tensor, error)
Forward(Tensor)
Compute(Tensor) Tensor
Close() error
}
type Tensor interface {
Dim(n int) int64
Stride(n int) int64
Shape() []int64
DType() DType
Bytes() []byte
Floats() []float32
Add(ctx Context, t2 Tensor) Tensor
Mul(ctx Context, t2 Tensor) Tensor
Mulmat(ctx Context, t2 Tensor) Tensor
Softmax(ctx Context) Tensor
Norm(ctx Context, eps float32) Tensor
RMSNorm(ctx Context, eps float32) Tensor
Scale(ctx Context, s float64) Tensor
Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
Rope(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
Tanh(ctx Context) Tensor
GELU(ctx Context) Tensor
SILU(ctx Context) Tensor
Reshape(ctx Context, shape ...int64) Tensor
View(ctx Context, offset int, shape ...int) Tensor
Permute(ctx Context, shape ...int) Tensor
Contiguous(ctx Context) Tensor
Pad(ctx Context, shape ...int64) Tensor
Unpad(ctx Context, shape ...int64) Tensor
Stack(ctx Context, dim int, s ...Tensor) Tensor
Concat(ctx Context, t2 Tensor, dim int) Tensor
Rows(ctx Context, t2 Tensor) Tensor
Copy(ctx Context, t2 Tensor) Tensor
}
type number interface {
~int | ~int8 | ~int16 | ~int32 | ~int64 |
~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 |
~float32 | ~float64 |
~complex64 | ~complex128
}
func mul[T number](s ...T) T {
p := T(1)
for _, v := range s {
p *= v
}
return p
}
type DumpOptions struct {
// Items is the number of elements to print at the beginning and end of each dimension.
Items int64
// Precision is the number of decimal places to print. Applies to float32 and float64.
Precision int
}
func Dump(t Tensor, opts ...DumpOptions) string {
if len(opts) < 1 {
opts = append(opts, DumpOptions{
Items: 3,
Precision: 4,
})
}
switch t.DType() {
case DTypeF32:
return dump[[]float32](t, opts[0])
case DTypeI32:
return dump[[]int32](t, opts[0])
default:
return "<unsupported>"
}
}
func dump[S ~[]E, E number](t Tensor, opts DumpOptions) string {
bts := t.Bytes()
if bts == nil {
return "<nil>"
}
s := make(S, mul(t.Shape()...))
if err := binary.Read(bytes.NewBuffer(t.Bytes()), binary.LittleEndian, &s); err != nil {
panic(err)
}
shape := t.Shape()
var sb strings.Builder
var f func([]int64, int64)
f = func(dims []int64, stride int64) {
prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
fmt.Fprint(&sb, "[")
defer func() { fmt.Fprint(&sb, "]") }()
for i := int64(0); i < dims[0]; i++ {
if i >= opts.Items && i < dims[0]-opts.Items {
fmt.Fprint(&sb, "..., ")
// skip to next printable element
skip := dims[0] - 2*opts.Items
if len(dims) > 1 {
stride += mul(append(dims[1:], skip)...)
fmt.Fprint(&sb, strings.Repeat("\n", len(dims)-1), prefix)
}
i += skip - 1
} else if len(dims) > 1 {
f(dims[1:], stride)
stride += mul(dims[1:]...)
if i < dims[0]-1 {
fmt.Fprint(&sb, ",", strings.Repeat("\n", len(dims)-1), prefix)
}
} else {
fmt.Fprint(&sb, s[stride+i])
if i < dims[0]-1 {
fmt.Fprint(&sb, ", ")
}
}
}
}
f(shape, 0)
return sb.String()
}
type DType int
const (
DTypeF32 DType = iota
DTypeI32
DTypeOther
)
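A minimal sketch (not part of this commit) of inspecting a tensor with Dump, assuming a Context obtained from Backend.NewContext as in the earlier sketch; the helper name is made up for illustration.
package example

import (
	"fmt"

	"github.com/ollama/ollama/ml"
)

// dumpExample is a hypothetical helper: it builds a small 3x2 tensor on the
// given context and prints it, showing 2 items at each dimension edge with 2
// decimal places (the DumpOptions fields documented above).
func dumpExample(ctx ml.Context) error {
	t, err := ctx.FromFloatSlice([]float32{1, 2, 3, 4, 5, 6}, 3, 2)
	if err != nil {
		return err
	}
	fmt.Println(ml.Dump(t, ml.DumpOptions{Items: 2, Precision: 2}))
	return nil
}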

5
ml/backend/backend.go Normal file

@ -0,0 +1,5 @@
package backend
import (
_ "github.com/ollama/ollama/ml/backend/ggml"
)

470
ml/backend/ggml/backend.go Normal file

@ -0,0 +1,470 @@
package ggml
// #cgo CPPFLAGS: -DNDEBUG
// #include <stdlib.h>
// #include <stdint.h>
// #include "ggml.h"
// #include "ggml-backend.h"
import "C"
import (
"bytes"
"fmt"
"io"
"log/slog"
"unsafe"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/ml"
)
type Backend struct {
c *C.struct_ggml_context
b *C.struct_ggml_backend
bb *C.struct_ggml_backend_buffer
ggml.KV
ggml.Tensors
}
func New(r io.ReadSeeker) (ml.Backend, error) {
f, _, err := ggml.DecodeGGML(r, -1)
if err != nil {
return nil, err
}
slog.Info(
"",
"architecture", f.KV().Architecture(),
"file_type", f.KV().FileType(),
"name", f.KV().String("general.name"),
"description", f.KV().String("general.description"),
"num_tensors", len(f.Tensors().Items),
"num_key_values", len(f.KV()),
)
c := C.ggml_init(C.struct_ggml_init_params{
mem_size: C.size_t(len(f.Tensors().Items)) * C.ggml_tensor_overhead(),
mem_buffer: nil,
no_alloc: true,
})
for _, t := range f.Tensors().Items {
func() {
cname := C.CString(t.Name)
defer C.free(unsafe.Pointer(cname))
tt := C.ggml_new_tensor(c, t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0])))
C.ggml_set_name(tt, cname)
}()
}
b := newBackend()
bb := C.ggml_backend_alloc_ctx_tensors(c, b)
for _, t := range f.Tensors().Items {
if _, err := r.Seek(int64(f.Tensors().Offset+t.Offset), io.SeekStart); err != nil {
return nil, err
}
var b bytes.Buffer
n, err := io.CopyN(&b, r, int64(t.Size()))
if err != nil {
return nil, err
}
if n != int64(t.Size()) {
return nil, fmt.Errorf("expected %d bytes, got %d", t.Size(), n)
}
func() {
cname := C.CString(t.Name)
defer C.free(unsafe.Pointer(cname))
cbytes := C.CBytes(b.Bytes())
defer C.free(cbytes)
C.ggml_backend_tensor_set(C.ggml_get_tensor(c, cname), cbytes, 0, C.size_t(n))
}()
}
return &Backend{c, b, bb, f.KV(), f.Tensors()}, nil
}
func init() {
ml.RegisterBackend("ggml", New)
}
func (b *Backend) Config() ml.Config {
return b.KV
}
func (b *Backend) Get(name string) ml.Tensor {
cname := C.CString(name)
defer C.free(unsafe.Pointer(cname))
if t := C.ggml_get_tensor(b.c, cname); t != nil {
return &Tensor{t}
}
return nil
}
func (b *Backend) NewContext() ml.Context {
n := max(8192, len(b.Tensors.Items)*5)
bts := make([]byte, C.size_t(n)*C.ggml_tensor_overhead()+C.ggml_graph_overhead_custom(C.size_t(n), false))
c := C.ggml_init(C.struct_ggml_init_params{
mem_buffer: unsafe.Pointer(&bts[0]),
mem_size: C.size_t(len(bts)),
no_alloc: true,
})
return &Context{
b: b.b,
c: c,
g: C.ggml_new_graph_custom(c, C.size_t(n), false),
}
}
type Context struct {
b *C.struct_ggml_backend
c *C.struct_ggml_context
g *C.struct_ggml_cgraph
}
func (c *Context) Forward(t ml.Tensor) {
C.ggml_build_forward_expand(c.g, t.(*Tensor).t)
}
func (c *Context) Compute(t ml.Tensor) ml.Tensor {
c.Forward(t)
a := C.ggml_gallocr_new(C.ggml_backend_get_default_buffer_type(c.b))
C.ggml_gallocr_alloc_graph(a, c.g)
slog.Debug("compute graph memory", "require", format.HumanBytes2(uint64(C.ggml_gallocr_get_buffer_size(a, 0))))
C.ggml_backend_graph_compute(c.b, c.g)
return &Tensor{
C.ggml_graph_node(c.g, C.ggml_graph_n_nodes(c.g)-1),
}
}
func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
if len(shape) < 1 || len(shape) > 4 {
panic("unsupported number of dimensions")
}
for _, dim := range shape {
if dim < 1 {
panic("invalid shape")
}
}
var t *C.struct_ggml_tensor
switch dtype {
case ml.DTypeF32:
t = C.ggml_new_tensor(c.c, C.GGML_TYPE_F32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
case ml.DTypeI32:
t = C.ggml_new_tensor(c.c, C.GGML_TYPE_I32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
default:
panic("unsupported dtype")
}
b := C.ggml_backend_alloc_buffer(c.b, C.ggml_nbytes(t))
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
C.ggml_set_f32(t, 0.)
return &Tensor{t}
}
func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
n := len(s)
for _, v := range shape {
n /= v
}
if n != 1 {
return nil, fmt.Errorf("invalid shape %v for %d elements", shape, len(s))
}
t := C.ggml_new_tensor(ctx.c, dtype, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
b := C.ggml_backend_alloc_buffer(ctx.b, C.ggml_nbytes(t))
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
return &Tensor{t}, nil
}
func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
return fromSlice(c, s, shape, C.GGML_TYPE_F32)
}
func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
return fromSlice(c, s, shape, C.GGML_TYPE_I32)
}
func (c *Context) Close() error {
C.ggml_free(c.c)
return nil
}
type Tensor struct {
t *C.struct_ggml_tensor
}
func (t *Tensor) LogValue() slog.Value {
return slog.GroupValue(
slog.String("name", C.GoString(C.ggml_get_name(t.t))),
slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
slog.Any("shape", t.Shape()),
)
}
func (t *Tensor) Dim(n int) int64 {
return int64(t.t.ne[n])
}
func (t *Tensor) Stride(n int) int64 {
return int64(t.t.nb[n])
}
func (t *Tensor) Shape() []int64 {
shape := make([]int64, C.ggml_n_dims(t.t))
for i := range shape {
shape[i] = t.Dim(i)
}
return shape
}
func (t *Tensor) Bytes() []byte {
if bts := C.ggml_get_data(t.t); bts != nil {
return C.GoBytes(bts, C.int(C.ggml_nbytes(t.t)))
}
return nil
}
func (t *Tensor) Floats() []float32 {
if s := C.ggml_get_data_f32(t.t); s != nil {
f32s := make([]float32, C.ggml_nelements(t.t))
for i, v := range unsafe.Slice(s, C.ggml_nelements(t.t)) {
f32s[i] = float32(v)
}
return f32s
}
return nil
}
func (t *Tensor) DType() ml.DType {
switch t.t._type {
case C.GGML_TYPE_F32:
return ml.DTypeF32
case C.GGML_TYPE_I32:
return ml.DTypeI32
default:
return ml.DTypeOther
}
}
func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
C.ggml_add(ctx.(*Context).c, t.t, t2.(*Tensor).t),
}
}
func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
if len(s) > 0 {
return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
}
return t
}
func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
return &Tensor{
C.ggml_concat(ctx.(*Context).c, t.t, t2.(*Tensor).t, C.int(dim)),
}
}
func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
return &Tensor{
C.ggml_cont(ctx.(*Context).c, t.t),
}
}
func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
C.ggml_mul(ctx.(*Context).c, t.t, t2.(*Tensor).t),
}
}
func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
C.ggml_mul_mat(ctx.(*Context).c, t.t, t2.(*Tensor).t),
}
}
func (t *Tensor) Norm(ctx ml.Context, eps float32) ml.Tensor {
return &Tensor{
C.ggml_norm(ctx.(*Context).c, t.t, (C.float)(eps)),
}
}
func (t *Tensor) RMSNorm(ctx ml.Context, eps float32) ml.Tensor {
return &Tensor{
C.ggml_rms_norm(ctx.(*Context).c, t.t, C.float(eps)),
}
}
func (t *Tensor) Pad(ctx ml.Context, shape ...int64) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
}
return &Tensor{
C.ggml_pad(ctx.(*Context).c, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
}
}
func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
}
return &Tensor{
C.ggml_permute(ctx.(*Context).c, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
}
}
func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
C.ggml_get_rows(ctx.(*Context).c, t.t, t2.(*Tensor).t),
}
}
func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
return &Tensor{
C.ggml_cpy(ctx.(*Context).c, t.t, t2.(*Tensor).t),
}
}
func (t *Tensor) Reshape(ctx ml.Context, shape ...int64) ml.Tensor {
switch len(shape) {
case 1:
return &Tensor{
C.ggml_reshape_1d(ctx.(*Context).c, t.t, C.int64_t(shape[0])),
}
case 2:
return &Tensor{
C.ggml_reshape_2d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
}
case 3:
return &Tensor{
C.ggml_reshape_3d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
}
case 4:
return &Tensor{
C.ggml_reshape_4d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
}
default:
panic("unsupported number of dimensions")
}
}
func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
return &Tensor{
C.ggml_scale(ctx.(*Context).c, t.t, (C.float)(s)),
}
}
func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
return &Tensor{
C.ggml_soft_max(ctx.(*Context).c, t.t),
}
}
func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
return &Tensor{
C.ggml_tanh_inplace(ctx.(*Context).c, t.t),
}
}
func (t *Tensor) Unpad(ctx ml.Context, shape ...int64) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
}
return &Tensor{
C.ggml_unpad(ctx.(*Context).c, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
}
}
func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
switch len(shape) {
case 1:
return &Tensor{
C.ggml_view_1d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.size_t(offset)),
}
case 3:
return &Tensor{
C.ggml_view_2d(ctx.(*Context).c, t.t,
C.int64_t(shape[0]), C.int64_t(shape[2]),
C.size_t(shape[1]),
C.size_t(offset)),
}
case 5:
return &Tensor{
C.ggml_view_3d(ctx.(*Context).c, t.t,
C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
C.size_t(shape[1]), C.size_t(shape[3]),
C.size_t(offset)),
}
case 7:
return &Tensor{
C.ggml_view_4d(ctx.(*Context).c, t.t,
C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
C.size_t(offset)),
}
default:
panic("unsupported number of dimensions")
}
}
const (
ropeTypeNorm C.int = iota
)
func (t *Tensor) Rope(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
return &Tensor{
C.ggml_rope_ext(
ctx.(*Context).c, t.t, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
C.int(ropeDim),
131072, // YaRN n_ctx_train
ropeTypeNorm, // ROPE_TYPE_NORM
C.float(ropeBase),
C.float(ropeScale),
0., // YaRN ext_factor
1., // YaRN attn_factor
32., // YaRN beta_fast
1., // YaRN beta_slow
),
}
}
func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
return &Tensor{
C.ggml_gelu_inplace(ctx.(*Context).c, t.t),
}
}
func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
return &Tensor{
C.ggml_silu_inplace(ctx.(*Context).c, t.t),
}
}
func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
return &Tensor{
C.ggml_conv_2d(ctx.(*Context).c, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
}
}
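Note that View above takes dimensions interleaved with byte strides (dim0, stride1, dim1, stride2, dim2, ...), mirroring ggml_view_2d/3d/4d; this is the layout cache.Simple relies on. A small sketch (not part of this commit, helper name made up) of slicing the first n rows of a 2D tensor:
package example

import "github.com/ollama/ollama/ml"

// viewRows is a hypothetical helper: it returns a view of the first n rows
// of a 2D tensor, passing arguments in the order Tensor.View expects
// (offset, dim0, byte stride between rows, dim1).
func viewRows(ctx ml.Context, t ml.Tensor, n int) ml.Tensor {
	return t.View(ctx, 0,
		int(t.Dim(0)), int(t.Stride(1)),
		n,
	)
}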


@ -0,0 +1,8 @@
package ggml
// #include "ggml-backend.h"
import "C"
func newCPUBackend() *C.struct_ggml_backend {
return C.ggml_backend_cpu_init()
}


@ -0,0 +1,13 @@
package ggml
//go:generate sh -c "echo \"// Code generated $(date). DO NOT EDIT.\n\" >ggml-metal-embed.metal"
//go:generate sh -c "sed -e '/#include \"ggml-common.h\"/r ggml-common.h' -e '/#include \"ggml-common.h\"/d' ggml-metal.metal >>ggml-metal-embed.metal"
// #cgo arm64 CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DGGML_METAL_NDEBUG
// #cgo arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
// #include "ggml-metal.h"
import "C"
func newBackend() *C.struct_ggml_backend {
return C.ggml_backend_metal_init()
}


@ -0,0 +1,6 @@
//go:build debug
package ggml
// #cgo CPPFLAGS: -DOLLAMA_DEBUG
import "C"


@ -0,0 +1,10 @@
package ggml
// #cgo CPPFLAGS: -D_GNU_SOURCE
// #cgo LDFLAGS: -lm
// #include "ggml-backend.h"
import "C"
func newBackend() *C.struct_ggml_backend {
return newCPUBackend()
}


@ -0,0 +1,10 @@
package ggml
// #cgo CPPFLAGS: -D_WIN32_WINNT=0x602
// #cgo LDFLAGS: -lmsvcrt -static -static-libgcc -static-libstdc++
// #include "ggml-backend.h"
import "C"
func newBackend() *C.struct_ggml_backend {
return newCPUBackend()
}

3235
ml/backend/ggml/ggml-aarch64.c vendored Normal file

File diff suppressed because it is too large.

65
ml/backend/ggml/ggml-aarch64.h vendored Normal file

@ -0,0 +1,65 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus
}
#endif

1068
ml/backend/ggml/ggml-alloc.c vendored Normal file

File diff suppressed because it is too large.

102
ml/backend/ggml/ggml-alloc.h vendored Normal file

@ -0,0 +1,102 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
// Tensor allocator
struct ggml_tallocr {
ggml_backend_buffer_t buffer;
void * base;
size_t alignment;
size_t offset;
};
GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
// Graph allocator
/*
Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
// optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch));
// allocate the graph
struct ggml_cgraph * graph = build_graph(batch);
ggml_gallocr_alloc_graph(galloc, graph);
printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
// evaluate the graph
ggml_backend_graph_compute(backend, graph);
*/
// special tensor flags for use with the graph allocator:
// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
// ggml_set_output(): output tensors are never freed and never overwritten
typedef struct ggml_gallocr * ggml_gallocr_t;
GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
// pre-allocate buffers from a measure graph - does not allocate or modify the graph
// call with a worst-case graph to avoid buffer reallocations
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
// returns false if the buffer allocation failed
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API bool ggml_gallocr_reserve_n(
ggml_gallocr_t galloc,
struct ggml_cgraph * graph,
const int * node_buffer_ids,
const int * leaf_buffer_ids);
// automatic reallocation if the topology changes when using a single buffer
// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
#ifdef __cplusplus
}
#endif

180
ml/backend/ggml/ggml-backend-impl.h vendored Normal file

@ -0,0 +1,180 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
// ggml-backend internal header
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
//
// Backend buffer
//
// buffer type
typedef void * ggml_backend_buffer_type_context_t;
struct ggml_backend_buffer_type_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
// allocate a buffer of this type
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
// tensor alignment
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
// max buffer size that can be allocated
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
// data size needed to allocate the tensor, including padding
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// check if tensor data is in host memory
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
};
struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface;
ggml_backend_buffer_type_context_t context;
};
// buffer
typedef void * ggml_backend_buffer_context_t;
struct ggml_backend_buffer_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
void (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer);
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
void (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};
struct ggml_backend_buffer {
struct ggml_backend_buffer_i iface;
ggml_backend_buffer_type_t buft;
ggml_backend_buffer_context_t context;
size_t size;
enum ggml_backend_buffer_usage usage;
};
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size);
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend
//
typedef void * ggml_backend_context_t;
struct ggml_backend_i {
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
void (*GGML_CALL free)(ggml_backend_t backend);
// buffer allocation
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan (not used currently)
// create a new plan for a graph
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
// compute the graph with the plan
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async)
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// check if the backend can compute an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// check if the backend can use tensors allocated in a buffer type
bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
// create a new event that can record events on this backend instance
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
// record an event on the backend instance that created it
void (*GGML_CALL event_record) (ggml_backend_event_t event);
// wait for an event on a different backend instance
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
// block until an event is recorded
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
};
struct ggml_backend {
ggml_guid_t guid;
struct ggml_backend_i iface;
ggml_backend_context_t context;
};
struct ggml_backend_event {
ggml_backend_t backend;
void * context;
};
//
// Backend registry
//
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
#ifdef __cplusplus
}
#endif

2325 ml/backend/ggml/ggml-backend.c vendored Normal file

File diff suppressed because it is too large

267 ml/backend/ggml/ggml-backend.h vendored Normal file
View File

@ -0,0 +1,267 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "ggml.h"
#include "ggml-alloc.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
//
// Backend buffer
//
// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
};
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
//
// Backend
//
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void ggml_backend_free(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset of the tensor data for setting/getting data
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
//
// CPU backend
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
//
// Backend registry
//
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
//
// Backend scheduler
//
// The backend scheduler allows for multiple backends to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
// - the location of the pre-allocated tensors (e.g. the weights)
/*
Example usage:
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// preferably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
// initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size);
// manually assign nodes to a backend (optional, should not be needed in most cases)
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
ggml_backend_sched_reserve(sched, reserve_graph);
// compute
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
// if there are graph inputs:
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
*/
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
// when ask == false, the scheduler is passing the node tensor to the user for observation
// if the user returns false, the scheduler will cancel the graph compute
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
//
// Utils
//
struct ggml_backend_graph_copy {
ggml_backend_buffer_t buffer;
struct ggml_context * ctx_allocated;
struct ggml_context * ctx_unallocated;
struct ggml_cgraph * graph;
};
// Copy a graph to a different backend
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
#ifdef __cplusplus
}
#endif
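As a quick orientation, a minimal sketch of driving the API above with the CPU backend might look like the following (illustrative only, not part of this commit; ggml_init, ggml_new_tensor_1d, ggml_add, ggml_new_graph and ggml_build_forward_expand come from ggml.h, and ggml_backend_alloc_ctx_tensors from ggml-alloc.h, neither of which is shown in this diff):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void backend_add_example(void) {
    // metadata-only context; tensor data lives in the backend buffer
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    const float av[4] = {1, 2, 3, 4}, bv[4] = {5, 6, 7, 8};
    float cv[4];
    ggml_backend_tensor_set(a, av, 0, sizeof(av));
    ggml_backend_tensor_set(b, bv, 0, sizeof(bv));

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    ggml_backend_graph_compute(backend, gf);       // c = a + b on the CPU backend
    ggml_backend_tensor_get(c, cv, 0, sizeof(cv)); // cv is now {6, 8, 10, 12}

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
}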

49 ml/backend/ggml/ggml-blas.h vendored Normal file
View File

@ -0,0 +1,49 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
#ifdef __cplusplus
}
#endif
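A minimal usage sketch for these declarations (illustrative only, not part of this commit):

ggml_backend_t blas = ggml_backend_blas_init();
if (blas != NULL) {
    // controls the float-conversion threads (and the BLAS threads for OpenBLAS/BLIS)
    ggml_backend_blas_set_n_threads(blas, 8);
}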

1879 ml/backend/ggml/ggml-common.h vendored Normal file

File diff suppressed because it is too large

640 ml/backend/ggml/ggml-cpu-impl.h vendored Normal file
View File

@ -0,0 +1,640 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
// GGML CPU internal header
#include "ggml.h"
#include "ggml-impl.h"
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
//#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h> // fabsf
#ifdef __cplusplus
extern "C" {
#endif
#if defined(_MSC_VER)
#define m512bh(p) p
#define m512i(p) p
#else
#define m512bh(p) (__m512bh)(p)
#define m512i(p) (__m512i)(p)
#endif
/**
 * Converts brain16 to float32.
 *
 * The bfloat16 floating point format has the following structure:
 *
 *     sign | exponent (8 bits) | mantissa (7 bits)
 *     0b0000000000000000 brain16
 *
 * Since bf16 has the same number of exponent bits as a 32bit float,
 * encoding and decoding numbers becomes relatively straightforward.
 *
 *     sign | exponent (8 bits) | mantissa (23 bits)
 *     0b00000000000000000000000000000000 IEEE binary32
 *
 * For comparison, the standard fp16 format has fewer exponent bits.
 *
 *     sign | exponent (5 bits) | mantissa (10 bits)
 *     0b0000000000000000 IEEE binary16
 *
 * @see IEEE 754-2008
 */
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
/**
* Converts float32 to brain16.
*
* This is binary identical with Google Brain float conversion.
* Floats shall round to nearest even, and NANs shall be quiet.
* Subnormals aren't flushed to zero, except perhaps when used.
* This code should vectorize nicely if using modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
ggml_bf16_t h;
union {
float f;
uint32_t i;
} u;
u.f = s;
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
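// Illustrative worked example (not from the vendored source): 1.25f has the
// IEEE-754 bits 0x3FA00000; adding the rounding bias and keeping the top 16
// bits gives the bf16 value 0x3FA0, which converts back to exactly 1.25f.
// 0x3F800100 (~1.0000305f) rounds down to 0x3F80, i.e. 1.0f, showing the
// precision limit of the 7-bit mantissa.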
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#endif
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <sys/prctl.h>
#endif
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#ifdef _MSC_VER
typedef uint16_t ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
typedef __fp16 ggml_fp16_internal_t;
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif // _MSC_VER
#if !defined(__aarch64__)
// 32-bit ARM compatibility
// vaddlvq_s16
// vpaddq_s16
// vpaddq_s32
// vaddvq_s32
// vaddvq_f32
// vmaxvq_f32
// vcvtnq_s32_f32
// vzip1_u8
// vzip2_u8
inline static int32_t vaddlvq_s16(int16x8_t v) {
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
}
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
return vcombine_s16(a0, b0);
}
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
return vcombine_s32(a0, b0);
}
inline static int32_t vaddvq_s32(int32x4_t v) {
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
}
inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
inline static float vmaxvq_f32(float32x4_t v) {
return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
int32x4_t res;
res[0] = roundf(vgetq_lane_f32(v, 0));
res[1] = roundf(vgetq_lane_f32(v, 1));
res[2] = roundf(vgetq_lane_f32(v, 2));
res[3] = roundf(vgetq_lane_f32(v, 3));
return res;
}
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[0]; res[1] = b[0];
res[2] = a[1]; res[3] = b[1];
res[4] = a[2]; res[5] = b[2];
res[6] = a[3]; res[7] = b[3];
return res;
}
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t res;
res[0] = a[4]; res[1] = b[4];
res[2] = a[5]; res[3] = b[5];
res[4] = a[6]; res[5] = b[6];
res[6] = a[7]; res[7] = b[7];
return res;
}
// vld1q_s16_x2
// vld1q_u8_x2
// vld1q_u8_x4
// vld1q_s8_x2
// vld1q_s8_x4
// TODO: double-check these work correctly
typedef struct ggml_int16x8x2_t {
int16x8_t val[2];
} ggml_int16x8x2_t;
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
ggml_int16x8x2_t res;
res.val[0] = vld1q_s16(ptr + 0);
res.val[1] = vld1q_s16(ptr + 8);
return res;
}
typedef struct ggml_uint8x16x2_t {
uint8x16_t val[2];
} ggml_uint8x16x2_t;
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
ggml_uint8x16x2_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
return res;
}
typedef struct ggml_uint8x16x4_t {
uint8x16_t val[4];
} ggml_uint8x16x4_t;
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
ggml_uint8x16x4_t res;
res.val[0] = vld1q_u8(ptr + 0);
res.val[1] = vld1q_u8(ptr + 16);
res.val[2] = vld1q_u8(ptr + 32);
res.val[3] = vld1q_u8(ptr + 48);
return res;
}
typedef struct ggml_int8x16x2_t {
int8x16_t val[2];
} ggml_int8x16x2_t;
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
ggml_int8x16x2_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
return res;
}
typedef struct ggml_int8x16x4_t {
int8x16_t val[4];
} ggml_int8x16x4_t;
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
ggml_int8x16x4_t res;
res.val[0] = vld1q_s8(ptr + 0);
res.val[1] = vld1q_s8(ptr + 16);
res.val[2] = vld1q_s8(ptr + 32);
res.val[3] = vld1q_s8(ptr + 48);
return res;
}
// NOTE: not tested
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
int8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
// NOTE: not tested
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
uint8x16_t res;
res[ 0] = a[b[ 0]];
res[ 1] = a[b[ 1]];
res[ 2] = a[b[ 2]];
res[ 3] = a[b[ 3]];
res[ 4] = a[b[ 4]];
res[ 5] = a[b[ 5]];
res[ 6] = a[b[ 6]];
res[ 7] = a[b[ 7]];
res[ 8] = a[b[ 8]];
res[ 9] = a[b[ 9]];
res[10] = a[b[10]];
res[11] = a[b[11]];
res[12] = a[b[12]];
res[13] = a[b[13]];
res[14] = a[b[14]];
res[15] = a[b[15]];
return res;
}
#else
#define ggml_int16x8x2_t int16x8x2_t
#define ggml_uint8x16x2_t uint8x16x2_t
#define ggml_uint8x16x4_t uint8x16x4_t
#define ggml_int8x16x2_t int8x16x2_t
#define ggml_int8x16x4_t int8x16x4_t
#define ggml_vld1q_s16_x2 vld1q_s16_x2
#define ggml_vld1q_u8_x2 vld1q_u8_x2
#define ggml_vld1q_u8_x4 vld1q_u8_x4
#define ggml_vld1q_s8_x2 vld1q_s8_x2
#define ggml_vld1q_s8_x4 vld1q_s8_x4
#define ggml_vqtbl1q_s8 vqtbl1q_s8
#define ggml_vqtbl1q_u8 vqtbl1q_u8
#endif // !defined(__aarch64__)
#if !defined(__ARM_FEATURE_DOTPROD)
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}
#else
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
#endif // !defined(__ARM_FEATURE_DOTPROD)
#endif // defined(__ARM_NEON)
#if defined(__ARM_NEON) && !defined(_MSC_VER)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#if defined(__loongarch64)
#if defined(__loongarch_asx)
#include <lasxintrin.h>
#endif
#if defined(__loongarch_sx)
#include <lsxintrin.h>
#endif
#endif
#if defined(__loongarch_asx)
typedef union {
int32_t i;
float f;
} ft_union;
/* float type data load instructions */
static __m128 __lsx_vreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}
static __m256 __lasx_xvreplfr2vr_s(float val) {
ft_union fi_tmpval = {.f = val};
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
}
#endif
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
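// Illustrative sketch (not from the vendored source): the table above is
// filled once at startup, roughly as ggml_init() is expected to do it:
//
//   for (uint32_t i = 0; i < (1u << 16); ++i) {
//       union { uint16_t u; ggml_fp16_t f; } c = { .u = (uint16_t) i };
//       ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(c.f);
//   }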
#ifdef __cplusplus
}
#endif

75 ml/backend/ggml/ggml-cuda.h vendored Normal file
View File

@ -0,0 +1,75 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA)
#define GGML_CUDA_NAME "MUSA"
#define GGML_CUBLAS_NAME "muBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_CUDA_MAX_DEVICES 16
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
#ifdef __cplusplus
}
#endif
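A short sketch of the device-enumeration calls above (illustrative only, not part of this commit), falling back to the CPU backend when no CUDA device is present:

ggml_backend_t init_gpu_or_cpu_backend(void) {
    int n = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n; i++) {
        char desc[128];
        size_t free, total;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free, &total);
        // log desc, free and total here if desired
    }
    return n > 0 ? ggml_backend_cuda_init(0) : ggml_backend_cpu_init();
}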

110 ml/backend/ggml/ggml-debug.c vendored Normal file
View File

@ -0,0 +1,110 @@
#include <string.h>
#include "ggml-debug.h"
// product of the first ndims entries of dims (number of elements in that sub-tensor)
static int mul(int64_t *dims, int ndims) {
int result = 1;
for (int i = 0; i < ndims; i++) {
result *= dims[i];
}
return result;
}
static void repeat(char c, int n) {
for (int i = 0; i < n; i++) {
fprintf(stderr, "%c", c);
}
}
// recursively print a tensor, eliding the middle of any dimension longer than 2*nitems
static void print_tensor(const void *tensor, void (*cb)(const void *, int),
int shape,
int64_t *dims, int ndims, int stride,
int nitems, int pad) {
fprintf(stderr, "[");
for (int i = 0; i < dims[0]; i++) {
if (i >= nitems && i < dims[0] - nitems) {
fprintf(stderr, "... (%lld more), ", dims[0] - 2 * nitems);
int skip = dims[0] - 2 * nitems;
if (ndims > 1) {
stride += mul(dims + 1, ndims - 1) * skip;
repeat('\n', ndims - 1);
repeat(' ', shape - ndims + 1 + pad);
}
i += skip - 1;
} else if (ndims > 1) {
print_tensor(tensor, cb, shape, dims + 1, ndims - 1, stride,
nitems, pad);
stride += mul(dims + 1, ndims - 1);
if (i < dims[0] - 1) {
fprintf(stderr, ", ");
repeat('\n', ndims - 1);
repeat(' ', shape - ndims + 1 + pad);
}
} else {
cb(tensor, stride + i);
if (i < dims[0] - 1) {
fprintf(stderr, ", ");
}
}
}
fprintf(stderr, "]");
}
static void print_tensor_f16(const void *tensor, int i) {
fprintf(stderr, "%f", ggml_fp16_to_fp32(((const ggml_fp16_t *)tensor)[i]));
}
static void print_tensor_f32(const void *tensor, int i) {
fprintf(stderr, "%f", ((const float *)tensor)[i]);
}
static void print_tensor_i32(const void *tensor, int i) {
fprintf(stderr, "%d", ((const int32_t *)tensor)[i]);
}
// print a tensor header (name, op, type, shape) and, if verbose, its abbreviated contents
static void ggml_debug_tensor(const struct ggml_tensor *tensor, bool verbose, const char *prefix, int indent) {
fprintf(stderr, "%s%s %s (%s): [%lld %lld %lld %lld]\n", prefix, tensor->name,
ggml_op_name(tensor->op), ggml_type_name(tensor->type), tensor->ne[0],
tensor->ne[1], tensor->ne[2], tensor->ne[3]);
if (!verbose) {
return;
}
for (int i = 0; i < indent; i++) {
fprintf(stderr, " ");
}
switch (tensor->type) {
case GGML_TYPE_F16:
print_tensor(ggml_get_data(tensor), print_tensor_f16, ggml_n_dims(tensor),
(int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
break;
case GGML_TYPE_F32:
print_tensor(ggml_get_data(tensor), print_tensor_f32, ggml_n_dims(tensor),
(int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
break;
case GGML_TYPE_I32:
print_tensor(ggml_get_data(tensor), print_tensor_i32, ggml_n_dims(tensor),
(int64_t *)tensor->ne, ggml_n_dims(tensor), 0, 3, indent);
break;
default:
fprintf(stderr, "<unsupported type>\n");
return;
}
fprintf(stderr, "\n");
}
// dump a tensor and its source operands to stderr
void ggml_debug(const struct ggml_tensor *tensor, bool verbose) {
ggml_debug_tensor(tensor, verbose, ">>> ", 4);
if (tensor->src[0] != NULL) {
ggml_debug_tensor(tensor->src[0], verbose, " ?? ", 4);
}
if (tensor->src[1] != NULL) {
ggml_debug_tensor(tensor->src[1], verbose, " ?? ", 4);
}
}

3 ml/backend/ggml/ggml-debug.h vendored Normal file
View File

@ -0,0 +1,3 @@
#include "ggml.h"
void ggml_debug(const struct ggml_tensor *tensor, bool verbose);
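ggml_debug pairs naturally with the scheduler eval callback declared in ggml-backend.h; a hypothetical hookup (illustrative only, not part of this commit) could look like:

static bool debug_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return true; // observe every node
    }
    ggml_debug(t, /*verbose=*/ false);
    return true; // keep computing the graph
}

// ggml_backend_sched_set_eval_callback(sched, debug_cb, NULL);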

212 ml/backend/ggml/ggml-impl.h vendored Normal file
View File

@ -0,0 +1,212 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
// GGML internal header
#include "ggml.h"
#include <assert.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
#include <stdbool.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef __cplusplus
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
#endif
// bitset
typedef uint32_t ggml_bitset_t;
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
static size_t ggml_bitset_size(size_t n) {
return (n + BITSET_MASK) >> BITSET_SHR;
}
static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
}
static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
}
static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
}
// hash set
#define GGML_HASHSET_FULL ((size_t)-1)
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set {
size_t size;
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
};
struct ggml_hash_set ggml_hash_set_new(size_t size);
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
// returns the minimum size for a hash set that can hold min_sz elements
size_t ggml_hash_size(size_t min_sz);
// remove all elements from the hash set
void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
// returns true if key is in the hash set
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// return index, asserts if table is full
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// hash function for ggml_tensor
static inline size_t ggml_hash(const struct ggml_tensor * p) {
// the last 4 bits are always zero due to alignment
return (size_t)(uintptr_t)p >> 4;
}
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;
// linear probing
size_t i = h;
while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
i = (i + 1) % hash_set->size;
if (i == h) {
// visited all hash table entries -> not found
return GGML_HASHSET_FULL;
}
}
return i;
}
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t i = ggml_hash_find(hash_set, key);
return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
}
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;
// linear probing
size_t i = h;
do {
if (!ggml_bitset_get(hash_set->used, i)) {
ggml_bitset_set(hash_set->used, i);
hash_set->keys[i] = key;
return i;
}
if (hash_set->keys[i] == key) {
return GGML_HASHSET_ALREADY_EXISTS;
}
i = (i + 1) % hash_set->size;
} while (i != h);
// visited all hash table entries -> not found
GGML_ABORT("fatal error");
}
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;
// linear probing
size_t i = h;
do {
if (!ggml_bitset_get(hash_set->used, i)) {
ggml_bitset_set(hash_set->used, i);
hash_set->keys[i] = key;
return i;
}
if (hash_set->keys[i] == key) {
return i;
}
i = (i + 1) % hash_set->size;
} while (i != h);
// visited all hash table entries -> not found
GGML_ABORT("fatal error");
}
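// Illustrative sketch (not from the vendored source): a typical use of the
// hash set is deduplicating tensors while walking a graph, e.g.
//
//   struct ggml_hash_set visited = ggml_hash_set_new(cgraph->n_nodes);
//   for (int i = 0; i < cgraph->n_nodes; i++) {
//       if (ggml_hash_insert(&visited, cgraph->nodes[i]) != GGML_HASHSET_ALREADY_EXISTS) {
//           // first time this tensor is seen
//       }
//   }
//   ggml_hash_set_free(&visited);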
// computation graph
enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
GGML_CGRAPH_EVAL_ORDER_COUNT
};
struct ggml_cgraph {
int size;
int n_nodes;
int n_leafs;
struct ggml_tensor ** nodes;
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;
struct ggml_hash_set visited_hash_set;
enum ggml_cgraph_eval_order order;
};
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
#ifdef __cplusplus
}
#endif

8325 ml/backend/ggml/ggml-metal-embed.metal vendored Normal file

File diff suppressed because it is too large

88 ml/backend/ggml/ggml-metal.h vendored Normal file
View File

@ -0,0 +1,88 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
// An interface for computing a ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
//
// How does it work?
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stddef.h>
#include <stdbool.h>
struct ggml_tensor;
struct ggml_cgraph;
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
// user-code should use only these functions
//
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
GGML_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
#ifdef __cplusplus
}
#endif
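A brief initialization sketch (illustrative only, not part of this commit); whether family index 7 corresponds to MTLGPUFamilyApple7 is an assumption about the implementation:

ggml_backend_t metal = ggml_backend_metal_init();
if (metal != NULL && ggml_backend_metal_supports_family(metal, 7)) {
    // assumed: device is at least an Apple7-class GPU (see the feature-set tables above)
}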

6445 ml/backend/ggml/ggml-metal.metal vendored Normal file

File diff suppressed because it is too large

3669 ml/backend/ggml/ggml-metal_darwin_arm64.m vendored Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,6 @@
.section __DATA, __ggml_metallib
.globl _ggml_metallib_start
_ggml_metallib_start:
.incbin "ggml-metal-embed.metal"
.globl _ggml_metallib_end
_ggml_metallib_end:

15778 ml/backend/ggml/ggml-quants.c vendored Normal file

File diff suppressed because it is too large

173 ml/backend/ggml/ggml-quants.h vendored Normal file
View File

@ -0,0 +1,173 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
#ifdef __cplusplus
}
#endif

23355
ml/backend/ggml/ggml.c vendored Normal file

File diff suppressed because it is too large

2595
ml/backend/ggml/ggml.h vendored Normal file

File diff suppressed because it is too large

14
ml/backend/ggml/sgemm.h vendored Normal file
View File

@ -0,0 +1,14 @@
#pragma once
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
const void *, int64_t, void *, int64_t, int, int,
int, int, int);
#ifdef __cplusplus
}
#endif

11
ml/nn/convolution.go Normal file
View File

@ -0,0 +1,11 @@
package nn
import "github.com/ollama/ollama/ml"
type Conv2D struct {
Weight ml.Tensor `ggml:"weight"`
}
func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
return m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
}

11
ml/nn/embedding.go Normal file
View File

@ -0,0 +1,11 @@
package nn
import "github.com/ollama/ollama/ml"
type Embedding struct {
Weight ml.Tensor `ggml:"weight"`
}
func (m *Embedding) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
return m.Weight.Rows(ctx, hiddenState)
}

17
ml/nn/linear.go Normal file
View File

@ -0,0 +1,17 @@
package nn
import "github.com/ollama/ollama/ml"
type Linear struct {
Weight ml.Tensor `ggml:"weight"`
Bias ml.Tensor `ggml:"bias"`
}
func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
t = m.Weight.Mulmat(ctx, t)
if m.Bias != nil {
t = t.Add(ctx, m.Bias)
}
return t
}

33
ml/nn/normalization.go Normal file
View File

@ -0,0 +1,33 @@
package nn
import (
"github.com/ollama/ollama/ml"
)
type LayerNorm struct {
Weight ml.Tensor `ggml:"weight"`
Bias ml.Tensor `ggml:"bias"`
}
func (m *LayerNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
t = t.Norm(ctx, eps).Mul(ctx, m.Weight)
if m.Bias != nil {
t = t.Add(ctx, m.Bias)
}
return t
}
type RMSNorm struct {
Weight ml.Tensor `ggml:"weight"`
Bias ml.Tensor `ggml:"bias"`
}
func (m *RMSNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
t = t.RMSNorm(ctx, eps).Mul(ctx, m.Weight)
if m.Bias != nil {
t = t.Add(ctx, m.Bias)
}
return t
}

154
model/cmd/main.go Normal file
View File

@ -0,0 +1,154 @@
package main
import (
"errors"
"flag"
"fmt"
"image"
"io"
"log/slog"
"os"
"path/filepath"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
_ "github.com/ollama/ollama/model/llama"
_ "github.com/ollama/ollama/model/mllama"
"github.com/ollama/ollama/sample"
)
var args struct {
n int
debug bool
image string
cache bool
}
func temp() error {
flag.IntVar(&args.n, "n", 10, "number of samples")
flag.BoolVar(&args.debug, "debug", false, "enable debug logging")
flag.StringVar(&args.image, "image", "", "path to image file")
flag.BoolVar(&args.cache, "cache", false, "enable KV cache")
flag.Parse()
if len(flag.Args()) != 1 {
return fmt.Errorf("usage: %s path/to/file <prompt\n", filepath.Base(os.Args[0]))
}
level := slog.LevelInfo
if args.debug {
level = slog.LevelDebug
}
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
if attr.Key == slog.SourceKey {
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
})))
m, err := model.New(flag.Arg(0))
if err != nil {
return err
}
prompt, err := io.ReadAll(os.Stdin)
if err != nil {
return err
}
inputIDs, err := m.(model.TextProcessor).Encode(string(prompt))
if err != nil {
return err
}
var opts []model.OptionsFunc
if args.cache {
opts = append(opts, model.WithCache(&cache.Simple{
Capacity: 2048,
DType: ml.DTypeF32,
}))
}
if args.image != "" {
if err := func() error {
f, err := os.Open(args.image)
if err != nil {
return err
}
defer f.Close()
img, _, err := image.Decode(f)
if err != nil {
return err
}
opts = append(opts, model.WithImage(img))
return nil
}(); err != nil {
return err
}
}
var offset int
for range args.n {
logit, err := model.Forward(m, append(opts, model.WithInputIDs(inputIDs), model.WithOffset(offset))...)
if err != nil {
return err
}
f32s := logit.Floats()
f64s := make([]float64, len(f32s))
for i, f32 := range f32s {
f64s[i] = float64(f32)
}
// do sampling
f64s, err = sample.Sample(f64s, sample.Greedy())
if err != nil {
return err
}
var outputIDs []int32
for _, f64 := range f64s {
if !m.(model.TextProcessor).Is(uint32(f64), model.SpecialEOS) {
outputIDs = append(outputIDs, int32(f64))
}
}
if len(outputIDs) == 0 {
break
}
s, err := m.(model.TextProcessor).Decode(outputIDs)
if errors.Is(err, io.EOF) {
break
} else if err != nil {
return err
}
fmt.Print(s)
inputIDs = append(inputIDs, outputIDs...)
if args.cache {
offset = len(inputIDs) - 1
}
}
return nil
}
func main() {
if err := temp(); err != nil {
fmt.Println("err", err)
os.Exit(1)
}
}
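A hypothetical invocation of this driver, with the package path and model file name as illustrative assumptions rather than anything defined in this commit: echo "why is the sky blue?" | go run ./model/cmd -cache -n 64 path/to/model.gguf. The prompt is read from stdin; -n bounds the number of decode steps, -cache enables the KV cache, and -image attaches an image for models that accept one.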

147
model/llama/model.go Normal file
View File

@ -0,0 +1,147 @@
package llama
import (
"math"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
)
type Options struct {
RopeFactors ml.Tensor `ggml:"rope_freqs.weight"`
hiddenSize, numHeads, numKVHeads int64
eps, ropeBase, ropeScale float32
ropeDim uint32
}
type Model struct {
model.Base
TextProcessor
TokenEmbedding *nn.Embedding `ggml:"token_embd"`
Layers []Layer `ggml:"blk"`
OutputNorm *nn.RMSNorm `ggml:"output_norm"`
Output *nn.Linear `ggml:"output"`
*Options
}
func New(c ml.Config) (model.Model, error) {
return &Model{
TextProcessor: newTextProcessor(c),
Layers: make([]Layer, c.Uint("block_count")),
Options: &Options{
hiddenSize: int64(c.Uint("embedding_length")),
numHeads: int64(c.Uint("attention.head_count")),
numKVHeads: int64(c.Uint("attention.head_count_kv")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
},
}, nil
}
type SelfAttention struct {
Query *nn.Linear `ggml:"attn_q"`
Key *nn.Linear `ggml:"attn_k"`
Value *nn.Linear `ggml:"attn_v"`
Output *nn.Linear `ggml:"attn_output"`
}
func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
q := sa.Query.Forward(ctx, hiddenState)
q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
q = q.Rope(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
k := sa.Key.Forward(ctx, hiddenState)
k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k = k.Rope(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
v := sa.Value.Forward(ctx, hiddenState)
v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
k, v = cache.Put(ctx, k, v, cache.Options)
q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
kq := k.Mulmat(ctx, q)
kq = kq.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
kq = kq.Softmax(ctx)
kqv := v.Mulmat(ctx, kq)
kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
return sa.Output.Forward(ctx, kqv)
}
type MLP struct {
Up *nn.Linear `ggml:"ffn_up"`
Down *nn.Linear `ggml:"ffn_down"`
Gate *nn.Linear `ggml:"ffn_gate"`
}
func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
return mlp.Down.Forward(ctx, hiddenState)
}
type Layer struct {
AttentionNorm *nn.RMSNorm `ggml:"attn_norm"`
SelfAttention *SelfAttention
MLPNorm *nn.RMSNorm `ggml:"ffn_norm"`
MLP *MLP
}
func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
residual := hiddenState
hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
return hiddenState.Add(ctx, residual)
}
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
if err != nil {
return nil, err
}
positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
if err != nil {
return nil, err
}
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
for i, layer := range m.Layers {
hiddenState = layer.Forward(ctx, hiddenState, positions, opts.Cache.Sub(i), m.Options)
}
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
hiddenState = m.Output.Forward(ctx, hiddenState)
outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
if err != nil {
return nil, err
}
return hiddenState.Rows(ctx, outputs), nil
}
func init() {
model.Register("llama", New)
}

25
model/llama/process_text.go Normal file
View File

@ -0,0 +1,25 @@
package llama
import (
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
)
type TextProcessor struct {
model.BytePairEncoding
}
func newTextProcessor(c ml.Config) TextProcessor {
return TextProcessor{
BytePairEncoding: model.BytePairEncoding{
Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
Vocabulary: &model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
},
},
}
}

90
model/mllama/model.go Normal file
View File

@ -0,0 +1,90 @@
package mllama
import (
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
)
type Model struct {
model.Base
*VisionModel `ggml:"v,vision"`
*TextModel
Projector *nn.Linear `ggml:"mm.0"`
ImageProcessor
TextProcessor
}
func New(c ml.Config) (model.Model, error) {
return &Model{
ImageProcessor: newImageProcessor(c),
VisionModel: newVisionModel(c),
TextProcessor: newTextProcessor(c),
TextModel: newTextModel(c),
}, nil
}
func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
var crossAttentionStates ml.Tensor
if opts.Images != nil {
f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(opts.Images[0])
if err != nil {
return nil, err
}
pixelValues, err := ctx.FromFloatSlice(f32s,
m.ImageProcessor.imageSize,
m.ImageProcessor.imageSize,
m.ImageProcessor.numChannels,
m.ImageProcessor.maxNumTiles,
)
if err != nil {
return nil, err
}
aspectRatio, err := ctx.FromIntSlice([]int32{int32(aspectRatioID)}, 1)
if err != nil {
return nil, err
}
// 1601 positions covers every patch plus the class token: (image_size/patch_size)^2 + 1,
// e.g. (560/14)^2 + 1 with 560 px tiles and 14 px patches.
positions := make([]int32, 1601)
for i := range positions {
positions[i] = int32(i)
}
positionIDs, err := ctx.FromIntSlice(positions, len(positions))
if err != nil {
return nil, err
}
crossAttentionStates = m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
crossAttentionStates = m.Projector.Forward(ctx, crossAttentionStates)
}
inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
if err != nil {
return nil, err
}
positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
if err != nil {
return nil, err
}
// TODO: attention mask, cross attention mask
hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, opts.Cache)
outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
if err != nil {
return nil, err
}
return hiddenState.Rows(ctx, outputs), nil
}
func init() {
model.Register("mllama", New)
}

225
model/mllama/model_text.go Normal file
View File

@ -0,0 +1,225 @@
package mllama
import (
"math"
"slices"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
)
type TextSelfAttention struct {
Query *nn.Linear `ggml:"attn_q"`
Key *nn.Linear `ggml:"attn_k"`
Value *nn.Linear `ggml:"attn_v"`
Output *nn.Linear `ggml:"attn_output"`
}
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
query = query.Rope(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
key = key.Rope(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
key, value = cache.Put(ctx, key, value, cache.Options)
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
scores := key.Mulmat(ctx, query)
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
if mask != nil {
scores = scores.Add(ctx, mask)
}
scores = scores.Softmax(ctx)
attention := value.Mulmat(ctx, scores)
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
return sa.Output.Forward(ctx, attention)
}
type TextMLP struct {
Up *nn.Linear `ggml:"ffn_up"`
Down *nn.Linear `ggml:"ffn_down"`
Gate *nn.Linear `ggml:"ffn_gate"`
}
func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextModelOptions) ml.Tensor {
hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
return mlp.Down.Forward(ctx, hiddenState)
}
type TextSelfAttentionDecoderLayer struct {
AttentionNorm *nn.RMSNorm `ggml:"attn_norm"`
SelfAttention *TextSelfAttention
MLPNorm *nn.RMSNorm `ggml:"ffn_norm"`
MLP *TextMLP
}
func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
return hiddenState.Add(ctx, residual)
}
type TextCrossAttention struct {
QueryNorm *nn.RMSNorm `ggml:"cross_attn_q_norm"`
Query *nn.Linear `ggml:"cross_attn_q_proj"`
KeyNorm *nn.RMSNorm `ggml:"cross_attn_k_norm"`
Key *nn.Linear `ggml:"cross_attn_k_proj"`
Value *nn.Linear `ggml:"cross_attn_v_proj"`
Output *nn.Linear `ggml:"cross_attn_o_proj"`
}
func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
batchSize := hiddenState.Dim(1)
headDim := opts.hiddenSize / opts.numHeads
numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
query := ca.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
query = ca.QueryNorm.Forward(ctx, query, opts.eps)
key := ca.Key.Forward(ctx, crossAttentionStates)
key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
key = ca.KeyNorm.Forward(ctx, key, opts.eps)
value := ca.Value.Forward(ctx, crossAttentionStates)
value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
// TODO cache key, value
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
scores := key.Mulmat(ctx, query)
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
scores = scores.Softmax(ctx)
attention := value.Mulmat(ctx, scores)
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
return ca.Output.Forward(ctx, attention)
}
type TextCrossAttentionDecoderLayer struct {
AttentionNorm *nn.RMSNorm `ggml:"attn_norm"`
CrossAttention *TextCrossAttention
AttentionGate ml.Tensor `ggml:"cross_attn_attn_gate"`
MLPNorm *nn.RMSNorm `ggml:"ffn_norm"`
MLP *TextMLP
MLPGate ml.Tensor `ggml:"cross_attn_mlp_gate"`
}
func (d TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
residual := hiddenState
hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, opts)
hiddenState = hiddenState.Mul(ctx, d.AttentionGate.Tanh(ctx))
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
hiddenState = hiddenState.Mul(ctx, d.MLPGate.Tanh(ctx))
return hiddenState.Add(ctx, residual)
}
type TextDecoderLayer interface {
Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor
}
type TextDecoder struct {
Layers []TextDecoderLayer
}
func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
for i, layer := range d.Layers {
if !slices.Contains(opts.crossAttentionLayers, uint32(i)) || crossAttentionStates != nil {
hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache.Sub(i), opts)
}
}
return hiddenState
}
type TextModelOptions struct {
RopeFactors ml.Tensor `ggml:"rope_freqs.weight"`
hiddenSize, numHeads, numKVHeads int64
eps, ropeBase, ropeScale float32
ropeDim uint32
crossAttentionLayers []uint32
}
type TextModel struct {
TokenEmbedding *nn.Embedding `ggml:"token_embd"`
Transformer *TextDecoder `ggml:"blk"`
OutputNorm *nn.RMSNorm `ggml:"output_norm"`
Output *nn.Linear `ggml:"output"`
*TextModelOptions
}
func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
return m.Output.Forward(ctx, hiddenState)
}
func newTextModel(c ml.Config) *TextModel {
var decoderLayers []TextDecoderLayer
for i := range c.Uint("block_count") {
var textDecoderLayer TextDecoderLayer
if slices.Contains(c.Uints("attention.cross_attention_layers"), i) {
textDecoderLayer = &TextCrossAttentionDecoderLayer{}
} else {
textDecoderLayer = &TextSelfAttentionDecoderLayer{}
}
decoderLayers = append(decoderLayers, textDecoderLayer)
}
return &TextModel{
Transformer: &TextDecoder{Layers: decoderLayers},
TextModelOptions: &TextModelOptions{
hiddenSize: int64(c.Uint("embedding_length")),
numHeads: int64(c.Uint("attention.head_count")),
numKVHeads: int64(c.Uint("attention.head_count_kv")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.freq_scale", 1),
ropeDim: c.Uint("rope.dimension_count"),
crossAttentionLayers: c.Uints("attention.cross_attention_layers"),
},
}
}
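A standalone sketch, not part of this commit, of the layer selection in newTextModel above: block indices listed under attention.cross_attention_layers become cross-attention decoder layers and every other index stays a self-attention decoder layer. The index values and block count below are illustrative placeholders, not values read from a real model file.

package main

import (
	"fmt"
	"slices"
)

func main() {
	// Illustrative values; a real model supplies these through its GGUF metadata.
	crossAttentionLayers := []uint32{3, 8, 13}
	blockCount := uint32(16)

	for i := range blockCount {
		kind := "self-attention"
		if slices.Contains(crossAttentionLayers, i) {
			kind = "cross-attention"
		}
		fmt.Printf("blk.%d: %s decoder layer\n", i, kind)
	}
}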

234
model/mllama/model_vision.go Normal file
View File

@ -0,0 +1,234 @@
package mllama
import (
"math"
"slices"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
var batchSize int64 = 1
type VisionSelfAttention struct {
Query *nn.Linear `ggml:"attn_q"`
Key *nn.Linear `ggml:"attn_k"`
Value *nn.Linear `ggml:"attn_v"`
Output *nn.Linear `ggml:"attn_out"`
Gate ml.Tensor `ggml:"attn_gate"`
}
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
headDim := opts.hiddenSize / opts.numHeads
query := sa.Query.Forward(ctx, hiddenState)
query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
key := sa.Key.Forward(ctx, hiddenState)
key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
value := sa.Value.Forward(ctx, hiddenState)
value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
scores := key.Mulmat(ctx, query)
scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
scores = scores.Softmax(ctx)
attention := value.Mulmat(ctx, scores)
attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
hiddenState = sa.Output.Forward(ctx, attention)
if sa.Gate != nil {
hiddenState = hiddenState.Mul(ctx, sa.Gate)
}
return hiddenState
}
type VisionMLP struct {
Down *nn.Linear `ggml:"ffn_down"`
Up *nn.Linear `ggml:"ffn_up"`
Gate ml.Tensor `ggml:"ffn_gate"`
}
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
hiddenState = mlp.Up.Forward(ctx, hiddenState)
if mlp.Gate != nil {
hiddenState = hiddenState.Mul(ctx, mlp.Gate)
}
return hiddenState
}
type VisionEncoderLayer struct {
AttentionNorm *nn.LayerNorm `ggml:"ln1"`
SelfAttention *VisionSelfAttention
MLPNorm *nn.LayerNorm `ggml:"ln2"`
MLP *VisionMLP
}
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
residual := hiddenState
// self attention
hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
hiddenState = hiddenState.Add(ctx, residual)
residual = hiddenState
// feed forward
hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
return hiddenState.Add(ctx, residual)
}
type VisionEncoder struct {
Layers []VisionEncoderLayer
}
func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermediateLayersIndices []uint32, opts *VisionModelOptions) (ml.Tensor, []ml.Tensor) {
var intermediateHiddenStates []ml.Tensor
for i, layer := range e.Layers {
if slices.Contains(intermediateLayersIndices, uint32(i)) {
intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int64{1}, hiddenState.Shape()...)...))
}
hiddenState = layer.Forward(ctx, hiddenState, opts)
}
return hiddenState, intermediateHiddenStates
}
type PrecomputedAspectRatioEmbedding struct {
Embedding *nn.Embedding
Gate ml.Tensor `ggml:"gate"`
}
func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
if e.Gate != nil {
embeddings = embeddings.Mul(ctx, e.Gate)
}
return hiddenState.Add(ctx, embeddings)
}
type PrecomputedPositionEmbedding struct {
PositionEmbedding *nn.Embedding `ggml:"position_embd"`
PositionEmbeddingGate ml.Tensor `ggml:"position_embd.gate"`
TilePositionEmbedding *nn.Embedding `ggml:"tile_position_embd"`
TilePositionEmbeddingGate ml.Tensor `ggml:"tile_position_embd.gate"`
}
func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int64, opts *VisionModelOptions) ml.Tensor {
positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
if e.PositionEmbeddingGate != nil {
positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
}
hiddenState = hiddenState.Add(ctx, positionEmbedding)
tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
if e.TilePositionEmbeddingGate != nil {
tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
}
return hiddenState.Add(ctx, tilePositionEmbedding)
}
type VisionModelOptions struct {
hiddenSize, numHeads, numTiles int64
imageSize, patchSize int
eps float32
intermediateLayersIndices []uint32
}
type VisionModel struct {
PatchEmbeddings *nn.Conv2D `ggml:"patch_embd"`
PreTilePositionEmbedding *PrecomputedAspectRatioEmbedding `ggml:"pre_tile_position_embd"`
PostTilePositionEmbedding *PrecomputedAspectRatioEmbedding `ggml:"post_tile_position_embd"`
PositionEmbedding *PrecomputedPositionEmbedding
PreLayerNorm *nn.LayerNorm `ggml:"pre_ln"`
PostLayerNorm *nn.LayerNorm `ggml:"post_ln"`
ClassEmbedding ml.Tensor `ggml:"class_embd"`
Transformer *VisionEncoder `ggml:"blk"`
GlobalTransformer *VisionEncoder `ggml:"global.blk"`
*VisionModelOptions
}
func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRatioIDs ml.Tensor) ml.Tensor {
numPatches := int64((m.imageSize / m.patchSize) * (m.imageSize / m.patchSize))
numPositions := numPatches
if m.ClassEmbedding != nil {
numPositions++
}
hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
hiddenState = m.ClassEmbedding.Stack(ctx, 2, slices.Repeat([]ml.Tensor{m.ClassEmbedding}, int(m.numTiles)-1)...).Concat(ctx, hiddenState, 1)
hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)
numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
hiddenState = hiddenState.Pad(ctx, 0, numPaddingPatches, 0, 0)
hiddenState = hiddenState.Reshape(ctx, hiddenState.Dim(0), hiddenState.Dim(1)*hiddenState.Dim(2), batchSize)
hiddenState, intermediateHiddenStates := m.Transformer.Forward(ctx, hiddenState, m.intermediateLayersIndices, m.VisionModelOptions)
hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)
hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
hiddenStates = hiddenStates.Reshape(ctx, int64(len(intermediateHiddenStates))*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)
hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
return hiddenState.Concat(ctx, hiddenStates, 0)
}
func newVisionModel(c ml.Config) *VisionModel {
return &VisionModel{
Transformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count"))},
GlobalTransformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.global.block_count"))},
VisionModelOptions: &VisionModelOptions{
hiddenSize: int64(c.Uint("vision.embedding_length")),
numHeads: int64(c.Uint("vision.attention.head_count")),
numTiles: int64(c.Uint("vision.max_num_tiles")),
imageSize: int(c.Uint("vision.image_size")),
patchSize: int(c.Uint("vision.patch_size")),
eps: c.Float("vision.attention.layer_norm_epsilon"),
intermediateLayersIndices: c.Uints("vision.intermediate_layers_indices"),
},
}
}

240
model/mllama/process_image.go Normal file
View File

@ -0,0 +1,240 @@
package mllama
import (
"image"
"image/color"
"math"
"slices"
"golang.org/x/image/draw"
"github.com/ollama/ollama/ml"
)
type ImageProcessor struct {
imageSize, numChannels, maxNumTiles int
}
func newImageProcessor(c ml.Config) ImageProcessor {
return ImageProcessor{
imageSize: int(c.Uint("vision.image_size")),
numChannels: int(c.Uint("vision.num_channels")),
maxNumTiles: int(c.Uint("vision.max_num_tiles")),
}
}
func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
ratios := []image.Point{}
for w := range maxTiles {
for h := range maxTiles {
if (w+1)*(h+1) <= maxTiles {
ratios = append(ratios, image.Point{w + 1, h + 1})
}
}
}
return ratios
}
func (p *ImageProcessor) clip(a, a_min, a_max int) int {
if a < a_min {
return a_min
} else if a > a_max {
return a_max
}
return a
}
func (p *ImageProcessor) getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
scaleWidth := float64(targetWidth) / float64(imageSize.X)
scaleHeight := float64(targetHeight) / float64(imageSize.Y)
var w, h int
if scaleWidth < scaleHeight {
w = targetWidth
h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
} else {
w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
h = targetHeight
}
return image.Point{w, h}
}
func (p *ImageProcessor) getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
possibleCanvasSizes := []image.Point{}
for _, pta := range possibleTileArrangements {
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
}
scales := []float64{}
for _, pcs := range possibleCanvasSizes {
scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
scaleWidth := float64(pcs.X) / float64(imageSize.X)
if scaleWidth > scaleHeight {
scales = append(scales, scaleHeight)
} else {
scales = append(scales, scaleWidth)
}
}
var minUpscale float64
var maxDownscale float64
var upscale bool
for _, s := range scales {
if s > 1.0 {
upscale = true
if minUpscale == 0 {
minUpscale = s
} else {
minUpscale = math.Min(minUpscale, s)
}
} else {
maxDownscale = math.Max(maxDownscale, s)
}
}
selectedScale := maxDownscale
if upscale {
selectedScale = minUpscale
}
var selectedCanvas image.Point
for n, pcs := range possibleCanvasSizes {
if scales[n] == selectedScale {
// choose the smallest possible canvas
if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
selectedCanvas = pcs
} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
selectedCanvas = pcs
}
}
}
return selectedCanvas
}
func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
b := img.Bounds()
width := b.Max.X - b.Min.X
height := b.Max.Y - b.Min.Y
tileHeight := height / numTilesSize.Y
tileWidth := width / numTilesSize.X
images := []image.Image{}
for h := range numTilesSize.Y {
for w := range numTilesSize.X {
rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
images = append(images, img.(interface {
SubImage(image.Rectangle) image.Image
}).SubImage(rect))
}
}
return images
}
// remove the "alpha" channel by drawing over a prefilled image
//
// remove the "alpha" channel by drawing over a prefilled image
//
//nolint:unused
func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
dst := image.NewRGBA(img.Bounds())
white := color.RGBA{255, 255, 255, 255}
draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
return dst
}
func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
b := img.Bounds()
tileSize := outputSize.Y
canvasSize := p.getOptimalTiledCanvas(b.Max, maxImageTiles, tileSize)
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
newSize := p.getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
// scaling choices:
// NearestNeighbor fast, blocky output
// ApproxBiLinear fast, medium quality
// BiLinear slow, high quality
// CatmullRom very slow, very high quality
draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
return dst, aspectRatio
}
func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
paddedSize := image.Point{
X: outputSize.X * aspectRatio.X,
Y: outputSize.Y * aspectRatio.Y,
}
dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
return dst
}
func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
subImages := p.splitToTiles(img, aspectRatio)
var pixelVals []float32
for _, subImg := range subImages {
bounds := subImg.Bounds()
var rVals, gVals, bVals []float32
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
for x := bounds.Min.X; x < bounds.Max.X; x++ {
c := subImg.At(x, y)
r, g, b, _ := c.RGBA()
rVal := float32(r>>8) / 255.0
gVal := float32(g>>8) / 255.0
bVal := float32(b>>8) / 255.0
rVal = (rVal - mean[0]) / std[0]
gVal = (gVal - mean[1]) / std[1]
bVal = (bVal - mean[2]) / std[2]
rVals = append(rVals, rVal)
gVals = append(gVals, gVal)
bVals = append(bVals, bVal)
}
}
pixelVals = append(pixelVals, rVals...)
pixelVals = append(pixelVals, gVals...)
pixelVals = append(pixelVals, bVals...)
}
return pixelVals
}
func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
outputSize := image.Point{p.imageSize, p.imageSize}
// CLIP image normalization constants (per-channel mean and std)
mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
std := [3]float32{0.26862954, 0.26130258, 0.27577711}
newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
newImage = p.pad(newImage, outputSize, aspectRatio)
data := p.pack(newImage, aspectRatio, mean, std)
aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
return data, aspectRatioIndex, nil
}
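A standalone sketch, not part of this commit, of the arrangement enumeration behind supportedAspectRatios above: every w x h tile grid whose product fits within the tile budget is a candidate, and the resize, pad, and pack steps pick one of these arrangements for the input image. The budget of 4 tiles is assumed here purely for illustration; the real value comes from vision.max_num_tiles.

package main

import "fmt"

func main() {
	maxTiles := 4 // assumed for illustration
	for w := 1; w <= maxTiles; w++ {
		for h := 1; h <= maxTiles; h++ {
			if w*h <= maxTiles {
				fmt.Printf("%dx%d ", w, h)
			}
		}
	}
	fmt.Println()
	// prints: 1x1 1x2 1x3 1x4 2x1 2x2 3x1 4x1
}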

25
model/mllama/process_text.go Normal file
View File

@ -0,0 +1,25 @@
package mllama
import (
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
)
type TextProcessor struct {
model.BytePairEncoding
}
func newTextProcessor(c ml.Config) TextProcessor {
return TextProcessor{
BytePairEncoding: model.BytePairEncoding{
Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
Vocabulary: &model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Uints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
BOS: c.Uint("tokenizer.ggml.bos_token_id"),
EOS: c.Uint("tokenizer.ggml.eos_token_id"),
},
},
}
}

82
model/mllama/process_text_test.go Normal file
View File

@ -0,0 +1,82 @@
package mllama
import (
"encoding/json"
"errors"
"os"
"path/filepath"
"strconv"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/model"
)
func TestProcessText(t *testing.T) {
ours, err := model.New(filepath.Join("testdata", "model.bin"))
if errors.Is(err, os.ErrNotExist) {
t.Skip("no model.bin")
} else if err != nil {
t.Fatal(err)
}
t.Run("decode", func(t *testing.T) {
f, err := os.Open(filepath.Join("testdata", "theirs.json"))
if errors.Is(err, os.ErrNotExist) {
t.Skip("no theirs.json")
} else if err != nil {
t.Fatal(err)
}
defer f.Close()
var theirs [][]byte
if err := json.NewDecoder(f).Decode(&theirs); err != nil {
t.Fatal(err)
}
for id := range theirs {
ids := []int32{int32(id)}
s, err := ours.(model.TextProcessor).Decode(ids)
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(string(theirs[id]), s); diff != "" {
t.Errorf("%d no match (-theirs +ours):\n%s", id, diff)
}
}
})
t.Run("encode", func(t *testing.T) {
f, err := os.Open(filepath.Join("..", "testdata", "inputs.json"))
if errors.Is(err, os.ErrNotExist) {
t.Skip("no inputs.json")
} else if err != nil {
t.Fatal(err)
}
defer f.Close()
var inputs []struct {
Values []byte `json:"base64"`
IDs []int32 `json:"ids"`
}
if err := json.NewDecoder(f).Decode(&inputs); err != nil {
t.Fatal(err)
}
for i, input := range inputs {
t.Run(strconv.Itoa(i), func(t *testing.T) {
ids, err := ours.(model.TextProcessor).Encode(string(input.Values))
if err != nil {
t.Fatal(err)
}
if diff := cmp.Diff(input.IDs, ids); diff != "" {
t.Errorf("%s: no match (-theirs +ours):\n%s", input.Values, diff)
}
})
}
})
}

1
model/mllama/testdata/model.bin vendored Symbolic link
View File

@ -0,0 +1 @@
/Users/michaelyang/git/ollama/library/nltpt/Llama-3.2-11B-Vision-Instruct/merged.gguf

1
model/mllama/testdata/theirs.json vendored Normal file

File diff suppressed because one or more lines are too long

228
model/model.go Normal file
View File

@ -0,0 +1,228 @@
package model
import (
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"log/slog"
"os"
"reflect"
"strconv"
"strings"
_ "golang.org/x/image/bmp"
_ "golang.org/x/image/tiff"
_ "golang.org/x/image/webp"
"github.com/ollama/ollama/cache"
"github.com/ollama/ollama/ml"
_ "github.com/ollama/ollama/ml/backend"
)
type Cache struct {
cache.Cache
cache.Options
}
func (c Cache) Sub(i int) Cache {
if c.Cache != nil {
return Cache{
Cache: c.Cache.Sub(i),
Options: c.Options,
}
}
return c
}
func (c Cache) Put(ctx ml.Context, key, value ml.Tensor, opts cache.Options) (ml.Tensor, ml.Tensor) {
if c.Cache != nil {
return c.Cache.Put(ctx, key, value, opts)
}
return key, value
}
type Options struct {
inputs []int32
Offset int
Images []image.Image
Cache
}
func (opts Options) Inputs() []int32 {
return opts.inputs[opts.Offset:]
}
func (opts Options) Positions() []int32 {
positions := make([]int32, len(opts.inputs)-opts.Offset)
for i := range positions {
positions[i] = int32(opts.Offset + i)
}
return positions
}
type OptionsFunc func(Model, *Options)
func WithInputIDs(ids []int32) OptionsFunc {
return func(m Model, opts *Options) {
opts.inputs = ids
}
}
func WithOffset(offset int) OptionsFunc {
return func(m Model, opts *Options) {
opts.Offset = offset
opts.Cache.Position = offset
}
}
func WithImage(img image.Image) OptionsFunc {
return func(m Model, opts *Options) {
opts.Images = append(opts.Images, img)
}
}
func WithCache(c cache.Cache) OptionsFunc {
return func(m Model, opts *Options) {
opts.Cache = Cache{
Cache: c,
Options: cache.Options{
Position: opts.Offset,
},
}
}
}
type Base struct {
b ml.Backend
}
func (m *Base) Backend() ml.Backend {
return m.b
}
func (m *Base) SetBackend(b ml.Backend) {
m.b = b
}
type Model interface {
Forward(ml.Context, Options) (ml.Tensor, error)
Backend() ml.Backend
SetBackend(ml.Backend)
}
var models = make(map[string]func(ml.Config) (Model, error))
func Register(name string, f func(ml.Config) (Model, error)) {
if _, ok := models[name]; ok {
panic("model: model already registered")
}
models[name] = f
}
func New(s string) (Model, error) {
r, err := os.Open(s)
if err != nil {
return nil, err
}
defer r.Close()
b, err := ml.NewBackend(r)
if err != nil {
return nil, err
}
arch := b.Config().Architecture()
f, ok := models[arch]
if !ok {
return nil, fmt.Errorf("unsupported model architecture %q", arch)
}
m, err := f(b.Config())
if err != nil {
return nil, err
}
if err := loadTensors(b, m); err != nil {
return nil, err
}
m.SetBackend(b)
return m, nil
}
var mlTensorType = reflect.TypeOf((*ml.Tensor)(nil)).Elem()
func loadTensors(b ml.Backend, m any, tensorPath ...string) error {
t := reflect.TypeOf(m)
v := reflect.ValueOf(m)
if t.Kind() == reflect.Pointer {
t = t.Elem()
v = v.Elem()
}
if t.Kind() == reflect.Interface {
return loadTensors(b, v.Interface(), tensorPath...)
}
for i := range t.NumField() {
f := v.Field(i)
fullTensorPath := tensorPath
if tag := t.Field(i).Tag.Get("ggml"); tag != "" {
tensorName, _, _ := strings.Cut(tag, ",")
fullTensorPath = append(tensorPath, tensorName)
}
if !f.CanSet() {
continue
}
if f.Kind() == reflect.Ptr && f.IsNil() {
f.Set(reflect.New(f.Type().Elem()))
} else if f.Kind() == reflect.Interface && f.IsNil() && f.Type().Implements(mlTensorType) {
if tensor := b.Get(strings.Join(fullTensorPath, ".")); tensor != nil {
f.Set(reflect.ValueOf(tensor))
slog.Debug("loaded tensor", "kind", f.Elem().Type(), "", f.Interface())
}
}
if r := reflect.Indirect(f); r.Kind() == reflect.Struct {
if err := loadTensors(b, f.Interface(), fullTensorPath...); err != nil {
return err
}
} else if r.Kind() == reflect.Slice {
for i := range r.Len() {
if err := loadTensors(b, f.Index(i).Addr().Interface(), append(fullTensorPath, strconv.Itoa(i))...); err != nil {
return err
}
}
}
}
return nil
}
func Forward(m Model, optsFuncs ...OptionsFunc) (ml.Tensor, error) {
var opts Options
for _, optsFunc := range optsFuncs {
optsFunc(m, &opts)
}
ctx := m.Backend().NewContext()
t, err := m.Forward(ctx, opts)
if err != nil {
return nil, err
}
defer ctx.Close()
return ctx.Compute(t), nil
}
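A standalone sketch, not part of this commit, of how loadTensors composes tensor names from the ggml struct tags: each tagged field appends its tag, slice elements append their index, untagged fields add nothing, and the parts are joined with dots. Under the llama tags shown earlier, Layers[0].SelfAttention.Query.Weight therefore resolves to blk.0.attn_q.weight.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	// Mirror the path built while walking Layers[0].SelfAttention.Query.Weight.
	path := []string{"blk"}              // Layers []Layer `ggml:"blk"`
	path = append(path, strconv.Itoa(0)) // slice index
	path = append(path, "attn_q")        // Query *nn.Linear `ggml:"attn_q"`
	path = append(path, "weight")        // Weight ml.Tensor `ggml:"weight"`

	fmt.Println(strings.Join(path, ".")) // blk.0.attn_q.weight
}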

311
model/process_text.go Normal file
View File

@ -0,0 +1,311 @@
package model
import (
"cmp"
"log/slog"
"strings"
"sync"
"github.com/dlclark/regexp2"
heap "github.com/emirpasic/gods/v2/trees/binaryheap"
)
type Special int32
const (
SpecialBOS Special = iota
SpecialEOS
)
type TextProcessor interface {
Encode(string) ([]int32, error)
Decode([]int32) (string, error)
Is(uint32, Special) bool
}
type Vocabulary struct {
Values []string
Types []uint32
Scores []uint32
Merges []string
BOS, EOS uint32
specialOnce sync.Once
special []string
valuesOnce sync.Once
values map[string]int32
mergeOnce sync.Once
merge map[string]int32
}
func (v *Vocabulary) Is(id uint32, special Special) bool {
switch special {
case SpecialBOS:
return id == v.BOS
case SpecialEOS:
return id == v.EOS
default:
return false
}
}
func (v *Vocabulary) Encode(s string) int32 {
v.valuesOnce.Do(func() {
v.values = make(map[string]int32, len(v.Values))
for i, value := range v.Values {
v.values[value] = int32(i)
}
})
if id, ok := v.values[s]; ok {
return id
}
return -1
}
func (v *Vocabulary) Decode(id int32) string {
return v.Values[id]
}
func (v *Vocabulary) SpecialVocabulary() []string {
v.specialOnce.Do(func() {
for i := range v.Values {
if v.Types[i] == 3 {
v.special = append(v.special, v.Values[i])
}
}
})
return v.special
}
func (v *Vocabulary) Merge(left, right string) int {
v.mergeOnce.Do(func() {
v.merge = make(map[string]int32, len(v.Merges))
for i, merge := range v.Merges {
v.merge[merge] = int32(i)
}
})
if id, ok := v.merge[left+" "+right]; ok {
return int(id)
}
return -1
}
type BytePairEncoding struct {
Pretokenizer string
*Vocabulary
}
func (bpe BytePairEncoding) split(s string) ([]string, error) {
re, err := regexp2.Compile(bpe.Pretokenizer, regexp2.Unicode|regexp2.RE2)
if err != nil {
return nil, err
}
var matches []string
for m, _ := re.FindStringMatch(s); m != nil; m, _ = re.FindNextMatch(m) {
matches = append(matches, m.String())
}
return matches, nil
}
// fragment is a string fragment and its corresponding token IDs
type fragment struct {
value string
ids []int32
}
// pair is a pair of runes and its rank
type pair struct {
a, b int
rank int
value string
}
type merge struct {
p, n int
runes []rune
}
func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
fragments := []fragment{{value: s}}
for _, special := range bpe.Vocabulary.SpecialVocabulary() {
// TODO: process special tokens concurrently
id := bpe.Vocabulary.Encode(special)
for i := 0; i < len(fragments); i++ {
frag := fragments[i]
if len(frag.ids) > 0 {
continue
}
var middle []fragment
switch i := strings.Index(frag.value, special); {
case i < 0:
middle = append(middle, frag)
case i > 0:
middle = append(middle, fragment{value: frag.value[:i]})
fallthrough
default:
middle = append(middle, fragment{value: special, ids: []int32{id}})
if rest := frag.value[i+len(special):]; rest != "" {
middle = append(middle, fragment{value: rest})
}
}
fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
}
}
ids := make([]int32, 0, len(fragments))
for _, frag := range fragments {
if len(frag.ids) > 0 {
ids = append(ids, frag.ids...)
slog.Debug("encoded", "text", frag.value, "ids", frag.ids, "special", true)
continue
}
// split fragment using pretokenizer
splits, err := bpe.split(frag.value)
if err != nil {
return nil, err
}
for _, split := range splits {
// TODO: process splits concurrently
var sb strings.Builder
for _, b := range []byte(split) {
r := rune(b)
switch {
case r == 0x00ad:
r = 0x0143
case r <= 0x0020:
r = r + 0x0100
case r >= 0x007e && r <= 0x00a0:
r = r + 0x00a2
}
sb.WriteRune(r)
}
if id := bpe.Vocabulary.Encode(sb.String()); id >= 0 {
ids = append(ids, id)
slog.Debug("encoded", "text", sb.String(), "ids", []int32{id})
continue
}
runes := []rune(sb.String())
merges := make([]merge, len(runes))
for i := range runes {
merges[i] = merge{
p: i - 1,
n: i + 1,
runes: []rune{runes[i]},
}
}
pairwise := func(a, b int) *pair {
if a < 0 || b >= len(runes) {
return nil
}
left, right := string(merges[a].runes), string(merges[b].runes)
rank := bpe.Vocabulary.Merge(left, right)
if rank < 0 {
return nil
}
return &pair{
a: a,
b: b,
rank: rank,
value: left + right,
}
}
pairs := heap.NewWith(func(i, j *pair) int {
return cmp.Compare(i.rank, j.rank)
})
for i := range len(runes) - 1 {
if pair := pairwise(i, i+1); pair != nil {
pairs.Push(pair)
}
}
for !pairs.Empty() {
pair, _ := pairs.Pop()
left, right := merges[pair.a], merges[pair.b]
if len(left.runes) <= 0 || len(right.runes) <= 0 ||
string(left.runes)+string(right.runes) != pair.value {
continue
}
merges[pair.a].runes = append(left.runes, right.runes...)
merges[pair.b].runes = nil
merges[pair.a].n = right.n
if right.n < len(merges) {
merges[right.n].p = pair.a
}
if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
pairs.Push(pair)
}
if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
pairs.Push(pair)
}
}
for _, merge := range merges {
if len(merge.runes) > 0 {
// TODO: handle the edge case where the rune isn't in the vocabulary
if id := bpe.Vocabulary.Encode(string(merge.runes)); id >= 0 {
ids = append(ids, id)
slog.Debug("encoded", "text", string(merge.runes), "ids", []int32{id})
}
}
}
}
}
return ids, nil
}
func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
var sb strings.Builder
for _, id := range ids {
for _, r := range bpe.Vocabulary.Decode(id) {
switch {
case r == 0x0100:
// this produces 0x00 aka NULL
continue
case r == 0x0143:
r = 0x00ad
case r > 0x0100 && r <= 0x0120:
r = r - 0x0100
case r > 0x0120 && r <= 0x0142:
r = r - 0x00a2
}
// NOTE: not using WriteRune here because it writes the UTF-8
// encoding of the rune which is _not_ what we want
if err := sb.WriteByte(byte(r)); err != nil {
return "", err
}
}
}
slog.Debug("decoded", "ids", ids, "text", sb.String())
return sb.String(), nil
}
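A standalone sketch, not part of this commit, of the byte-to-rune remapping applied in Encode above and reversed in Decode: control bytes, whitespace, and a few Latin-1 ranges are shifted to distinct printable code points so every byte has a spelling in the GPT-2 style vocabulary. A leading space, for example, becomes U+0120 (Ġ), which is how the merges in the vocabulary spell word boundaries.

package main

import "fmt"

// encodeByte mirrors the byte-to-rune switch in BytePairEncoding.Encode.
func encodeByte(b byte) rune {
	r := rune(b)
	switch {
	case r == 0x00ad:
		r = 0x0143
	case r <= 0x0020:
		r = r + 0x0100
	case r >= 0x007e && r <= 0x00a0:
		r = r + 0x00a2
	}
	return r
}

func main() {
	fmt.Printf("%q -> %q\n", " ", string(encodeByte(' ')))   // " " -> "Ġ"
	fmt.Printf("%q -> %q\n", "\n", string(encodeByte('\n'))) // "\n" -> "Ċ"
}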

588
model/testdata/inputs.json vendored Normal file
View File

@ -0,0 +1,588 @@
[
{
"base64": "aWVkIDQgwr0gbW9udGhz",
"ids": [
1142,
220,
19,
220,
27154,
4038
]
},
{
"base64": "RsO8aHJlcg==",
"ids": [
37,
51853,
261
]
},
{
"base64": "",
"ids": []
},
{
"base64": "IA==",
"ids": [
220
]
},
{
"base64": "ICA=",
"ids": [
256
]
},
{
"base64": "ICAg",
"ids": [
262
]
},
{
"base64": "CQ==",
"ids": [
197
]
},
{
"base64": "Cg==",
"ids": [
198
]
},
{
"base64": "Cgo=",
"ids": [
271
]
},
{
"base64": "CgoK",
"ids": [
1432
]
},
{
"base64": "CQo=",
"ids": [
1602
]
},
{
"base64": "SGVsbG8gd29ybGQ=",
"ids": [
9906,
1917
]
},
{
"base64": "IEhlbGxvIHdvcmxk",
"ids": [
22691,
1917
]
},
{
"base64": "SGVsbG8gV29ybGQ=",
"ids": [
9906,
4435
]
},
{
"base64": "IEhlbGxvIFdvcmxk",
"ids": [
22691,
4435
]
},
{
"base64": "IEhlbGxvIFdvcmxkIQ==",
"ids": [
22691,
4435,
0
]
},
{
"base64": "SGVsbG8sIHdvcmxkIQ==",
"ids": [
9906,
11,
1917,
0
]
},
{
"base64": "IEhlbGxvLCB3b3JsZCE=",
"ids": [
22691,
11,
1917,
0
]
},
{
"base64": "IHRoaXMgaXMg8J+mmS5jcHA=",
"ids": [
420,
374,
11410,
99,
247,
13,
11055
]
},
{
"base64": "dzA0OCA3dHVpamsgZHNkZmh1",
"ids": [
86,
23904,
220,
22,
83,
2005,
42908,
11729,
3013,
17156
]
},
{
"base64": "0L3QtdGJ0L4g0L3QsCDQkdGK0LvQs9Cw0YDRgdC60Lg=",
"ids": [
79862,
102118,
13373,
64571,
34694,
3114,
112203,
80112
]
},
{
"base64": "4Z6A4Z624Z6T4Z+L4Z6P4Z+C4Z6W4Z634Z6f4Z+B4Z6f4Z6i4Z624Z6F4Z6B4Z6b4Z6F4Z+B4Z6J",
"ids": [
21549,
222,
98629,
241,
45358,
233,
21549,
237,
45358,
224,
21549,
244,
21549,
115,
21549,
253,
45358,
223,
21549,
253,
21549,
95,
98629,
227,
21549,
223,
21549,
249,
21549,
227,
45358,
223,
21549,
231
]
},
{
"base64": "8J+agCAobm9ybWFsKSDwn5i24oCN8J+Mq++4jyAobXVsdGlwbGUgZW1vamlzIGNvbmNhdGVuYXRlZCkg4pyFIChvbmx5IGVtb2ppIHRoYXQgaGFzIGl0cyBvd24gdG9rZW4p",
"ids": [
9468,
248,
222,
320,
8416,
8,
27623,
114,
102470,
9468,
234,
104,
31643,
320,
36773,
100166,
98634,
8,
26602,
227,
320,
3323,
43465,
430,
706,
1202,
1866,
4037,
8
]
},
{
"base64": "SGVsbG8=",
"ids": [
9906
]
},
{
"base64": "IEhlbGxv",
"ids": [
22691
]
},
{
"base64": "ICBIZWxsbw==",
"ids": [
220,
22691
]
},
{
"base64": "ICAgSGVsbG8=",
"ids": [
256,
22691
]
},
{
"base64": "ICAgIEhlbGxv",
"ids": [
262,
22691
]
},
{
"base64": "ICAgIEhlbGxvCiAgICBIZWxsbw==",
"ids": [
262,
22691,
198,
262,
22691
]
},
{
"base64": "ICg=",
"ids": [
320
]
},
{
"base64": "CiA9",
"ids": [
198,
284
]
},
{
"base64": "JyBlcmE=",
"ids": [
6,
11639
]
},
{
"base64": "SGVsbG8sIHknYWxsISBIb3cgYXJlIHlvdSDwn5iBID/miJHmg7PlnKhhcHBsZeW3peS9nDEzMTQxNTHlpKnvvZ4=",
"ids": [
9906,
11,
379,
65948,
0,
2650,
527,
499,
27623,
223,
949,
37046,
101067,
19000,
23182,
102301,
9263,
18136,
16,
36827,
21909
]
},
{
"base64": "ISEhISEh",
"ids": [
17523,
3001
]
},
{
"base64": "Mw==",
"ids": [
18
]
},
{
"base64": "MzM=",
"ids": [
1644
]
},
{
"base64": "MzMz",
"ids": [
8765
]
},
{
"base64": "MzMzMw==",
"ids": [
8765,
18
]
},
{
"base64": "MzMzMzM=",
"ids": [
8765,
1644
]
},
{
"base64": "MzMzMzMz",
"ids": [
8765,
8765
]
},
{
"base64": "MzMzMzMzMw==",
"ids": [
8765,
8765,
18
]
},
{
"base64": "MzMzMzMzMzM=",
"ids": [
8765,
8765,
1644
]
},
{
"base64": "MzMzMzMzMzMz",
"ids": [
8765,
8765,
8765
]
},
{
"base64": "Q+G7rWEgVmnhu4d0",
"ids": [
34,
91163,
11655,
26298,
83
]
},
{
"base64": "IGRpc2NhcmRz",
"ids": [
2624,
2402
]
},
{
"base64": "CiAKCiAKCgogCSAJCSAJCiAgCiAgIAogICAgCiAgICAgCvCfmoAgKG5vcm1hbCkg8J+YtuKAjfCfjKvvuI8gKG11bHRpcGxlIGVtb2ppcyBjb25jYXRlbmF0ZWQpIOKchSDwn6aZ8J+mmSAzIDMzIDMzMyAzMzMzIDMzMzMzIDMzMzMzMyAzMzMzMzMzIDMzMzMzMzMzIDMuMyAzLi4zIDMuLi4zIOGegOGetuGek+Gfi+Gej+GfguGeluGet+Gen+GfgeGen+GeouGetuGehfCfmIEgP+aIkeaDs+WcqGFwcGxl5bel5L2cMTMxNDE1MeWkqe+9niAtLS0tLS09PT09PT09INC90LXRidC+INC90LAg0JHRitC70LPQsNGA0YHQutC4ICcnJycnJ2BgYGBgYGAiIiIiLi4uLi4uISEhISEhPz8/Pz8/IEkndmUgYmVlbiAndG9sZCBoZSdzIHRoZXJlLCAnUkUgeW91IHN1cmU/ICdNIG5vdCBzdXJlIEknbGwgbWFrZSBpdCwgJ0QgeW91IGxpa2Ugc29tZSB0ZWE/IFdlJ1ZlIGEnbEw=",
"ids": [
198,
4815,
15073,
66597,
8004,
1602,
2355,
79772,
11187,
9468,
248,
222,
320,
8416,
8,
27623,
114,
102470,
9468,
234,
104,
31643,
320,
36773,
100166,
98634,
8,
26602,
227,
11410,
99,
247,
9468,
99,
247,
220,
18,
220,
1644,
220,
8765,
220,
8765,
18,
220,
8765,
1644,
220,
8765,
8765,
220,
8765,
8765,
18,
220,
8765,
8765,
1644,
220,
18,
13,
18,
220,
18,
497,
18,
220,
18,
1131,
18,
220,
21549,
222,
98629,
241,
45358,
233,
21549,
237,
45358,
224,
21549,
244,
21549,
115,
21549,
253,
45358,
223,
21549,
253,
21549,
95,
98629,
227,
76460,
223,
949,
37046,
101067,
19000,
23182,
102301,
9263,
18136,
16,
36827,
21909,
56560,
54337,
19175,
102118,
13373,
64571,
34694,
3114,
112203,
80112,
3436,
106451,
14196,
14196,
74694,
3089,
3089,
29249,
17523,
3001,
27708,
7801,
358,
3077,
1027,
364,
83,
820,
568,
596,
1070,
11,
364,
793,
499,
2771,
30,
364,
44,
539,
2771,
358,
3358,
1304,
433,
11,
364,
35,
499,
1093,
1063,
15600,
30,
1226,
6,
43712,
264,
64966,
43
]
}
]

13
sample/greedy.go Normal file
View File

@ -0,0 +1,13 @@
package sample
import "gonum.org/v1/gonum/floats"
type greedy struct{}
func Greedy() Sampler {
return greedy{}
}
func (s greedy) Sample(t []float64) ([]float64, error) {
return []float64{float64(floats.MaxIdx(t))}, nil
}

74
sample/sample.go Normal file
View File

@ -0,0 +1,74 @@
package sample
import (
"slices"
"gonum.org/v1/gonum/floats"
"gonum.org/v1/gonum/stat/sampleuv"
)
type Sampler interface {
Sample([]float64) ([]float64, error)
}
type Temperature float64
func (s Temperature) Sample(t []float64) ([]float64, error) {
floats.Div(t, slices.Repeat([]float64{float64(s)}, len(t)))
return t, nil
}
type softmax struct{}
func Softmax() Sampler {
return softmax{}
}
func (softmax) Sample(t []float64) ([]float64, error) {
return t, nil
}
type TopK int
func (s TopK) Sample(t []float64) ([]float64, error) {
return t, nil
}
type TopP float32
func (s TopP) Sample(t []float64) ([]float64, error) {
return t, nil
}
type MinP float32
func (s MinP) Sample(t []float64) ([]float64, error) {
return t, nil
}
type weighed struct{}
func Weighed() Sampler {
return weighed{}
}
func (s weighed) Sample(t []float64) ([]float64, error) {
w := sampleuv.NewWeighted(t, nil)
if v, ok := w.Take(); ok {
return []float64{float64(v)}, nil
}
return t, nil
}
func Sample(floats []float64, samplers ...Sampler) ([]float64, error) {
var err error
for _, sampler := range samplers {
floats, err = sampler.Sample(floats)
if err != nil {
return nil, err
}
}
return floats, nil
}
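A standalone sketch, not part of this commit, showing how the pieces above chain together: Sample threads the slice through each Sampler in turn, and Greedy collapses it to a single element holding the index of the maximum, which model/cmd/main.go then converts back into a token ID.

package main

import (
	"fmt"

	"github.com/ollama/ollama/sample"
)

func main() {
	logits := []float64{0.1, 2.5, 0.3}

	out, err := sample.Sample(logits, sample.Greedy())
	if err != nil {
		panic(err)
	}

	fmt.Println(out) // [1]
}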