fix: Update llama.go to use mtmd instead of clip/llava

It's _very_ possible that this is broken!

Branch: GraniteFour

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Author: Gabe Goodhart
Date:   2025-06-24 17:48:31 -06:00
Parent: fa54a3cf3a
Commit: 3d70237fd1

2 changed files with 56 additions and 42 deletions

View File

@@ -13,8 +13,7 @@ package llama
 #include <stdlib.h>
 #include "ggml.h"
 #include "llama.h"
-#include "clip.h"
-#include "llava.h"
+#include "mtmd.h"
 #include "gguf.h"
 #include "sampling_ext.h"
@@ -148,27 +147,23 @@ func (c *Context) Model() *Model {
 }
 
 func (c *Context) KvCacheSeqAdd(seqId int, p0 int, p1 int, delta int) {
-	C.llama_kv_self_seq_add(c.c, C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
+	C.llama_memory_seq_add(C.llama_get_memory(c.c), C.int(seqId), C.int(p0), C.int(p1), C.int(delta))
 }
 
 func (c *Context) KvCacheSeqRm(seqId int, p0 int, p1 int) bool {
-	return bool(C.llama_kv_self_seq_rm(c.c, C.int(seqId), C.int(p0), C.int(p1)))
+	return bool(C.llama_memory_seq_rm(C.llama_get_memory(c.c), C.int(seqId), C.int(p0), C.int(p1)))
 }
 
 func (c *Context) KvCacheSeqCp(srcSeqId int, dstSeqId int, p0 int, p1 int) {
-	C.llama_kv_self_seq_cp(c.c, C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1))
+	C.llama_memory_seq_cp(C.llama_get_memory(c.c), C.int(srcSeqId), C.int(dstSeqId), C.int(p0), C.int(p1))
 }
 
 func (c *Context) KvCacheClear() {
-	C.llama_kv_self_clear(c.c)
-}
-
-func (c *Context) KvCacheDefrag() {
-	C.llama_kv_self_defrag(c.c)
+	C.llama_memory_clear(C.llama_get_memory(c.c), true)
 }
 
 func (c *Context) KvCacheCanShift() bool {
-	return bool(C.llama_kv_self_can_shift(c.c))
+	return bool(C.llama_memory_can_shift(C.llama_get_memory(c.c)))
 }
 
 // Get the embeddings for a sequence id
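
Aside (not part of the diff): the exported Go wrappers above keep their names and signatures, so runner code that calls them compiles unchanged; only the cgo bodies move from the llama_kv_self_* family to llama_memory_* with an explicit llama_get_memory handle, and KvCacheDefrag disappears outright. A minimal caller-side sketch, with the package path, function name, and sequence IDs as placeholders:

package example

import "github.com/ollama/ollama/llama" // assumed import path

// resetSequences shows the unchanged caller-facing API after this diff.
func resetSequences(ctx *llama.Context, srcSeq, dstSeq int) {
	ctx.KvCacheSeqCp(srcSeq, dstSeq, 0, -1) // now llama_memory_seq_cp under the hood
	ctx.KvCacheSeqRm(dstSeq, 0, -1)         // now llama_memory_seq_rm
	ctx.KvCacheClear()                      // now llama_memory_clear(mem, true)
	// ctx.KvCacheDefrag() no longer exists; any remaining callers must be dropped.
}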
@@ -460,52 +455,71 @@ func (m *Model) NEmbd() int {
 }
 
 // vision processing
-type ClipContext struct {
-	c *C.struct_clip_ctx
+type MtmdContext struct {
+	c *C.struct_mtmd_context
 }
 
-func NewClipContext(llamaContext *Context, modelPath string) (*ClipContext, error) {
+func NewMtmdContext(llamaContext *Context, modelPath string) (*MtmdContext, error) {
 	mp := C.CString(modelPath)
 	defer C.free(unsafe.Pointer(mp))
-	c := C.clip_model_load(mp, 1)
+
+	// TODO: Support non-default params
+	cp := C.mtmd_context_params_default()
+
+	// NOTE: The model and projector embedding lengths are checked during init
+	c := C.mtmd_init_from_file(mp, C.llama_get_model(llamaContext.c), cp)
 	if c == nil {
-		return nil, fmt.Errorf("unable to load clip model: %v", modelPath)
+		return nil, fmt.Errorf("unable to load mtmd model: %v", modelPath)
 	}
 
-	projEmbedSize := int(C.clip_n_mmproj_embd(c))
-	modelEmbedSize := llamaContext.Model().NEmbd()
-	if projEmbedSize != modelEmbedSize {
-		return nil, fmt.Errorf("projector embedding size (%d) does not match model (%d)", projEmbedSize, modelEmbedSize)
-	}
-
-	return &ClipContext{c: c}, nil
+	return &MtmdContext{c: c}, nil
 }
 
-func (c *ClipContext) Free() {
-	C.clip_free(c.c)
+func (c *MtmdContext) Free() {
+	C.mtmd_free(c.c)
 }
 
-func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
-	l := C.llava_image_embed_make_with_bytes(c.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
-	if l == nil {
-		return nil, errors.New("unable to make llava embedding from image")
+func (c *MtmdContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
+	// Initialize the input chunks pointer
+	ic := C.mtmd_input_chunks_init()
+	defer C.mtmd_input_chunks_free(ic)
+
+	// Initialize an empty text prompt so we can tokenize
+	it := C.mtmd_input_text_init(C.mtmd_default_marker(), true, true)
+	defer C.mtmd_input_text_free(it)
+
+	// Initialize a bitmap with the image data
+	bm := C.mtmd_bitmap_init(C.uint32_t(len(data)/3), C.uint32_t(1), (*C.uchar)(unsafe.Pointer(&data[0])))
+	defer C.mtmd_bitmap_free(bm)
+
+	// Tokenize the image
+	if C.int32_t(0) != C.mtmd_tokenize(c.c, ic, it, &bm, 1) {
+		return nil, errors.New("unable to tokenize mtmd embedding from image")
 	}
 
-	numTokens := int(l.n_image_pos)
+	nChunks := C.mtmd_input_chunks_size(ic)
+	if nChunks != 1 {
+		return nil, errors.New("image-only mtmd input tokenized to multiple chunks!")
+	}
+	chunk := C.mtmd_input_chunks_get(ic, 0)
+
+	// Encode the chunk
+	if C.int32_t(0) != C.mtmd_encode_chunk(c.c, chunk) {
+		return nil, errors.New("unable to encode mtmd image chunk")
+	}
+
+	// Get the embedding
+	embd := C.mtmd_get_output_embd(c.c)
+
+	// Copy embeddings over to go slice
+	numTokens := int(C.mtmd_input_chunk_get_n_tokens(chunk))
 	numEmbed := llamaContext.Model().NEmbd()
-	s := unsafe.Slice((*float32)(l.embed), numEmbed*numTokens)
+	s := unsafe.Slice((*float32)(embd), numEmbed*numTokens)
 	embed := make([][]float32, numTokens)
 	rows := make([]float32, len(s))
 	copy(rows, s)
 	for i := range embed {
 		embed[i] = rows[i*numEmbed : (i+1)*numEmbed]
 	}
-	C.llava_image_embed_free(l)
 	return embed, nil
 }
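
For orientation, a hedged sketch of how the new MtmdContext is meant to be driven (not part of this commit; the import path, function name, and projector path are assumptions, and the layout of data is whatever callers already handed to the old clip path):

package example

import (
	"fmt"

	"github.com/ollama/ollama/llama" // assumed import path
)

// embedImage loads the multimodal projector once, then converts image bytes
// into one embedding row per image token; each row has Model().NEmbd() floats.
func embedImage(ctx *llama.Context, projectorPath string, data []byte) ([][]float32, error) {
	mc, err := llama.NewMtmdContext(ctx, projectorPath) // projectorPath is a placeholder
	if err != nil {
		return nil, err
	}
	defer mc.Free()

	rows, err := mc.NewEmbed(ctx, data)
	if err != nil {
		return nil, fmt.Errorf("mtmd embedding failed: %w", err)
	}
	return rows, nil
}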

View File

@@ -17,7 +17,7 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex
 
-	clip *llama.ClipContext
+	mtmd *llama.MtmdContext
 
 	// cache of images to embeddings
 	images []imageCache
@@ -32,7 +32,7 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
 	var c ImageContext
 	if arch == "clip" {
-		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
+		c.mtmd, err = llama.NewMtmdContext(llamaContext, modelPath)
 	} else {
 		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
 	}
@@ -51,8 +51,8 @@ func (c *ImageContext) Free(modelPath string) {
 		return
 	}
 
-	if c.clip != nil {
-		c.clip.Free()
+	if c.mtmd != nil {
+		c.mtmd.Free()
 	}
 }
@@ -72,8 +72,8 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]f
 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.clip != nil {
-			embed, err = c.clip.NewEmbed(llamaContext, data)
+		if c.mtmd != nil {
+			embed, err = c.mtmd.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
 			}
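
Taken together with the llama.go changes, the runner-facing flow keeps its shape: construction still keys off arch == "clip" in the model metadata, and the image cache is consulted before any encoding. A sketch of that call path as it might appear inside the runner package; the package name, helper name, and the assumption that the truncated signatures above return (*ImageContext, error) and ([][]float32, error) are mine, not the commit's:

package llamarunner // assumed: the package that defines ImageContext

import "github.com/ollama/ollama/llama" // assumed import path

// embedWithCache sketches the unchanged external flow; on a cache miss,
// NewEmbed now delegates to c.mtmd.NewEmbed instead of c.clip.NewEmbed.
func embedWithCache(lc *llama.Context, projPath string, img []byte) ([][]float32, error) {
	ic, err := NewImageContext(lc, projPath) // projPath is a placeholder
	if err != nil {
		return nil, err
	}
	defer ic.Free(projPath)

	return ic.NewEmbed(lc, img)
}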