image processing

Bruce MacDonald 2025-04-02 14:28:27 -07:00
parent ace6176af9
commit 733630a491
5 changed files with 266 additions and 92 deletions

View File

@@ -1,74 +0,0 @@
package qwen25vl
import (
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"math"
"github.com/ollama/ollama/model/imageproc"
)
const (
DefaultFactor = 28
DefaultMinPixels = 56 * 56
DefaultMaxPixels = 14 * 14 * 4 * 1280
)
// smartResize calculates the size of the image to resize to based on the
// factor, minPixels, and maxPixels.
func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
// 1. Both dimensions of size are divisible by factor
// 2. The area of the image is between minPixels and maxPixels
// 3. The aspect ratio of the image is as close to 1:1 as possible
if size.Y < factor || size.X < factor {
panic("image is too small to resize")
} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
panic("aspect ratio must be less than 200:1")
}
f := float64(factor)
width := float64(size.X)
height := float64(size.Y)
xBar := math.Round(width/f) * f
yBar := math.Round(height/f) * f
if xBar*yBar > float64(maxPixels) {
beta := math.Sqrt(height * width / float64(maxPixels))
xBar = math.Floor(width/beta/f) * f
yBar = math.Floor(height/beta/f) * f
} else if xBar*yBar < float64(minPixels) {
beta := math.Sqrt(float64(minPixels) / (height * width))
xBar = math.Ceil(width*beta/f) * f
yBar = math.Ceil(height*beta/f) * f
}
return image.Point{int(xBar), int(yBar)}
}
func resizeImage(img image.Image, format string, size image.Point) image.Image {
if format == "png" {
img = imageproc.Composite(img)
}
return imageproc.Resize(img, size, imageproc.ResizeBilinear)
}
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
img, format, err := image.Decode(imageData)
if err != nil {
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
}
size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
img = resizeImage(img, format, size)
data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
opts := map[string]any{}
return data, opts, nil
}

View File

@@ -1,6 +1,10 @@
package qwen25vl
import (
"bytes"
"fmt"
"image"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
@@ -13,7 +17,7 @@ type Model struct {
// *VisionModel `gguf:"v,vision"`
// *MultiModalProjector `gguf:"mm"`
- // ImageProcessor
+ ImageProcessor
}
// Implement MultimodalProcessor interface
@@ -23,7 +27,7 @@ func New(c ml.Config) (model.Model, error) {
m := &Model{
TextModel: NewTextModel(c),
// VisionModel: newVisionModel(c),
- // ImageProcessor: newImageProcessor(c),
+ ImageProcessor: newImageProcessor(c),
// MultiModalProjector: newMultiModalProjector(c),
}
@@ -33,12 +37,102 @@ func New(c ml.Config) (model.Model, error) {
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
// if len(m.VisionModel.Layers) == 0 {
// return nil, model.ErrNoVisionModel
// }
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, err
}
f32s, err := m.ImageProcessor.ProcessImage(img)
if err != nil {
return nil, err
}
pixelValues, err := ctx.Input().FromFloatSlice(f32s,
m.ImageProcessor.imageSize,
m.ImageProcessor.imageSize,
m.ImageProcessor.numChannels,
)
if err != nil {
return nil, err
}
fmt.Println("pixelValues", pixelValues) // temporary debug output while the vision forward pass is stubbed out
return nil, nil
// visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
// visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
// return visionOutputs, nil
}
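Note: ProcessImage returns a flat float32 slice whose length follows the smartResize output, while the tensor above is built with fixed imageSize dimensions. A minimal guard that could sit between those two calls, shown here only as a hedged sketch (it is not part of this commit):

    want := m.ImageProcessor.imageSize * m.ImageProcessor.imageSize * m.ImageProcessor.numChannels
    if len(f32s) != want {
        // Catch shape mismatches before building the tensor
        return nil, fmt.Errorf("pixel buffer has %d floats, want %d", len(f32s), want)
    }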
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
- return inputs, nil
var result []input.Input
// Get image token IDs from config
// imageToken := m.Config.Uint("image_token_id")
// visionStartToken := m.Config.Uint("vision_start_token_id")
// visionEndToken := m.Config.Uint("vision_end_token_id")
imageToken := 151655
visionStartToken := 151652
visionEndToken := 151653
// Get merge size from vision config
// mergeSize := m.Config.Uint("vision_config.spatial_merge_size")
// patchSize := m.Config.Uint("vision_config.spatial_patch_size")
// windowSize := m.Config.Uint("vision_config.window_size")
mergeSize := 2
patchSize := 14
windowSize := 112
// Calculate grid dimensions
// The total patches per dimension = window_size / patch_size
patchesPerDim := windowSize / patchSize
// Grid size after merging = patches per dimension / merge_size
gridSize := patchesPerDim / mergeSize
// Calculate tokens per grid
tokensPerGrid := gridSize * gridSize
for _, inp := range inputs {
if inp.Multimodal == nil {
// If not a multimodal input, add it to the result unchanged
result = append(result, inp)
} else if inp.Token == int32(imageToken) {
// This is an image token
inputMultimodal := inp.Multimodal.(ml.Tensor)
// Replace the image token with multiple placeholder tokens
// First add the vision start token
result = append(result, input.Input{Token: int32(visionStartToken)})
// Then add the multimodal tensor data at the first position
result = append(result,
input.Input{
Multimodal: inputMultimodal,
MultimodalHash: inp.MultimodalHash,
})
// Then add the placeholder tokens for the remaining positions
// We subtract 1 from tokensPerGrid because we already added the first token
placeholders := tokensPerGrid - 1
for range placeholders {
result = append(result, input.Input{Token: int32(imageToken)})
}
// Finally add the vision end token
result = append(result, input.Input{Token: int32(visionEndToken)})
} else {
// For any other token, just pass through
result = append(result, inp)
}
}
return result, nil
}
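With the hardcoded defaults above, tokensPerGrid works out to (112/14/2)^2 = 16, so each image token expands into 18 inputs: one vision start token, one tensor-carrying input, 15 placeholder image tokens, and one vision end token. A standalone sketch of the same arithmetic:

    package main

    import "fmt"

    func main() {
        // Defaults hardcoded in PostTokenize above
        mergeSize, patchSize, windowSize := 2, 14, 112

        patchesPerDim := windowSize / patchSize // 112 / 14 = 8
        gridSize := patchesPerDim / mergeSize   // 8 / 2 = 4
        tokensPerGrid := gridSize * gridSize    // 4 * 4 = 16

        // start token + tensor slot + (tokensPerGrid-1) placeholders + end token
        fmt.Println(tokensPerGrid + 2) // 18
    }

This matches the test below, where four inputs grow to 21: three pass-through tokens plus the 18-entry image expansion.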
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {

View File

@@ -0,0 +1,59 @@
package qwen25vl
import (
"testing"
"github.com/ollama/ollama/ml/backend/ggml"
"github.com/ollama/ollama/model/input"
)
func TestPostTokenize(t *testing.T) {
// Set up test inputs
model := &Model{}
mockHash := uint64(12345678)
inputs := []input.Input{
{Token: 123}, // Regular token
{Token: 456}, // Regular token
{Token: 151655, Multimodal: &ggml.Tensor{}, MultimodalHash: mockHash}, // Image token
{Token: 789}, // Regular token
}
// Run the function being tested
result, err := model.PostTokenize(inputs)
if err != nil {
t.Fatalf("PostTokenize returned error: %v", err)
}
// Verify the actual length first
expectedLength := 21
if len(result) != expectedLength {
t.Fatalf("Result has wrong length: got %d, expected %d", len(result), expectedLength)
}
// Check key positions only
checkPositions := map[int]int32{
0: 123, // First regular token
1: 456, // Second regular token
2: 151652, // Vision start token
4: 151655, // First placeholder token
19: 151653, // Vision end token
20: 789, // Final regular token
}
for pos, expectedToken := range checkPositions {
if pos >= len(result) {
t.Errorf("Position %d is out of bounds (result length: %d)", pos, len(result))
continue
}
if result[pos].Token != expectedToken {
t.Errorf("Position %d: expected token %d, got %d", pos, expectedToken, result[pos].Token)
}
}
// Check multimodal data is preserved
if result[3].MultimodalHash != mockHash {
t.Errorf("Multimodal hash not preserved: got %d, expected %d",
result[3].MultimodalHash, mockHash)
}
}

View File

@@ -0,0 +1,84 @@
package qwen25vl
import (
"image"
"math"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/imageproc"
)
type ImageProcessor struct {
imageSize, numChannels int
factor, minPixels, maxPixels int
}
func newImageProcessor(c ml.Config) ImageProcessor {
return ImageProcessor{
imageSize: int(c.Uint("vision.image_size")),
numChannels: 3, // RGB channels
factor: 28,
minPixels: 56 * 56,
maxPixels: 14 * 14 * 4 * 1280,
}
}
// smartResize calculates the size of the image to resize to based on the
// factor, minPixels, and maxPixels.
func (p *ImageProcessor) smartResize(size image.Point) image.Point {
// 1. Both dimensions of size are divisible by factor
// 2. The area of the image is between minPixels and maxPixels
// 3. The aspect ratio of the image is as close to 1:1 as possible
if size.Y < p.factor || size.X < p.factor {
panic("image is too small to resize")
} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
panic("aspect ratio must be less than 200:1")
}
f := float64(p.factor)
width := float64(size.X)
height := float64(size.Y)
xBar := math.Round(width/f) * f
yBar := math.Round(height/f) * f
if xBar*yBar > float64(p.maxPixels) {
beta := math.Sqrt(height * width / float64(p.maxPixels))
xBar = math.Floor(width/beta/f) * f
yBar = math.Floor(height/beta/f) * f
} else if xBar*yBar < float64(p.minPixels) {
beta := math.Sqrt(float64(p.minPixels) / (height * width))
xBar = math.Ceil(width*beta/f) * f
yBar = math.Ceil(height*beta/f) * f
}
return image.Point{int(xBar), int(yBar)}
}
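As a worked example with the defaults from newImageProcessor (factor 28, maxPixels 14*14*4*1280 = 1,003,520): a 1024x1024 input rounds to 1036x1036, which overshoots maxPixels, so beta = sqrt(1024*1024/1003520) ≈ 1.022 and both sides floor to 35*28 = 980. A 256x256 input rounds to 252x252 and is already in range. A short sketch of those two cases:

    p := ImageProcessor{factor: 28, minPixels: 56 * 56, maxPixels: 14 * 14 * 4 * 1280}

    fmt.Println(p.smartResize(image.Point{X: 1024, Y: 1024})) // (980,980)
    fmt.Println(p.smartResize(image.Point{X: 256, Y: 256}))   // (252,252)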
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
// Heuristic: sample the alpha channel of the first pixel; if it is
// not fully opaque, assume the image has transparency
hasTransparency := false
if _, _, _, a := img.At(0, 0).RGBA(); a < 0xffff {
hasTransparency = true
}
size := p.smartResize(img.Bounds().Max)
// Composite transparent images onto a solid background
if hasTransparency {
img = imageproc.Composite(img)
}
// Resize the image
img = imageproc.Resize(img, size, imageproc.ResizeBilinear)
// Use CLIP normalization values
mean := [3]float32{0.48145466, 0.4578275, 0.40821073} // CLIP mean values
std := [3]float32{0.26862954, 0.26130258, 0.27577711} // CLIP std values
// Normalize and get the data
data := imageproc.Normalize(img, mean, std, true, true)
return data, nil
}
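End to end, the processor is exercised roughly as follows. This is an illustrative sketch in package qwen25vl, mirroring the unit test below rather than code from this commit:

    func ExampleProcessImage() {
        p := ImageProcessor{
            imageSize:   224,
            numChannels: 3,
            factor:      28,
            minPixels:   56 * 56,
            maxPixels:   14 * 14 * 4 * 1280,
        }
        img := image.NewRGBA(image.Rect(0, 0, 256, 256))

        data, err := p.ProcessImage(img)
        if err != nil {
            panic(err)
        }

        // smartResize maps 256x256 to 252x252, flattened RGB
        fmt.Println(len(data)) // 252 * 252 * 3 = 190512
    }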

View File

@@ -1,9 +1,8 @@
package qwen25vl
import (
"bytes"
"image"
"image/png"
_ "image/jpeg" // Register JPEG decoder
"testing"
)
@@ -13,6 +12,15 @@ func TestSmartResize(t *testing.T) {
Expected image.Point
}
// Create an image processor with default values
processor := ImageProcessor{
imageSize: 224, // Example value
numChannels: 3,
factor: 28,
minPixels: 56 * 56,
maxPixels: 14 * 14 * 4 * 1280,
}
cases := []smartResizeCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
@@ -30,38 +38,41 @@ func TestSmartResize(t *testing.T) {
for _, c := range cases {
b := c.TestImage.Bounds().Max
- actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
+ actual := processor.smartResize(b)
if actual != c.Expected {
t.Errorf("expected: %v, actual: %v", c.Expected, actual)
}
}
}
- func TestPreprocess(t *testing.T) {
- type preprocessCase struct {
+ func TestProcessImage(t *testing.T) {
+ type processImageCase struct {
TestImage image.Image
ExpectedLen int
}
- cases := []preprocessCase{
// Create an image processor with default values
processor := ImageProcessor{
imageSize: 224, // Example value
numChannels: 3,
factor: 28,
minPixels: 56 * 56,
maxPixels: 14 * 14 * 4 * 1280,
}
cases := []processImageCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 256, 256)),
- ExpectedLen: 252 * 252 * 3 * 1,
+ ExpectedLen: 252 * 252 * 3,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
- ExpectedLen: 980 * 980 * 3 * 1,
+ ExpectedLen: 980 * 980 * 3,
},
}
for _, c := range cases {
- var buf bytes.Buffer
- err := png.Encode(&buf, c.TestImage)
- if err != nil {
- t.Fatal(err)
- }
- imgData, _, err := Preprocess(&buf)
+ imgData, err := processor.ProcessImage(c.TestImage)
if err != nil {
t.Fatalf("error processing: %q", err)
}