diff --git a/model/models/qwen25vl/imageproc.go b/model/models/qwen25vl/imageproc.go
deleted file mode 100644
index 136549767..000000000
--- a/model/models/qwen25vl/imageproc.go
+++ /dev/null
@@ -1,74 +0,0 @@
-package qwen25vl
-
-import (
-	"fmt"
-	"image"
-	_ "image/jpeg"
-	_ "image/png"
-	"io"
-	"math"
-
-	"github.com/ollama/ollama/model/imageproc"
-)
-
-const (
-	DefaultFactor    = 28
-	DefaultMinPixels = 56 * 56
-	DefaultMaxPixels = 14 * 14 * 4 * 1280
-)
-
-// smartResize calculates the size of the image to resize to based on the
-// factor, minPixels, and maxPixels.
-func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
-	// 1. Both dimensions of size are divisible by factor
-	// 2. The area of the image is between minPixels and maxPixels
-	// 3. The aspect ratio of the image is as close to 1:1 as possible
-
-	if size.Y < factor || size.X < factor {
-		panic("image is too small to resize")
-	} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
-		panic("aspect ratio must be less than 200:1")
-	}
-
-	f := float64(factor)
-	width := float64(size.X)
-	height := float64(size.Y)
-
-	xBar := math.Round(width/f) * f
-	yBar := math.Round(height/f) * f
-
-	if xBar*yBar > float64(maxPixels) {
-		beta := math.Sqrt(height * width / float64(maxPixels))
-		xBar = math.Floor(width/beta/f) * f
-		yBar = math.Floor(height/beta/f) * f
-	} else if xBar*yBar < float64(minPixels) {
-		beta := math.Sqrt(float64(minPixels) / (height * width))
-		xBar = math.Ceil(width*beta/f) * f
-		yBar = math.Ceil(height*beta/f) * f
-	}
-
-	return image.Point{int(xBar), int(yBar)}
-}
-
-func resizeImage(img image.Image, format string, size image.Point) image.Image {
-	if format == "png" {
-		img = imageproc.Composite(img)
-	}
-
-	return imageproc.Resize(img, size, imageproc.ResizeBilinear)
-}
-
-func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
-	img, format, err := image.Decode(imageData)
-	if err != nil {
-		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
-	}
-
-	size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
-	img = resizeImage(img, format, size)
-
-	data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
-
-	opts := map[string]any{}
-	return data, opts, nil
-}
diff --git a/model/models/qwen25vl/model.go b/model/models/qwen25vl/model.go
index 19f56d906..b05b449b0 100644
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -1,6 +1,10 @@
 package qwen25vl
 
 import (
+	"bytes"
+	"fmt"
+	"image"
+
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
@@ -13,7 +17,7 @@ type Model struct {
 
 	// *VisionModel `gguf:"v,vision"`
 	// *MultiModalProjector `gguf:"mm"`
-	// ImageProcessor
+	ImageProcessor
 }
 
 // Implement MultimodalProcessor interface
@@ -23,7 +27,7 @@ func New(c ml.Config) (model.Model, error) {
 	m := &Model{
 		TextModel: NewTextModel(c),
 		// VisionModel: newVisionModel(c),
-		// ImageProcessor: newImageProcessor(c),
+		ImageProcessor: newImageProcessor(c),
 		// MultiModalProjector: newMultiModalProjector(c),
 	}
 
@@ -33,12 +37,102 @@ func New(c ml.Config) (model.Model, error) {
 }
 
 func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+	// if len(m.VisionModel.Layers) == 0 {
+	// 	return nil, model.ErrNoVisionModel
+	// }
+
+	img, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, err
+	}
+
+	f32s, err := m.ImageProcessor.ProcessImage(img)
+	if err != nil {
+		return nil, err
+	}
+
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.numChannels,
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	fmt.Println("pixelValues", pixelValues) // temporary debug output; the vision forward pass below is not wired up yet
+	return nil, nil
+
+	// visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
+	// visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
+	// return visionOutputs, nil
 }
 
 // PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
-	return inputs, nil
+	var result []input.Input
+
+	// Get image token IDs from config
+	// imageToken := m.Config.Uint("image_token_id")
+	// visionStartToken := m.Config.Uint("vision_start_token_id")
+	// visionEndToken := m.Config.Uint("vision_end_token_id")
+	imageToken := 151655
+	visionStartToken := 151652
+	visionEndToken := 151653
+
+	// Get merge size from vision config
+	// mergeSize := m.Config.Uint("vision_config.spatial_merge_size")
+	// patchSize := m.Config.Uint("vision_config.spatial_patch_size")
+	// windowSize := m.Config.Uint("vision_config.window_size")
+	mergeSize := 2
+	patchSize := 14
+	windowSize := 112
+
+	// Calculate grid dimensions
+	// The total patches per dimension = window_size / patch_size
+	patchesPerDim := windowSize / patchSize
+	// Grid size after merging = patches per dimension / merge_size
+	gridSize := patchesPerDim / mergeSize
+
+	// Calculate tokens per grid
+	tokensPerGrid := gridSize * gridSize
+
+	for _, inp := range inputs {
+		if inp.Multimodal == nil {
+			// If not a multimodal input, add it to the result unchanged
+			result = append(result, inp)
+		} else if inp.Token == int32(imageToken) {
+			// This is an image token
+			inputMultimodal := inp.Multimodal.(ml.Tensor)
+
+			// Replace the image token with multiple placeholder tokens
+			// First add the vision start token
+			result = append(result, input.Input{Token: int32(visionStartToken)})
+
+			// Then add the multimodal tensor data at the first position
+			result = append(result,
+				input.Input{
+					Multimodal:     inputMultimodal,
+					MultimodalHash: inp.MultimodalHash,
+				})
+
+			// Then add the placeholder tokens for the remaining positions
+			// We subtract 1 from tokensPerGrid because we already added the first token
+			placeholders := tokensPerGrid - 1
+			for range placeholders {
+				result = append(result, input.Input{Token: int32(imageToken)})
+			}
+
+			// Finally add the vision end token
+			result = append(result, input.Input{Token: int32(visionEndToken)})
+		} else {
+			// For any other multimodal token, just pass through
+			result = append(result, inp)
+		}
+	}
+
+	return result, nil
 }
 
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
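
The hard-coded vision settings above fully determine how many inputs a single image expands to, which is what the test below asserts. A minimal sketch of that arithmetic, using the window/patch/merge values and token IDs hard-coded in the PostTokenize hunk (the helper name itself is illustrative, not part of the change):

	// tokensPerImageGrid mirrors the grid math in PostTokenize.
	func tokensPerImageGrid(windowSize, patchSize, mergeSize int) int {
		patchesPerDim := windowSize / patchSize // 112 / 14 = 8
		gridSize := patchesPerDim / mergeSize   // 8 / 2 = 4
		return gridSize * gridSize              // 4 * 4 = 16
	}

With 16 tokens per grid, one image becomes 18 inputs: a vision start token (151652), the tensor-carrying input, 15 placeholder image tokens (151655), and a vision end token (151653). A prompt of three regular tokens plus one image therefore yields 21 inputs.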
diff --git a/model/models/qwen25vl/model_test.go b/model/models/qwen25vl/model_test.go
new file mode 100644
index 000000000..b9e590a90
--- /dev/null
+++ b/model/models/qwen25vl/model_test.go
@@ -0,0 +1,59 @@
+package qwen25vl
+
+import (
+	"testing"
+
+	"github.com/ollama/ollama/ml/backend/ggml"
+	"github.com/ollama/ollama/model/input"
+)
+
+func TestPostTokenize(t *testing.T) {
+	// Set up test inputs
+	model := &Model{}
+	mockHash := uint64(12345678)
+
+	inputs := []input.Input{
+		{Token: 123}, // Regular token
+		{Token: 456}, // Regular token
+		{Token: 151655, Multimodal: &ggml.Tensor{}, MultimodalHash: mockHash}, // Image token
+		{Token: 789}, // Regular token
+	}
+
+	// Run the function being tested
+	result, err := model.PostTokenize(inputs)
+	if err != nil {
+		t.Fatalf("PostTokenize returned error: %v", err)
+	}
+
+	// Verify the actual length first
+	expectedLength := 21
+	if len(result) != expectedLength {
+		t.Fatalf("Result has wrong length: got %d, expected %d", len(result), expectedLength)
+	}
+
+	// Check key positions only
+	checkPositions := map[int]int32{
+		0:  123,    // First regular token
+		1:  456,    // Second regular token
+		2:  151652, // Vision start token
+		4:  151655, // First placeholder token
+		19: 151653, // Vision end token
+		20: 789,    // Final regular token
+	}
+
+	for pos, expectedToken := range checkPositions {
+		if pos >= len(result) {
+			t.Errorf("Position %d is out of bounds (result length: %d)", pos, len(result))
+			continue
+		}
+		if result[pos].Token != expectedToken {
+			t.Errorf("Position %d: expected token %d, got %d", pos, expectedToken, result[pos].Token)
+		}
+	}
+
+	// Check multimodal data is preserved
+	if result[3].MultimodalHash != mockHash {
+		t.Errorf("Multimodal hash not preserved: got %d, expected %d",
+			result[3].MultimodalHash, mockHash)
+	}
+}
diff --git a/model/models/qwen25vl/process_image.go b/model/models/qwen25vl/process_image.go
new file mode 100644
index 000000000..b6a32fe38
--- /dev/null
+++ b/model/models/qwen25vl/process_image.go
@@ -0,0 +1,84 @@
+package qwen25vl
+
+import (
+	"image"
+	"math"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+type ImageProcessor struct {
+	imageSize, numChannels       int
+	factor, minPixels, maxPixels int
+}
+
+func newImageProcessor(c ml.Config) ImageProcessor {
+	return ImageProcessor{
+		imageSize:   int(c.Uint("vision.image_size")),
+		numChannels: 3, // RGB channels
+		factor:      28,
+		minPixels:   56 * 56,
+		maxPixels:   14 * 14 * 4 * 1280,
+	}
+}
+
+// smartResize calculates the size of the image to resize to based on the
+// factor, minPixels, and maxPixels.
+func (p *ImageProcessor) smartResize(size image.Point) image.Point {
+	// 1. Both dimensions of size are divisible by factor
+	// 2. The area of the image is between minPixels and maxPixels
+	// 3. The aspect ratio of the image is as close to 1:1 as possible
+
+	if size.Y < p.factor || size.X < p.factor {
+		panic("image is too small to resize")
+	} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
+		panic("aspect ratio must be less than 200:1")
+	}
+
+	f := float64(p.factor)
+	width := float64(size.X)
+	height := float64(size.Y)
+
+	xBar := math.Round(width/f) * f
+	yBar := math.Round(height/f) * f
+
+	if xBar*yBar > float64(p.maxPixels) {
+		beta := math.Sqrt(height * width / float64(p.maxPixels))
+		xBar = math.Floor(width/beta/f) * f
+		yBar = math.Floor(height/beta/f) * f
+	} else if xBar*yBar < float64(p.minPixels) {
+		beta := math.Sqrt(float64(p.minPixels) / (height * width))
+		xBar = math.Ceil(width*beta/f) * f
+		yBar = math.Ceil(height*beta/f) * f
+	}
+
+	return image.Point{int(xBar), int(yBar)}
+}
+
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
+	// Check the top-left pixel's alpha as a cheap transparency heuristic
+	hasTransparency := false
+	if _, _, _, a := img.At(0, 0).RGBA(); a < 0xffff {
+		hasTransparency = true
+	}
+
+	size := p.smartResize(img.Bounds().Max)
+
+	// Composite transparent images onto an opaque background
+	if hasTransparency {
+		img = imageproc.Composite(img)
+	}
+
+	// Resize the image
+	img = imageproc.Resize(img, size, imageproc.ResizeBilinear)
+
+	// Use CLIP normalization values
+	mean := [3]float32{0.48145466, 0.4578275, 0.40821073} // CLIP mean values
+	std := [3]float32{0.26862954, 0.26130258, 0.27577711} // CLIP std values
+
+	// Normalize and get the data
+	data := imageproc.Normalize(img, mean, std, true, true)
+
+	return data, nil
+}
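
Two worked examples with these defaults, matching the expectations in the test below. A 256x256 image rounds each side to the nearest multiple of 28 (round(256/28) = 9, and 9*28 = 252); 252*252 = 63,504 pixels already lies between minPixels (3,136) and maxPixels (1,003,520), so the result is 252x252. A 2000x2000 image would round to 1988x1988 (3,952,144 pixels), which exceeds maxPixels, so beta = sqrt(2000*2000/1003520) ~= 1.997 and each side becomes floor(2000/beta/28) * 28 = 35 * 28 = 980.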
diff --git a/model/models/qwen25vl/imageproc_test.go b/model/models/qwen25vl/process_image_test.go
similarity index 61%
rename from model/models/qwen25vl/imageproc_test.go
rename to model/models/qwen25vl/process_image_test.go
index f088d12e8..b3af8dcdf 100644
--- a/model/models/qwen25vl/imageproc_test.go
+++ b/model/models/qwen25vl/process_image_test.go
@@ -1,9 +1,8 @@
 package qwen25vl
 
 import (
-	"bytes"
 	"image"
-	"image/png"
+	_ "image/jpeg" // Register JPEG decoder
 	"testing"
 )
 
@@ -13,6 +12,15 @@ func TestSmartResize(t *testing.T) {
 		Expected  image.Point
 	}
 
+	// Create an image processor with default values
+	processor := ImageProcessor{
+		imageSize:   224, // Example value
+		numChannels: 3,
+		factor:      28,
+		minPixels:   56 * 56,
+		maxPixels:   14 * 14 * 4 * 1280,
+	}
+
 	cases := []smartResizeCase{
 		{
 			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
@@ -30,38 +38,41 @@
 	for _, c := range cases {
 		b := c.TestImage.Bounds().Max
-		actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
+		actual := processor.smartResize(b)
 		if actual != c.Expected {
 			t.Errorf("expected: %v, actual: %v", c.Expected, actual)
 		}
 	}
 }
 
-func TestPreprocess(t *testing.T) {
-	type preprocessCase struct {
+func TestProcessImage(t *testing.T) {
+	type processImageCase struct {
 		TestImage   image.Image
 		ExpectedLen int
 	}
 
-	cases := []preprocessCase{
+	// Create an image processor with default values
+	processor := ImageProcessor{
+		imageSize:   224, // Example value
+		numChannels: 3,
+		factor:      28,
+		minPixels:   56 * 56,
+		maxPixels:   14 * 14 * 4 * 1280,
+	}
+
+	cases := []processImageCase{
 		{
 			TestImage:   image.NewRGBA(image.Rect(0, 0, 256, 256)),
-			ExpectedLen: 252 * 252 * 3 * 1,
+			ExpectedLen: 252 * 252 * 3,
 		},
 		{
 			TestImage:   image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
-			ExpectedLen: 980 * 980 * 3 * 1,
+			ExpectedLen: 980 * 980 * 3,
 		},
 	}
 
 	for _, c := range cases {
-		var buf bytes.Buffer
-		err := png.Encode(&buf, c.TestImage)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		imgData, _, err := Preprocess(&buf)
+		imgData, err := processor.ProcessImage(c.TestImage)
 		if err != nil {
 			t.Fatalf("error processing: %q", err)
 		}
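
For reference, a rough sketch of how the processor is driven end to end, mirroring EncodeMultimodal above. The helper below is hypothetical and not part of the change: it fills in by hand the same defaults that newImageProcessor would normally read from the model config ("vision.image_size"), and assumes a PNG input for decoder registration.

	package qwen25vl

	import (
		"image"
		_ "image/png" // register a decoder; adjust to the input format
		"os"
	)

	// exampleProcess is a hypothetical helper, not part of the diff above.
	func exampleProcess(path string) ([]float32, error) {
		f, err := os.Open(path)
		if err != nil {
			return nil, err
		}
		defer f.Close()

		img, _, err := image.Decode(f)
		if err != nil {
			return nil, err
		}

		p := ImageProcessor{
			imageSize:   224, // illustrative; the real value comes from vision.image_size
			numChannels: 3,
			factor:      28,
			minPixels:   56 * 56,
			maxPixels:   14 * 14 * 4 * 1280,
		}

		// Returns width*height*3 floats after smartResize and CLIP normalization,
		// e.g. 252*252*3 for a 256x256 input.
		return p.ProcessImage(img)
	}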