image processing

Bruce MacDonald 2025-04-02 14:28:27 -07:00
parent ace6176af9
commit 733630a491
5 changed files with 266 additions and 92 deletions

View File

@@ -1,74 +0,0 @@
package qwen25vl
import (
"fmt"
"image"
_ "image/jpeg"
_ "image/png"
"io"
"math"
"github.com/ollama/ollama/model/imageproc"
)
const (
DefaultFactor = 28
DefaultMinPixels = 56 * 56
DefaultMaxPixels = 14 * 14 * 4 * 1280
)
// smartResize calculates the size of the image to resize to based on the
// factor, minPixels, and maxPixels.
func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
// 1. Both dimensions of size are divisible by factor
// 2. The area of the image is between minPixels and maxPixels
// 3. The aspect ratio of the image is as close to 1:1 as possible
if size.Y < factor || size.X < factor {
panic("image is too small to resize")
} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
panic("aspect ratio must be less than 200:1")
}
f := float64(factor)
width := float64(size.X)
height := float64(size.Y)
xBar := math.Round(width/f) * f
yBar := math.Round(height/f) * f
if xBar*yBar > float64(maxPixels) {
beta := math.Sqrt(height * width / float64(maxPixels))
xBar = math.Floor(width/beta/f) * f
yBar = math.Floor(height/beta/f) * f
} else if xBar*yBar < float64(minPixels) {
beta := math.Sqrt(float64(minPixels) / (height * width))
xBar = math.Ceil(width*beta/f) * f
yBar = math.Ceil(height*beta/f) * f
}
return image.Point{int(xBar), int(yBar)}
}
func resizeImage(img image.Image, format string, size image.Point) image.Image {
if format == "png" {
img = imageproc.Composite(img)
}
return imageproc.Resize(img, size, imageproc.ResizeBilinear)
}
func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
img, format, err := image.Decode(imageData)
if err != nil {
return nil, nil, fmt.Errorf("failed to decode image: %w", err)
}
size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
img = resizeImage(img, format, size)
data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
opts := map[string]any{}
return data, opts, nil
}

View File

@@ -1,6 +1,10 @@
package qwen25vl
import (
"bytes"
"fmt"
"image"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
@@ -13,7 +17,7 @@ type Model struct {
// *VisionModel `gguf:"v,vision"`
// *MultiModalProjector `gguf:"mm"`
- // ImageProcessor
+ ImageProcessor
}
// Implement MultimodalProcessor interface
@@ -23,7 +27,7 @@ func New(c ml.Config) (model.Model, error) {
m := &Model{
TextModel: NewTextModel(c),
// VisionModel: newVisionModel(c),
- // ImageProcessor: newImageProcessor(c),
+ ImageProcessor: newImageProcessor(c),
// MultiModalProjector: newMultiModalProjector(c),
}
@@ -33,12 +37,102 @@ func New(c ml.Config) (model.Model, error) {
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
// if len(m.VisionModel.Layers) == 0 {
// return nil, model.ErrNoVisionModel
// }
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, err
}
f32s, err := m.ImageProcessor.ProcessImage(img)
if err != nil {
return nil, err
}
pixelValues, err := ctx.Input().FromFloatSlice(f32s,
m.ImageProcessor.imageSize,
m.ImageProcessor.imageSize,
m.ImageProcessor.numChannels,
)
if err != nil {
return nil, err
}
fmt.Println("pixelValues", pixelValues) // temporary debug output while the vision forward pass is stubbed out
return nil, nil
// visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
// visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
// return visionOutputs, nil
}
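Note: ProcessImage returns a flat float32 slice whose length follows the smartResize output, while the tensor above is built with fixed imageSize dimensions. A minimal guard that could sit between those two calls, shown here only as a hedged sketch (it is not part of this commit):

    want := m.ImageProcessor.imageSize * m.ImageProcessor.imageSize * m.ImageProcessor.numChannels
    if len(f32s) != want {
        // Catch shape mismatches before building the tensor
        return nil, fmt.Errorf("pixel buffer has %d floats, want %d", len(f32s), want)
    }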
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
- return inputs, nil
var result []input.Input
// Get image token IDs from config
// imageToken := m.Config.Uint("image_token_id")
// visionStartToken := m.Config.Uint("vision_start_token_id")
// visionEndToken := m.Config.Uint("vision_end_token_id")
imageToken := 151655
visionStartToken := 151652
visionEndToken := 151653
// Get merge size from vision config
// mergeSize := m.Config.Uint("vision_config.spatial_merge_size")
// patchSize := m.Config.Uint("vision_config.spatial_patch_size")
// windowSize := m.Config.Uint("vision_config.window_size")
mergeSize := 2
patchSize := 14
windowSize := 112
// Calculate grid dimensions
// The total patches per dimension = window_size / patch_size
patchesPerDim := windowSize / patchSize
// Grid size after merging = patches per dimension / merge_size
gridSize := patchesPerDim / mergeSize
// Calculate tokens per grid
tokensPerGrid := gridSize * gridSize
for _, inp := range inputs {
if inp.Multimodal == nil {
// If not a multimodal input, add it to the result unchanged
result = append(result, inp)
} else if inp.Token == int32(imageToken) {
// This is an image token
inputMultimodal := inp.Multimodal.(ml.Tensor)
// Replace the image token with multiple placeholder tokens
// First add the vision start token
result = append(result, input.Input{Token: int32(visionStartToken)})
// Then add the multimodal tensor data at the first position
result = append(result,
input.Input{
Multimodal: inputMultimodal,
MultimodalHash: inp.MultimodalHash,
})
// Then add the placeholder tokens for the remaining positions
// We subtract 1 from tokensPerGrid because we already added the first token
placeholders := tokensPerGrid - 1
for range placeholders {
result = append(result, input.Input{Token: int32(imageToken)})
}
// Finally add the vision end token
result = append(result, input.Input{Token: int32(visionEndToken)})
} else {
// For any other token, just pass through
result = append(result, inp)
}
}
return result, nil
}
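With the hardcoded defaults above, tokensPerGrid works out to (112/14/2)^2 = 16, so each image token expands into 18 inputs: one vision start token, one tensor-carrying input, 15 placeholder image tokens, and one vision end token. A standalone sketch of the same arithmetic:

    package main

    import "fmt"

    func main() {
        // Defaults hardcoded in PostTokenize above
        mergeSize, patchSize, windowSize := 2, 14, 112

        patchesPerDim := windowSize / patchSize // 112 / 14 = 8
        gridSize := patchesPerDim / mergeSize   // 8 / 2 = 4
        tokensPerGrid := gridSize * gridSize    // 4 * 4 = 16

        // start token + tensor slot + (tokensPerGrid-1) placeholders + end token
        fmt.Println(tokensPerGrid + 2) // 18
    }

This matches the test below, where four inputs grow to 21: three pass-through tokens plus the 18-entry image expansion.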
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {

View File

@@ -0,0 +1,59 @@
package qwen25vl
import (
"testing"
"github.com/ollama/ollama/ml/backend/ggml"
"github.com/ollama/ollama/model/input"
)
func TestPostTokenize(t *testing.T) {
// Set up test inputs
model := &Model{}
mockHash := uint64(12345678)
inputs := []input.Input{
{Token: 123}, // Regular token
{Token: 456}, // Regular token
{Token: 151655, Multimodal: &ggml.Tensor{}, MultimodalHash: mockHash}, // Image token
{Token: 789}, // Regular token
}
// Run the function being tested
result, err := model.PostTokenize(inputs)
if err != nil {
t.Fatalf("PostTokenize returned error: %v", err)
}
// Verify the actual length first
expectedLength := 21
if len(result) != expectedLength {
t.Fatalf("Result has wrong length: got %d, expected %d", len(result), expectedLength)
}
// Check key positions only
checkPositions := map[int]int32{
0: 123, // First regular token
1: 456, // Second regular token
2: 151652, // Vision start token
4: 151655, // First placeholder token
19: 151653, // Vision end token
20: 789, // Final regular token
}
for pos, expectedToken := range checkPositions {
if pos >= len(result) {
t.Errorf("Position %d is out of bounds (result length: %d)", pos, len(result))
continue
}
if result[pos].Token != expectedToken {
t.Errorf("Position %d: expected token %d, got %d", pos, expectedToken, result[pos].Token)
}
}
// Check multimodal data is preserved
if result[3].MultimodalHash != mockHash {
t.Errorf("Multimodal hash not preserved: got %d, expected %d",
result[3].MultimodalHash, mockHash)
}
}

View File

@@ -0,0 +1,84 @@
package qwen25vl
import (
"image"
"math"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/imageproc"
)
type ImageProcessor struct {
imageSize, numChannels int
factor, minPixels, maxPixels int
}
func newImageProcessor(c ml.Config) ImageProcessor {
return ImageProcessor{
imageSize: int(c.Uint("vision.image_size")),
numChannels: 3, // RGB channels
factor: 28,
minPixels: 56 * 56,
maxPixels: 14 * 14 * 4 * 1280,
}
}
// smartResize calculates the size of the image to resize to based on the
// factor, minPixels, and maxPixels.
func (p *ImageProcessor) smartResize(size image.Point) image.Point {
// 1. Both dimensions of size are divisible by factor
// 2. The area of the image is between minPixels and maxPixels
// 3. The aspect ratio of the image is as close to 1:1 as possible
if size.Y < p.factor || size.X < p.factor {
panic("image is too small to resize")
} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
panic("aspect ratio must be less than 200:1")
}
f := float64(p.factor)
width := float64(size.X)
height := float64(size.Y)
xBar := math.Round(width/f) * f
yBar := math.Round(height/f) * f
if xBar*yBar > float64(p.maxPixels) {
beta := math.Sqrt(height * width / float64(p.maxPixels))
xBar = math.Floor(width/beta/f) * f
yBar = math.Floor(height/beta/f) * f
} else if xBar*yBar < float64(p.minPixels) {
beta := math.Sqrt(float64(p.minPixels) / (height * width))
xBar = math.Ceil(width*beta/f) * f
yBar = math.Ceil(height*beta/f) * f
}
return image.Point{int(xBar), int(yBar)}
}
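As a worked example with the defaults from newImageProcessor (factor 28, maxPixels 14*14*4*1280 = 1,003,520): a 1024x1024 input rounds to 1036x1036, which overshoots maxPixels, so beta = sqrt(1024*1024/1003520) ≈ 1.022 and both sides floor to 35*28 = 980. A 256x256 input rounds to 252x252 and is already in range. A short sketch of those two cases:

    p := ImageProcessor{factor: 28, minPixels: 56 * 56, maxPixels: 14 * 14 * 4 * 1280}

    fmt.Println(p.smartResize(image.Point{X: 1024, Y: 1024})) // (980,980)
    fmt.Println(p.smartResize(image.Point{X: 256, Y: 256}))   // (252,252)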
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
// Heuristic: sample the alpha channel of the first pixel; if it is
// not fully opaque, assume the image has transparency
hasTransparency := false
if _, _, _, a := img.At(0, 0).RGBA(); a < 0xffff {
hasTransparency = true
}
size := p.smartResize(img.Bounds().Max)
// Composite transparent images onto a solid background
if hasTransparency {
img = imageproc.Composite(img)
}
// Resize the image
img = imageproc.Resize(img, size, imageproc.ResizeBilinear)
// Use CLIP normalization values
mean := [3]float32{0.48145466, 0.4578275, 0.40821073} // CLIP mean values
std := [3]float32{0.26862954, 0.26130258, 0.27577711} // CLIP std values
// Normalize and get the data
data := imageproc.Normalize(img, mean, std, true, true)
return data, nil
}
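End to end, the processor is exercised roughly as follows. This is an illustrative sketch in package qwen25vl, mirroring the unit test below rather than code from this commit:

    func ExampleProcessImage() {
        p := ImageProcessor{
            imageSize:   224,
            numChannels: 3,
            factor:      28,
            minPixels:   56 * 56,
            maxPixels:   14 * 14 * 4 * 1280,
        }
        img := image.NewRGBA(image.Rect(0, 0, 256, 256))

        data, err := p.ProcessImage(img)
        if err != nil {
            panic(err)
        }

        // smartResize maps 256x256 to 252x252, flattened RGB
        fmt.Println(len(data)) // 252 * 252 * 3 = 190512
    }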

View File

@@ -1,9 +1,8 @@
package qwen25vl
import (
"bytes"
"image"
"image/png"
_ "image/jpeg" // Register JPEG decoder
"testing"
)
@@ -13,6 +12,15 @@ func TestSmartResize(t *testing.T) {
Expected image.Point
}
// Create an image processor with default values
processor := ImageProcessor{
imageSize: 224, // Example value
numChannels: 3,
factor: 28,
minPixels: 56 * 56,
maxPixels: 14 * 14 * 4 * 1280,
}
cases := []smartResizeCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
@@ -30,38 +38,41 @@ func TestSmartResize(t *testing.T) {
for _, c := range cases {
b := c.TestImage.Bounds().Max
- actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
+ actual := processor.smartResize(b)
if actual != c.Expected {
t.Errorf("expected: %v, actual: %v", c.Expected, actual)
}
}
}
- func TestPreprocess(t *testing.T) {
- type preprocessCase struct {
+ func TestProcessImage(t *testing.T) {
+ type processImageCase struct {
TestImage image.Image
ExpectedLen int
}
- cases := []preprocessCase{
// Create an image processor with default values
processor := ImageProcessor{
imageSize: 224, // Example value
numChannels: 3,
factor: 28,
minPixels: 56 * 56,
maxPixels: 14 * 14 * 4 * 1280,
}
cases := []processImageCase{
{
TestImage: image.NewRGBA(image.Rect(0, 0, 256, 256)),
- ExpectedLen: 252 * 252 * 3 * 1,
+ ExpectedLen: 252 * 252 * 3,
},
{
TestImage: image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
- ExpectedLen: 980 * 980 * 3 * 1,
+ ExpectedLen: 980 * 980 * 3,
},
}
for _, c := range cases {
- var buf bytes.Buffer
- err := png.Encode(&buf, c.TestImage)
- if err != nil {
- t.Fatal(err)
- }
- imgData, _, err := Preprocess(&buf)
+ imgData, err := processor.ProcessImage(c.TestImage)
if err != nil {
t.Fatalf("error processing: %q", err)
}