Commit: image processing
(mirror of https://github.com/ollama/ollama.git, synced 2025-07-15 02:12:53 +02:00)
Deleted file (the package-level preprocessing this commit replaces):
@@ -1,74 +0,0 @@
-package qwen25vl
-
-import (
-	"fmt"
-	"image"
-	_ "image/jpeg"
-	_ "image/png"
-	"io"
-	"math"
-
-	"github.com/ollama/ollama/model/imageproc"
-)
-
-const (
-	DefaultFactor    = 28
-	DefaultMinPixels = 56 * 56
-	DefaultMaxPixels = 14 * 14 * 4 * 1280
-)
-
-// smartResize calculates the size of the image to resize to based on the
-// factor, minPixels, and maxPixels.
-func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
-	// 1. Both dimensions of size are divisible by factor
-	// 2. The area of the image is between minPixels and maxPixels
-	// 3. The aspect ratio of the image is as close to 1:1 as possible
-
-	if size.Y < factor || size.X < factor {
-		panic("image is too small to resize")
-	} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
-		panic("aspect ratio must be less than 200:1")
-	}
-
-	f := float64(factor)
-	width := float64(size.X)
-	height := float64(size.Y)
-
-	xBar := math.Round(width/f) * f
-	yBar := math.Round(height/f) * f
-
-	if xBar*yBar > float64(maxPixels) {
-		beta := math.Sqrt(height * width / float64(maxPixels))
-		xBar = math.Floor(width/beta/f) * f
-		yBar = math.Floor(height/beta/f) * f
-	} else if xBar*yBar < float64(minPixels) {
-		beta := math.Sqrt(float64(minPixels) / (height * width))
-		xBar = math.Ceil(width*beta/f) * f
-		yBar = math.Ceil(height*beta/f) * f
-	}
-
-	return image.Point{int(xBar), int(yBar)}
-}
-
-func resizeImage(img image.Image, format string, size image.Point) image.Image {
-	if format == "png" {
-		img = imageproc.Composite(img)
-	}
-
-	return imageproc.Resize(img, size, imageproc.ResizeBilinear)
-}
-
-func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
-	img, format, err := image.Decode(imageData)
-	if err != nil {
-		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
-	}
-
-	size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
-	img = resizeImage(img, format, size)
-
-	data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
-
-	opts := map[string]any{}
-	return data, opts, nil
-}
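Since the tests later in this diff assert specific output sizes, here is a standalone sketch (not part of the commit) that re-derives the two expected values; the helper copies the smartResize rounding logic with the guard clauses omitted.

package main

import (
	"fmt"
	"math"
)

// Re-implementation of the smartResize arithmetic from this diff, for
// checking the test expectations by hand: round both dimensions to a
// multiple of factor, then rescale by beta if the area leaves
// [minPixels, maxPixels].
func smartResize(w, h, factor, minPixels, maxPixels int) (int, int) {
	f := float64(factor)
	width, height := float64(w), float64(h)

	xBar := math.Round(width/f) * f
	yBar := math.Round(height/f) * f

	if xBar*yBar > float64(maxPixels) {
		beta := math.Sqrt(height * width / float64(maxPixels))
		xBar = math.Floor(width/beta/f) * f
		yBar = math.Floor(height/beta/f) * f
	} else if xBar*yBar < float64(minPixels) {
		beta := math.Sqrt(float64(minPixels) / (height * width))
		xBar = math.Ceil(width*beta/f) * f
		yBar = math.Ceil(height*beta/f) * f
	}
	return int(xBar), int(yBar)
}

func main() {
	// 256x256: round(256/28) = 9, 9*28 = 252; 252*252 = 63504 sits inside
	// [56*56, 14*14*4*1280], so no beta scaling: 252x252.
	fmt.Println(smartResize(256, 256, 28, 56*56, 14*14*4*1280))

	// 2000x2000: the rounded area exceeds maxPixels = 1003520, so
	// beta = sqrt(4000000/1003520) ~ 1.9965; floor(2000/1.9965/28) = 35,
	// giving 35*28 = 980 per side: 980x980.
	fmt.Println(smartResize(2000, 2000, 28, 56*56, 14*14*4*1280))
}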
Modified model file:
@@ -1,6 +1,10 @@
 package qwen25vl

 import (
+	"bytes"
+	"fmt"
+	"image"
+
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
@@ -13,7 +17,7 @@ type Model struct {
 	// *VisionModel `gguf:"v,vision"`
 	// *MultiModalProjector `gguf:"mm"`

-	// ImageProcessor
+	ImageProcessor
 }

 // Implement MultimodalProcessor interface
@@ -23,7 +27,7 @@ func New(c ml.Config) (model.Model, error) {
 	m := &Model{
 		TextModel: NewTextModel(c),
 		// VisionModel: newVisionModel(c),
-		// ImageProcessor: newImageProcessor(c),
+		ImageProcessor: newImageProcessor(c),
 		// MultiModalProjector: newMultiModalProjector(c),
 	}

@@ -33,12 +37,102 @@ func New(c ml.Config) (model.Model, error) {
 }

 func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+	// if len(m.VisionModel.Layers) == 0 {
+	// 	return nil, model.ErrNoVisionModel
+	// }
+
+	image, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, err
+	}
+
+	f32s, err := m.ImageProcessor.ProcessImage(image)
+	if err != nil {
+		return nil, err
+	}
+
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.imageSize,
+		m.ImageProcessor.numChannels,
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	fmt.Println("pixelValues", pixelValues)
+
 	return nil, nil
+
+	// visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
+	// visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
+	// return visionOutputs, nil
 }

 // PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
-	return inputs, nil
+	var result []input.Input
+
+	// Get image token IDs from config
+	// imageToken := m.Config.Uint("image_token_id")
+	// visionStartToken := m.Config.Uint("vision_start_token_id")
+	// visionEndToken := m.Config.Uint("vision_end_token_id")
+	imageToken := 151655
+	visionStartToken := 151652
+	visionEndToken := 151653
+
+	// Get merge size from vision config
+	// mergeSize := m.Config.Uint("vision_config.spatial_merge_size")
+	// patchSize := m.Config.Uint("vision_config.spatial_patch_size")
+	// windowSize := m.Config.Uint("vision_config.window_size")
+	mergeSize := 2
+	patchSize := 14
+	windowSize := 112
+
+	// Calculate grid dimensions
+	// The total patches per dimension = window_size / patch_size
+	patchesPerDim := windowSize / patchSize
+	// Grid size after merging = patches per dimension / merge_size
+	gridSize := patchesPerDim / mergeSize
+
+	// Calculate tokens per grid
+	tokensPerGrid := gridSize * gridSize
+
+	for _, inp := range inputs {
+		if inp.Multimodal == nil {
+			// If not a multimodal input, add it to the result unchanged
+			result = append(result, inp)
+		} else if inp.Token == int32(imageToken) {
+			// This is an image token
+			inputMultimodal := inp.Multimodal.(ml.Tensor)
+
+			// Replace the image token with multiple placeholder tokens:
+			// first add the vision start token
+			result = append(result, input.Input{Token: int32(visionStartToken)})
+
+			// Then add the multimodal tensor data at the first position
+			result = append(result,
+				input.Input{
+					Multimodal:     inputMultimodal,
+					MultimodalHash: inp.MultimodalHash,
+				})
+
+			// Then add the placeholder tokens for the remaining positions;
+			// subtract 1 from tokensPerGrid because the first token was already added
+			placeholders := tokensPerGrid - 1
+			for range placeholders {
+				result = append(result, input.Input{Token: int32(imageToken)})
+			}
+
+			// Finally add the vision end token
+			result = append(result, input.Input{Token: int32(visionEndToken)})
+		} else {
+			// For any other token, just pass through
+			result = append(result, inp)
+		}
+	}
+
+	return result, nil
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
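With the hardcoded vision parameters, the expansion per image is fixed: patchesPerDim = 112/14 = 8, gridSize = 8/2 = 4, tokensPerGrid = 4*4 = 16. One image token therefore becomes 18 inputs: a vision start token, 16 grid positions (the first carrying the multimodal tensor, the other 15 as image-token placeholders), and a vision end token. The hardcoded IDs 151652, 151653, and 151655 are Qwen2.5-VL's vision start, vision end, and image pad tokens; the commented-out Config lookups would read the same values from model metadata.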
model/models/qwen25vl/model_test.go (new file, 59 lines)
@@ -0,0 +1,59 @@
+package qwen25vl
+
+import (
+	"testing"
+
+	"github.com/ollama/ollama/ml/backend/ggml"
+	"github.com/ollama/ollama/model/input"
+)
+
+func TestPostTokenize(t *testing.T) {
+	// Set up test inputs
+	model := &Model{}
+	mockHash := uint64(12345678)
+
+	inputs := []input.Input{
+		{Token: 123}, // Regular token
+		{Token: 456}, // Regular token
+		{Token: 151655, Multimodal: &ggml.Tensor{}, MultimodalHash: mockHash}, // Image token
+		{Token: 789}, // Regular token
+	}
+
+	// Run the function being tested
+	result, err := model.PostTokenize(inputs)
+	if err != nil {
+		t.Fatalf("PostTokenize returned error: %v", err)
+	}
+
+	// Verify the actual length first
+	expectedLength := 21
+	if len(result) != expectedLength {
+		t.Fatalf("Result has wrong length: got %d, expected %d", len(result), expectedLength)
+	}
+
+	// Check key positions only
+	checkPositions := map[int]int32{
+		0:  123,    // First regular token
+		1:  456,    // Second regular token
+		2:  151652, // Vision start token
+		4:  151655, // First placeholder token
+		19: 151653, // Vision end token
+		20: 789,    // Final regular token
+	}
+
+	for pos, expectedToken := range checkPositions {
+		if pos >= len(result) {
+			t.Errorf("Position %d is out of bounds (result length: %d)", pos, len(result))
+			continue
+		}
+		if result[pos].Token != expectedToken {
+			t.Errorf("Position %d: expected token %d, got %d", pos, expectedToken, result[pos].Token)
+		}
+	}
+
+	// Check multimodal data is preserved
+	if result[3].MultimodalHash != mockHash {
+		t.Errorf("Multimodal hash not preserved: got %d, expected %d",
+			result[3].MultimodalHash, mockHash)
+	}
+}
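The expected length follows directly from the expansion arithmetic above: the three regular tokens pass through unchanged and the single image token expands to 18 inputs, giving 3 + 18 = 21. Position 3 is the tensor-carrying input (which is why the hash check looks there), and positions 4 through 18 are the image-token placeholders.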
model/models/qwen25vl/process_image.go (new file, 84 lines)
@@ -0,0 +1,84 @@
+package qwen25vl
+
+import (
+	"image"
+	"math"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+type ImageProcessor struct {
+	imageSize, numChannels       int
+	factor, minPixels, maxPixels int
+}
+
+func newImageProcessor(c ml.Config) ImageProcessor {
+	return ImageProcessor{
+		imageSize:   int(c.Uint("vision.image_size")),
+		numChannels: 3, // RGB channels
+		factor:      28,
+		minPixels:   56 * 56,
+		maxPixels:   14 * 14 * 4 * 1280,
+	}
+}
+
+// smartResize calculates the size of the image to resize to based on the
+// factor, minPixels, and maxPixels.
+func (p *ImageProcessor) smartResize(size image.Point) image.Point {
+	// 1. Both dimensions of size are divisible by factor
+	// 2. The area of the image is between minPixels and maxPixels
+	// 3. The aspect ratio of the image is as close to 1:1 as possible
+
+	if size.Y < p.factor || size.X < p.factor {
+		panic("image is too small to resize")
+	} else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
+		panic("aspect ratio must be less than 200:1")
+	}
+
+	f := float64(p.factor)
+	width := float64(size.X)
+	height := float64(size.Y)
+
+	xBar := math.Round(width/f) * f
+	yBar := math.Round(height/f) * f
+
+	if xBar*yBar > float64(p.maxPixels) {
+		beta := math.Sqrt(height * width / float64(p.maxPixels))
+		xBar = math.Floor(width/beta/f) * f
+		yBar = math.Floor(height/beta/f) * f
+	} else if xBar*yBar < float64(p.minPixels) {
+		beta := math.Sqrt(float64(p.minPixels) / (height * width))
+		xBar = math.Ceil(width*beta/f) * f
+		yBar = math.Ceil(height*beta/f) * f
+	}
+
+	return image.Point{int(xBar), int(yBar)}
+}
+
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
+	// Treat the image as transparent PNG-style input if the first pixel's
+	// alpha channel is not fully opaque
+	isPNG := false
+	if _, _, _, a := img.At(0, 0).RGBA(); a < 0xffff {
+		isPNG = true
+	}
+
+	size := p.smartResize(img.Bounds().Max)
+
+	// Composite PNG images to handle transparency
+	if isPNG {
+		img = imageproc.Composite(img)
+	}
+
+	// Resize the image
+	img = imageproc.Resize(img, size, imageproc.ResizeBilinear)
+
+	// Use CLIP normalization values
+	mean := [3]float32{0.48145466, 0.4578275, 0.40821073} // CLIP mean values
+	std := [3]float32{0.26862954, 0.26130258, 0.27577711} // CLIP std values
+
+	// Normalize and get the data
+	data := imageproc.Normalize(img, mean, std, true, true)
+
+	return data, nil
+}
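A minimal usage sketch, not part of the commit: processFile is a hypothetical in-package helper (the type and its fields are unexported), and the literal 224 stands in for the vision.image_size config value that newImageProcessor normally reads.

package qwen25vl

import (
	"fmt"
	"image"
	"os"

	_ "image/jpeg" // register decoders for image.Decode
	_ "image/png"
)

// processFile shows how the processor is driven end to end:
// decode, smart-resize, composite if transparent, then normalize.
func processFile(path string) ([]float32, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	img, _, err := image.Decode(f)
	if err != nil {
		return nil, fmt.Errorf("failed to decode image: %w", err)
	}

	// Same defaults that newImageProcessor hardcodes; imageSize normally
	// comes from the "vision.image_size" config key.
	p := ImageProcessor{
		imageSize:   224,
		numChannels: 3,
		factor:      28,
		minPixels:   56 * 56,
		maxPixels:   14 * 14 * 4 * 1280,
	}
	return p.ProcessImage(img)
}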
Modified tests:
@@ -1,9 +1,8 @@
 package qwen25vl

 import (
-	"bytes"
 	"image"
-	"image/png"
+	_ "image/jpeg" // Register JPEG decoder
 	"testing"
 )

@@ -13,6 +12,15 @@ func TestSmartResize(t *testing.T) {
 		Expected image.Point
 	}

+	// Create an image processor with default values
+	processor := ImageProcessor{
+		imageSize:   224, // Example value
+		numChannels: 3,
+		factor:      28,
+		minPixels:   56 * 56,
+		maxPixels:   14 * 14 * 4 * 1280,
+	}
+
 	cases := []smartResizeCase{
 		{
 			TestImage: image.NewRGBA(image.Rect(0, 0, 1024, 1024)),
@@ -30,38 +38,41 @@ func TestSmartResize(t *testing.T) {

 	for _, c := range cases {
 		b := c.TestImage.Bounds().Max
-		actual := smartResize(b, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
+		actual := processor.smartResize(b)
 		if actual != c.Expected {
 			t.Errorf("expected: %v, actual: %v", c.Expected, actual)
 		}
 	}
 }

-func TestPreprocess(t *testing.T) {
-	type preprocessCase struct {
+func TestProcessImage(t *testing.T) {
+	type processImageCase struct {
 		TestImage   image.Image
 		ExpectedLen int
 	}

-	cases := []preprocessCase{
+	// Create an image processor with default values
+	processor := ImageProcessor{
+		imageSize:   224, // Example value
+		numChannels: 3,
+		factor:      28,
+		minPixels:   56 * 56,
+		maxPixels:   14 * 14 * 4 * 1280,
+	}
+
+	cases := []processImageCase{
 		{
 			TestImage:   image.NewRGBA(image.Rect(0, 0, 256, 256)),
-			ExpectedLen: 252 * 252 * 3 * 1,
+			ExpectedLen: 252 * 252 * 3,
 		},
 		{
 			TestImage:   image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
-			ExpectedLen: 980 * 980 * 3 * 1,
+			ExpectedLen: 980 * 980 * 3,
 		},
 	}

 	for _, c := range cases {
-		var buf bytes.Buffer
-		err := png.Encode(&buf, c.TestImage)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		imgData, _, err := Preprocess(&buf)
+		imgData, err := processor.ProcessImage(c.TestImage)
 		if err != nil {
 			t.Fatalf("error processing: %q", err)
 		}
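For reference, ExpectedLen is width x height x numChannels after smartResize: 256x256 rounds to 252x252 (252*252*3 = 190512 floats), and 2000x2000 scales down to 980x980 under the maxPixels cap of 14*14*4*1280 = 1003520 (980*980*3 = 2881200 floats), consistent with the worked example earlier. The suite should be runnable with go test ./model/models/qwen25vl/.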