For each memory allocation we report the size of the (attempted) allocation and whether it succeeded or failed. The status reporting proved not very useful in practice: systems such as Windows can automatically overflow from VRAM into RAM, resulting in allocations that succeed even when there isn't enough memory where we wanted it. As a result, this information is only used for debug logging, which isn't worth the amount of code involved. It also isn't fully accurate, as multiple allocations may result in partial failures.
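As a rough sketch of the reporting described above (the names here are hypothetical, not ollama's actual types), each attempted allocation would carry its requested size plus a success flag that is only trusted enough to feed a debug log line:

```go
package main

import "log/slog"

// allocStatus is a hypothetical per-allocation record: the requested size and
// whether the allocation appeared to succeed. As noted above, the flag is
// unreliable (the OS may spill VRAM into RAM and still report success), so it
// is only worth a debug log line.
type allocStatus struct {
	size uint64 // bytes requested
	ok   bool   // apparent success; may hide a silent VRAM-to-RAM overflow
}

func main() {
	a := allocStatus{size: 512 << 20, ok: true}
	slog.Debug("allocation attempted", "size", a.size, "ok", a.ok)
}
```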
package llm

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/ml"
	"golang.org/x/sync/semaphore"
)

func TestLLMServerFitGPU(t *testing.T) {
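	// gpu describes one synthetic device: the runtime library it reports
	// and how much free VRAM it has.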
	type gpu struct {
		library string
		free    int
	}

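	// Each case supplies per-layer weight sizes and per-GPU free memory,
	// then checks which layers createLayout places on which GPUs.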
	tests := []struct {
		name        string
		gpus        []gpu
		layers      []int
		numGPU      int
		requireFull bool
		expected    ml.GPULayersList
		expectedErr error
	}{
		{
			name:     "No GPU",
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{},
		},
		{
			name:     "Full single GPU",
			gpus:     []gpu{{free: 256 * format.MebiByte}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2}}},
		},
		{
			name:     "Partial single GPU",
			gpus:     []gpu{{free: 256 * format.MebiByte}},
			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1, 2}}},
		},
		{
			name:     "Single GPU with numGPU 1",
			gpus:     []gpu{{free: 256 * format.MebiByte}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   1,
			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{1}}},
		},
		{
			name:     "Single GPU with numGPU 0",
			gpus:     []gpu{{free: 256 * format.MebiByte}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   0,
			expected: ml.GPULayersList{},
		},
		{
			name:     "Single GPU with numGPU 999",
			gpus:     []gpu{{free: 256 * format.MebiByte}},
			layers:   []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:   999,
			expected: ml.GPULayersList{{ID: "gpu0", Layers: []int{0, 1, 2, 3}}},
		},
		{
			name:     "Multi GPU fits on one",
			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1, 2}}},
		},
		{
			name:     "Multi GPU split",
			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1, 2}}},
		},
		{
			name:     "Multi GPU partial",
			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
		},
		{
			name:     "Multi GPU numGPU 1",
			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   1,
			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{1}}},
		},
		{
			name:     "Multi GPU numGPU 2",
			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
			layers:   []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   2,
			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0}}, {ID: "gpu0", Layers: []int{1}}},
		},
		{
			name:     "Multi GPU numGPU 999",
			gpus:     []gpu{{free: 128 * format.MebiByte}, {free: 256 * format.MebiByte}},
			layers:   []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   999,
			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}, {ID: "gpu0", Layers: []int{2}}},
		},
		{
			name:     "Multi GPU different libraries",
			gpus:     []gpu{{library: "cuda", free: 128 * format.MebiByte}, {library: "rocm", free: 256 * format.MebiByte}},
			layers:   []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte},
			numGPU:   -1,
			expected: ml.GPULayersList{{ID: "gpu1", Layers: []int{0, 1}}},
		},
		{
			name:        "requireFull",
			gpus:        []gpu{{free: 256 * format.MebiByte}},
			layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
			numGPU:      -1,
			requireFull: true,
			expectedErr: ErrLoadRequiredFull,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			var systemInfo discover.SystemInfo
			systemInfo.System.TotalMemory = format.GibiByte
			systemInfo.System.FreeMemory = 512 * format.MebiByte
			systemInfo.System.FreeSwap = 256 * format.MebiByte

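			// Translate the synthetic devices into the GpuInfoList that
			// createLayout consumes.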
			gpus := make(discover.GpuInfoList, len(tt.gpus))
			for i := range tt.gpus {
				gpus[i].ID = fmt.Sprintf("gpu%d", i)
				gpus[i].Library = tt.gpus[i].library
				gpus[i].FreeMemory = uint64(tt.gpus[i].free)
			}

			s := &ollamaServer{
				llmServer: llmServer{
					totalLayers: uint64(len(tt.layers)),
					options: api.Options{
						Runner: api.Runner{
							NumGPU: tt.numGPU,
						},
					},
				},
			}

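			// All layer sizes start on the CPU; each GPU gets empty
			// per-layer weight and cache slots of the same length.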
			s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
				Weights: make([]uint64, s.totalLayers),
				Cache:   make([]uint64, s.totalLayers),
			}, GPUs: make([]ml.DeviceMemory, len(gpus))}

			for i := range tt.layers {
				s.mem.CPU.Weights[i] = uint64(tt.layers[i])
			}

			for i := range s.mem.GPUs {
				s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
				s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
				s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
			}

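			// createLayout assigns layers to GPUs given the system and
			// per-device memory reports.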
			gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
			if err != tt.expectedErr {
				t.Fatalf("fitGPU returned error: %v", err)
			}
			if gpuLayers.Hash() != tt.expected.Hash() {
				t.Errorf("fitGPU assigned %v, want %v", gpuLayers, tt.expected)
			}
		})
	}
}

func TestLLMServerCompletionFormat(t *testing.T) {
	// This test was written to fix an already deployed issue. It is a bit
	// of a mess, but it's good enough until we can refactor the
	// Completion method to be more testable.

	ctx, cancel := context.WithCancel(t.Context())
	s := &llmServer{
		sem: semaphore.NewWeighted(1), // required to prevent nil panic
	}

	checkInvalid := func(format string) {
		t.Helper()
		err := s.Completion(ctx, CompletionRequest{
			Options: new(api.Options),
			Format:  []byte(format),
		}, nil)

		want := fmt.Sprintf("invalid format: %q; expected \"json\" or a valid JSON Schema", format)
		if err == nil || !strings.Contains(err.Error(), want) {
			t.Fatalf("err = %v; want %q", err, want)
		}
	}

checkInvalid("X") // invalid format
|
|
checkInvalid(`"X"`) // invalid JSON Schema
|
|
|
|
cancel() // prevent further processing if request makes it past the format check
|
|
|
|
	checkValid := func(err error) {
		t.Helper()
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("Completion: err = %v; expected context.Canceled", err)
		}
	}

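	// Formats that must pass validation. The context is already cancelled,
	// so a valid format surfaces as context.Canceled rather than a format
	// error.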
	valids := []string{
		// "missing"
		``,
		`""`,
		`null`,

		// JSON
		`"json"`,
		`{"type":"object"}`,
	}
	for _, valid := range valids {
		err := s.Completion(ctx, CompletionRequest{
			Options: new(api.Options),
			Format:  []byte(valid),
		}, nil)
		checkValid(err)
	}

	err := s.Completion(ctx, CompletionRequest{
		Options: new(api.Options),
		Format:  nil, // missing format
	}, nil)
	checkValid(err)
}