Mirror of https://github.com/ollama/ollama.git, synced 2025-04-14 22:59:22 +02:00
This provides integration with the new Ollama engine (5824541 next ollama runner (#7913)) and the rest of the Ollama infrastructure, such as the runner and the Ollama server. It also builds out the KV cache infrastructure to support the requirements of how Ollama runs models, including:

- Parallel processing
- Memory management for defragmentation and shifting
- Multi-modal models

Both the old and new engines continue to be supported. By default, only the old engine is used. To enable the new engine:

Start the server with the OLLAMA_NEW_ENGINE environment variable set:

    OLLAMA_NEW_ENGINE=1 ./ollama serve

Start a model that is supported by the Ollama engine. This one is Llama 3.1 8b Q4_K_M:

    ./ollama run jessegross/llama3.1
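As context for how the flag is wired up: OLLAMA_NEW_ENGINE is read through the Bool helper defined in the file below, so server code can branch on it roughly like this (an illustrative sketch only; the actual engine dispatch lives in the runner and server code, not in this file):

    if envconfig.NewEngine() {
        // hand the request off to the new Ollama engine
    } else {
        // fall back to the default (old) engine path
    }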
294 lines · 10 KiB · Go
package envconfig

import (
    "fmt"
    "log/slog"
    "math"
    "net"
    "net/url"
    "os"
    "path/filepath"
    "runtime"
    "strconv"
    "strings"
    "time"
)

// Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
// Default is scheme "http" and host "127.0.0.1:11434"
func Host() *url.URL {
    defaultPort := "11434"

    s := strings.TrimSpace(Var("OLLAMA_HOST"))
    scheme, hostport, ok := strings.Cut(s, "://")
    switch {
    case !ok:
        scheme, hostport = "http", s
    case scheme == "http":
        defaultPort = "80"
    case scheme == "https":
        defaultPort = "443"
    }

    hostport, path, _ := strings.Cut(hostport, "/")
    host, port, err := net.SplitHostPort(hostport)
    if err != nil {
        host, port = "127.0.0.1", defaultPort
        if ip := net.ParseIP(strings.Trim(hostport, "[]")); ip != nil {
            host = ip.String()
        } else if hostport != "" {
            host = hostport
        }
    }

    if n, err := strconv.ParseInt(port, 10, 32); err != nil || n > 65535 || n < 0 {
        slog.Warn("invalid port, using default", "port", port, "default", defaultPort)
        port = defaultPort
    }

    return &url.URL{
        Scheme: scheme,
        Host:   net.JoinHostPort(host, port),
        Path:   path,
    }
}
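
// Illustrative OLLAMA_HOST parses, derived from the rules above (a sketch,
// not an exhaustive list):
//
//	OLLAMA_HOST=                    -> http://127.0.0.1:11434
//	OLLAMA_HOST=0.0.0.0             -> http://0.0.0.0:11434
//	OLLAMA_HOST=example.com:8080    -> http://example.com:8080
//	OLLAMA_HOST=https://example.com -> https://example.com:443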

// Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
func Origins() (origins []string) {
    if s := Var("OLLAMA_ORIGINS"); s != "" {
        origins = strings.Split(s, ",")
    }

    for _, origin := range []string{"localhost", "127.0.0.1", "0.0.0.0"} {
        origins = append(origins,
            fmt.Sprintf("http://%s", origin),
            fmt.Sprintf("https://%s", origin),
            fmt.Sprintf("http://%s", net.JoinHostPort(origin, "*")),
            fmt.Sprintf("https://%s", net.JoinHostPort(origin, "*")),
        )
    }

    origins = append(origins,
        "app://*",
        "file://*",
        "tauri://*",
        "vscode-webview://*",
    )

    return origins
}
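
// With OLLAMA_ORIGINS unset, the resulting allowlist is (sketch): the four
// variants http://X, https://X, http://X:*, and https://X:* for each of
// localhost, 127.0.0.1, and 0.0.0.0, plus app://*, file://*, tauri://*, and
// vscode-webview://*. Values from OLLAMA_ORIGINS come first in the list; the
// defaults are always appended, never replaced.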

// Models returns the path to the models directory. Models directory can be configured via the OLLAMA_MODELS environment variable.
// Default is $HOME/.ollama/models
func Models() string {
    if s := Var("OLLAMA_MODELS"); s != "" {
        return s
    }

    home, err := os.UserHomeDir()
    if err != nil {
        panic(err)
    }

    return filepath.Join(home, ".ollama", "models")
}

// KeepAlive returns the duration that models stay loaded in memory. KeepAlive can be configured via the OLLAMA_KEEP_ALIVE environment variable.
// Negative values are treated as infinite. Zero is treated as no keep alive.
// Default is 5 minutes.
func KeepAlive() (keepAlive time.Duration) {
    keepAlive = 5 * time.Minute
    if s := Var("OLLAMA_KEEP_ALIVE"); s != "" {
        if d, err := time.ParseDuration(s); err == nil {
            keepAlive = d
        } else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
            keepAlive = time.Duration(n) * time.Second
        }
    }

    if keepAlive < 0 {
        return time.Duration(math.MaxInt64)
    }

    return keepAlive
}
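
// Illustrative OLLAMA_KEEP_ALIVE parses (a sketch of the rules above):
//
//	OLLAMA_KEEP_ALIVE=10m  -> 10 minutes (parsed by time.ParseDuration)
//	OLLAMA_KEEP_ALIVE=3600 -> 3600 seconds (bare integers are seconds)
//	OLLAMA_KEEP_ALIVE=-1   -> infinite (negative values pin to math.MaxInt64)
//	OLLAMA_KEEP_ALIVE=0    -> no keep alive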

// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
// Zero or negative values are treated as infinite.
// Default is 5 minutes.
func LoadTimeout() (loadTimeout time.Duration) {
    loadTimeout = 5 * time.Minute
    if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
        if d, err := time.ParseDuration(s); err == nil {
            loadTimeout = d
        } else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
            loadTimeout = time.Duration(n) * time.Second
        }
    }

    if loadTimeout <= 0 {
        return time.Duration(math.MaxInt64)
    }

    return loadTimeout
}

// Bool returns a function that reports whether the environment variable k is
// set to a truthy value. Any non-empty value that strconv.ParseBool cannot
// parse is treated as true; unset or empty means false.
func Bool(k string) func() bool {
    return func() bool {
        if s := Var(k); s != "" {
            b, err := strconv.ParseBool(s)
            if err != nil {
                return true
            }

            return b
        }

        return false
    }
}
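
// Resulting truth table for a Bool-backed flag (illustrative):
//
//	OLLAMA_DEBUG=1    -> true
//	OLLAMA_DEBUG=true -> true
//	OLLAMA_DEBUG=on   -> true (non-empty but unparseable counts as true)
//	OLLAMA_DEBUG=0    -> false
//	unset or empty    -> false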

var (
    // Debug enables additional debug information.
    Debug = Bool("OLLAMA_DEBUG")
    // FlashAttention enables the experimental flash attention feature.
    FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
    // KvCacheType is the quantization type for the K/V cache.
    KvCacheType = String("OLLAMA_KV_CACHE_TYPE")
    // NoHistory disables readline history.
    NoHistory = Bool("OLLAMA_NOHISTORY")
    // NoPrune disables pruning of model blobs on startup.
    NoPrune = Bool("OLLAMA_NOPRUNE")
    // SchedSpread allows scheduling models across all GPUs.
    SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
    // IntelGPU enables experimental Intel GPU detection.
    IntelGPU = Bool("OLLAMA_INTEL_GPU")
    // MultiUserCache optimizes prompt caching for multi-user scenarios.
    MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
    // NewEngine enables the new Ollama engine.
    NewEngine = Bool("OLLAMA_NEW_ENGINE")
)

// String returns a function that reads the environment variable s via Var.
func String(s string) func() string {
    return func() string {
        return Var(s)
    }
}

var (
    LLMLibrary = String("OLLAMA_LLM_LIBRARY")

    CudaVisibleDevices    = String("CUDA_VISIBLE_DEVICES")
    HipVisibleDevices     = String("HIP_VISIBLE_DEVICES")
    RocrVisibleDevices    = String("ROCR_VISIBLE_DEVICES")
    GpuDeviceOrdinal      = String("GPU_DEVICE_ORDINAL")
    HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
)

// Uint returns a function that parses the environment variable key as an
// unsigned integer, logging a warning and falling back to defaultValue when
// the value is missing or invalid.
func Uint(key string, defaultValue uint) func() uint {
    return func() uint {
        if s := Var(key); s != "" {
            if n, err := strconv.ParseUint(s, 10, 64); err != nil {
                slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
            } else {
                return uint(n)
            }
        }

        return defaultValue
    }
}

var (
    // NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
    NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
    // MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
    MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
    // MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
    MaxQueue = Uint("OLLAMA_MAX_QUEUE", 512)
    // MaxVRAM sets a maximum VRAM override in bytes. MaxVRAM can be configured via the OLLAMA_MAX_VRAM environment variable.
    MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
)
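
// For example (illustrative): OLLAMA_MAX_QUEUE=1024 yields 1024, while an
// unparseable value such as OLLAMA_MAX_QUEUE=many logs a warning and falls
// back to the default of 512. A zero default (NumParallel, MaxRunners,
// MaxVRAM) is presumably treated as "unset" by callers; that handling lives
// outside this file.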

// Uint64 returns a function that parses the environment variable key as a
// uint64, logging a warning and falling back to defaultValue when the value
// is missing or invalid.
func Uint64(key string, defaultValue uint64) func() uint64 {
    return func() uint64 {
        if s := Var(key); s != "" {
            if n, err := strconv.ParseUint(s, 10, 64); err != nil {
                slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
            } else {
                return n
            }
        }

        return defaultValue
    }
}

// GpuOverhead sets aside VRAM per GPU, in bytes.
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)

// EnvVar pairs an environment variable's name and current value with a
// human-readable description.
type EnvVar struct {
    Name        string
    Value       any
    Description string
}

// AsMap returns every configuration variable, keyed by name, with its
// current value and description.
func AsMap() map[string]EnvVar {
    ret := map[string]EnvVar{
        "OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
        "OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enable flash attention"},
        "OLLAMA_KV_CACHE_TYPE":     {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"},
        "OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
        "OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
        "OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
        "OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
        "OLLAMA_LOAD_TIMEOUT":      {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
        "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
        "OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
        "OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
        "OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
        "OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
        "OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
        "OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
        "OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
        "OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
        "OLLAMA_NEW_ENGINE":        {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},

        // Informational
        "HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
        "HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
        "NO_PROXY":    {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
    }

    if runtime.GOOS != "windows" {
        // Windows environment variables are case-insensitive so there's no need to duplicate them
        ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
        ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
        ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
    }

    if runtime.GOOS != "darwin" {
        ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
        ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible by numeric ID"}
        ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible by UUID or numeric ID"}
        ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"}
        ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
        ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
    }

    return ret
}

// Values returns the current configuration as a map of formatted strings,
// suitable for logging.
func Values() map[string]string {
    vals := make(map[string]string)
    for k, v := range AsMap() {
        vals[k] = fmt.Sprintf("%v", v.Value)
    }
    return vals
}
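
// A minimal usage sketch (assuming this package is imported as envconfig;
// the actual server logging lives elsewhere):
//
//	slog.Info("server config", "env", envconfig.Values())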

// Var returns an environment variable stripped of leading and trailing quotes or spaces
func Var(key string) string {
    return strings.Trim(strings.TrimSpace(os.Getenv(key)), "\"'")
}
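
// For example (illustrative): OLLAMA_MODELS=" '/data/models' " resolves to
// /data/models: surrounding whitespace is trimmed first, then any leading or
// trailing single or double quote characters.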