mirror of https://github.com/ollama/ollama.git (synced 2025-09-25 18:10:50 +02:00)
llm: introduce k/v context quantization (vRAM improvements) (#6279)
@@ -140,7 +140,7 @@ type ContextParams struct {
 	c C.struct_llama_context_params
 }
 
-func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool) ContextParams {
+func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, flashAttention bool, kvCacheType string) ContextParams {
 	params := C.llama_context_default_params()
 	params.n_ctx = C.uint(numCtx)
 	params.n_batch = C.uint(batchSize)
@@ -149,9 +149,28 @@ func NewContextParams(numCtx int, batchSize int, numSeqMax int, threads int, fla
 	params.n_threads_batch = params.n_threads
 	params.embeddings = C.bool(true)
 	params.flash_attn = C.bool(flashAttention)
+	params.type_k = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
+	params.type_v = kvCacheTypeFromStr(strings.ToLower(kvCacheType))
+
 	return ContextParams{c: params}
 }
 
+// kvCacheTypeFromStr converts a string cache type to the corresponding GGML type value
+func kvCacheTypeFromStr(s string) C.enum_ggml_type {
+	if s == "" {
+		return C.GGML_TYPE_F16
+	}
+
+	switch s {
+	case "q8_0":
+		return C.GGML_TYPE_Q8_0
+	case "q4_0":
+		return C.GGML_TYPE_Q4_0
+	default:
+		return C.GGML_TYPE_F16
+	}
+}
+
 type Context struct {
 	c *C.struct_llama_context
 	numThreads int
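To make the VRAM effect of the new cache types concrete, here is a minimal standalone Go sketch (not part of the diff) that mirrors the kvCacheTypeFromStr mapping above and estimates KV cache size for a hypothetical model shape. The per-element costs are derived from GGML's block formats (q8_0 packs 32 values into 34 bytes, q4_0 packs 32 values into 18 bytes); the layer/head/context dimensions are made up for illustration.

    package main

    import (
        "fmt"
        "strings"
    )

    // bytesPerElement returns the approximate storage cost of one cached
    // K or V element for a given cache type string, falling back to f16
    // for empty or unrecognized values, matching kvCacheTypeFromStr.
    func bytesPerElement(kvCacheType string) float64 {
        switch strings.ToLower(kvCacheType) {
        case "q8_0":
            return 34.0 / 32.0 // ~1.06 bytes per value
        case "q4_0":
            return 18.0 / 32.0 // ~0.56 bytes per value
        default:
            return 2.0 // f16
        }
    }

    func main() {
        // Hypothetical model shape: 32 layers, 8 KV heads, head dim 128,
        // 8192-token context. K and V are each cached, hence the factor 2.
        const layers, kvHeads, headDim, ctx = 32, 8, 128, 8192
        elems := float64(2 * layers * kvHeads * headDim * ctx)
        for _, t := range []string{"f16", "q8_0", "q4_0"} {
            fmt.Printf("%-5s ~%.2f GiB\n", t, elems*bytesPerElement(t)/(1<<30))
        }
    }

For this shape the sketch prints roughly 1.00 GiB for f16, 0.53 GiB for q8_0, and 0.28 GiB for q4_0, which is the "vRAM improvements" headline of the commit.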
@@ -850,6 +850,7 @@ func (s *Server) loadModel(
 	lpath multiLPath,
 	ppath string,
 	kvSize int,
+	kvCacheType string,
 	flashAttention bool,
 	threads int,
 	multiUserCache bool,
@@ -862,7 +863,7 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention)
+	ctxParams := llama.NewContextParams(kvSize, s.batchSize*s.parallel, s.parallel, threads, flashAttention, kvCacheType)
 	s.lc, err = llama.NewContextWithModel(s.model, ctxParams)
 	if err != nil {
 		panic(err)
@@ -903,6 +904,7 @@ func main() {
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
+	kvCacheType := flag.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
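With the flag wired in, the runner can be started with a quantized cache. A hypothetical invocation (the binary name and model path are placeholders, and --model is assumed to be the model-path flag defined elsewhere in main; --ctx-size, --flash-attn, --kv-cache-type, and --port are the flags shown above):

    ./runner --model /path/to/model.gguf --ctx-size 8192 --flash-attn --kv-cache-type q8_0 --port 8080

An empty or unrecognized --kv-cache-type falls back to f16 via kvCacheTypeFromStr. Note that in llama.cpp a quantized V cache generally requires flash attention to be enabled, hence --flash-attn in the example.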
@@ -970,7 +972,7 @@ func main() {
 	}
 
 	server.ready.Add(1)
-	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
+	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *kvCacheType, *flashAttention, *threads, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)