allocate a large enough kv cache for all parallel requests (#4162)

This commit is contained in:
Jeffrey Morgan
2024-05-05 15:59:32 -07:00
committed by GitHub
parent 06164911dd
commit 942c979232

View File

@@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
// context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
+// allocate a large enough kv cache for all parallel requests
+opts.NumCtx = opts.NumCtx * numParallel
req := &LlmRequest{
ctx: c,
model: model,
@@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
successCh: make(chan *runnerRef),
errCh: make(chan error, 1),
}
-// context split across parallel threads
-opts.NumCtx = opts.NumCtx * numParallel
select {
case s.pendingReqCh <- req:
default: