update memory calculations

count each layer independently when deciding gpu offloading
This commit is contained in:
Michael Yang
2024-03-18 10:45:22 +01:00
parent d338d70492
commit 91b3e4d282
7 changed files with 125 additions and 89 deletions
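The hunks below only touch the server routes; the per-layer accounting named in the commit message ("count each layer independently when deciding gpu offloading") lives elsewhere in the 7 changed files. As a rough, hypothetical sketch of that idea, sizing each layer on its own and offloading as many whole layers as fit in free VRAM (layerSizes and freeVRAM are illustrative names, not code from this commit):

// countOffloadLayers reports how many whole layers fit on the GPU when each
// layer's memory footprint is counted independently. Hypothetical sketch, not
// the code changed in this commit.
func countOffloadLayers(layerSizes []uint64, freeVRAM uint64) int {
	var used uint64
	n := 0
	for _, size := range layerSizes {
		if used+size > freeVRAM {
			break
		}
		used += size
		n++
	}
	return n
}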

@@ -68,7 +68,7 @@ var loaded struct {
var defaultSessionDuration = 5 * time.Minute
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
-func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
+func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
needLoad := loaded.runner == nil || // is there a model loaded?
loaded.ModelPath != model.ModelPath || // has the base model changed?
!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
@@ -97,7 +97,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
loaded.Model = model
loaded.runner = llmRunner
-loaded.Options = &opts
+loaded.Options = opts
}
loaded.expireAt = time.Now().Add(sessionDuration)
@@ -214,7 +214,7 @@ func GenerateHandler(c *gin.Context) {
sessionDuration = req.KeepAlive.Duration
}
-if err := load(c, model, opts, sessionDuration); err != nil {
+if err := load(c, model, &opts, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -460,7 +460,7 @@ func EmbeddingsHandler(c *gin.Context) {
sessionDuration = req.KeepAlive.Duration
}
-if err := load(c, model, opts, sessionDuration); err != nil {
+if err := load(c, model, &opts, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
@@ -1267,7 +1267,7 @@ func ChatHandler(c *gin.Context) {
sessionDuration = req.KeepAlive.Duration
}
-if err := load(c, model, opts, sessionDuration); err != nil {
+if err := load(c, model, &opts, sessionDuration); err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
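The hunks above are the mechanical side of the change: load now takes *api.Options, the handlers pass &opts, and loaded.Options stores the caller's options directly instead of the address of a by-value copy. A reduced sketch of the difference between the two, using a stand-in Options type rather than the real api.Options:

package main

import "fmt"

// Options stands in for api.Options in this sketch.
type Options struct{ NumGPU int }

var cached *Options

// loadByValue receives a copy, so &opts points at that copy, not the caller's value.
func loadByValue(opts Options) { cached = &opts }

// loadByPointer stores the caller's value; later changes are visible through the cache.
func loadByPointer(opts *Options) { cached = opts }

func main() {
	opts := Options{NumGPU: 1}

	loadByValue(opts)
	opts.NumGPU = 99
	fmt.Println(cached.NumGPU) // 1: the cache kept a stale copy

	loadByPointer(&opts)
	opts.NumGPU = 2
	fmt.Println(cached.NumGPU) // 2: cache and caller share one value
}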