mirror of
https://github.com/ollama/ollama.git
synced 2025-11-11 11:37:23 +01:00
llm: speed up gguf decoding by a lot (#5246)
Previously, some costly things were causing the loading of GGUF files
and their metadata and tensor information to be VERY slow:
* Too many allocations when decoding strings
* Hitting disk for each read of each key and value, resulting in a
not-okay amount of syscalls/disk I/O.
The show API is now down to 33ms from 800ms+ for llama3 on a macbook pro
m3.
This commit also prevents collecting large arrays of values when
decoding GGUFs (if desired). When such keys are encountered, their
values are null, and are encoded as such in JSON.
Also, this fixes a broken test that was not encoding valid GGUF.
This commit is contained in:
@@ -754,7 +754,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
|
||||
}
|
||||
|
||||
func getKVData(digest string, verbose bool) (llm.KV, error) {
|
||||
kvData, err := llm.LoadModel(digest)
|
||||
maxArraySize := 0
|
||||
if verbose {
|
||||
maxArraySize = -1
|
||||
}
|
||||
kvData, err := llm.LoadModel(digest, maxArraySize)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -1101,11 +1105,20 @@ func Serve(ln net.Listener) error {
|
||||
schedCtx, schedDone := context.WithCancel(ctx)
|
||||
sched := InitScheduler(schedCtx)
|
||||
s := &Server{addr: ln.Addr(), sched: sched}
|
||||
r := s.GenerateRoutes()
|
||||
|
||||
http.Handle("/", s.GenerateRoutes())
|
||||
|
||||
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
|
||||
srvr := &http.Server{
|
||||
Handler: r,
|
||||
// Use http.DefaultServeMux so we get net/http/pprof for
|
||||
// free.
|
||||
//
|
||||
// TODO(bmizerany): Decide if we want to make this
|
||||
// configurable so it is not exposed by default, or allow
|
||||
// users to bind it to a different port. This was a quick
|
||||
// and easy way to get pprof, but it may not be the best
|
||||
// way.
|
||||
Handler: nil,
|
||||
}
|
||||
|
||||
// listen for a ctrl+c and stop any loaded llm
|
||||
|
||||
Reference in New Issue
Block a user