mirror of
https://github.com/ollama/ollama.git
synced 2025-12-08 03:22:05 +01:00
Request and model concurrency
This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS.
This commit is contained in:
@@ -8,9 +8,11 @@ void cpu_check_ram(mem_info_t *resp) {
|
||||
MEMORYSTATUSEX info;
|
||||
info.dwLength = sizeof(info);
|
||||
if (GlobalMemoryStatusEx(&info) != 0) {
|
||||
resp->count = 1;
|
||||
resp->total = info.ullTotalPhys;
|
||||
resp->free = info.ullAvailPhys;
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
|
||||
} else {
|
||||
resp->err = LOAD_ERR();
|
||||
}
|
||||
@@ -27,9 +29,11 @@ void cpu_check_ram(mem_info_t *resp) {
|
||||
if (sysinfo(&info) != 0) {
|
||||
resp->err = strdup(strerror(errno));
|
||||
} else {
|
||||
resp->count = 1;
|
||||
resp->total = info.totalram * info.mem_unit;
|
||||
resp->free = info.freeram * info.mem_unit;
|
||||
resp->major = 0;
|
||||
resp->minor = 0;
|
||||
snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user