diff --git a/discover/runner.go b/discover/runner.go index 9ae5b3ffd4..65a542e2fd 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -27,7 +27,6 @@ var ( deviceMu sync.Mutex devices []ml.DeviceInfo libDirs map[string]struct{} - rocmDir string exe string bootstrapped bool ) @@ -61,14 +60,6 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. libDirs[filepath.Dir(file)] = struct{}{} } - // Our current packaging model places ggml-hip in the main directory - // but keeps rocm in an isolated directory. We have to add it to - // the [LD_LIBRARY_]PATH so ggml-hip will load properly - rocmDir = filepath.Join(ml.LibOllamaPath, "rocm") - if _, err := os.Stat(rocmDir); err != nil { - rocmDir = "" - } - if len(libDirs) == 0 { libDirs[""] = struct{}{} } @@ -82,9 +73,20 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. // are enumerated, but not actually supported. // We run this in serial to avoid potentially initializing a GPU multiple // times concurrently leading to memory contention - // TODO refactor so we group the lib dirs and do serial per version, but parallel for different libs for dir := range libDirs { + // Typically bootstrapping takes < 1s, but on some systems, with devices + // in low power/idle mode, initialization can take multiple seconds. We + // set a longer timeout just for bootstrap discovery to reduce the chance + // of giving up too quickly bootstrapTimeout := 30 * time.Second + if runtime.GOOS == "windows" { + // On Windows with Defender enabled, AV scanning of the DLLs + // takes place sequentially and this can significantly increase + // the time it takes to do the initial discovery pass. 
+ // Subsequent loads will be faster as the scan results are + // cached + bootstrapTimeout = 90 * time.Second + } var dirs []string if dir != "" { if requested != "" && filepath.Base(dir) != requested { @@ -93,21 +95,11 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. } else if jetpack != "" && filepath.Base(dir) != "cuda_"+jetpack { continue } - } - if dir == "" { - dirs = []string{ml.LibOllamaPath} - } else { dirs = []string{ml.LibOllamaPath, dir} + } else { + dirs = []string{ml.LibOllamaPath} } - // ROCm can take a long time on some systems, so give it more time before giving up - if dir != "" && strings.Contains(filepath.Base(dir), "rocm") { - bootstrapTimeout = 60 * time.Second - } - // Typically bootstrapping takes < 1s, but on some systems, with devices - // in low power/idle mode, initialization can take multiple seconds. We - // set a long timeout just for bootstrap discovery to reduce the chance - // of giving up too quickly ctx1stPass, cancel := context.WithTimeout(ctx, bootstrapTimeout) defer cancel() @@ -117,6 +109,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. // In the second pass, we more deeply initialize the GPUs to weed out devices that // aren't supported by a given library. We run this phase in parallel to speed up discovery. + // Only devices that need verification are included in this pass slog.Debug("evluating which if any devices to filter out", "initial_count", len(devices)) ctx2ndPass, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() @@ -125,35 +118,16 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. 
supportedMu := sync.Mutex{} supported := make(map[string]map[string]map[string]int) // [Library][libDir][ID] = pre-deletion devices index for i := range devices { - libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1] - if devices[i].Library == "Metal" { + if !devices[i].NeedsInitValidation() { continue } - slog.Debug("verifying GPU is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID) + libDir := devices[i].LibraryPath[len(devices[i].LibraryPath)-1] + slog.Debug("verifying device is supported", "library", libDir, "description", devices[i].Description, "compute", devices[i].Compute(), "id", devices[i].ID, "pci_id", devices[i].PCIID) wg.Add(1) go func(i int) { defer wg.Done() - var envVar string - id := devices[i].ID - if devices[i].Library == "ROCm" { - if runtime.GOOS != "linux" { - envVar = "HIP_VISIBLE_DEVICES" - } else { - envVar = "ROCR_VISIBLE_DEVICES" - } - } else if devices[i].Library == "CUDA" { - envVar = "CUDA_VISIBLE_DEVICES" - } else if devices[i].Library == "Vulkan" { - id = devices[i].FilteredID - envVar = "GGML_VK_VISIBLE_DEVICES" - } else { - slog.Error("Unknown Library:" + devices[i].Library) - } - - extraEnvs := map[string]string{ - "GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs - envVar: id, // Filter to just this one GPU - } + extraEnvs := ml.GetVisibleDevicesEnv(devices[i : i+1]) + devices[i].AddInitValidation(extraEnvs) if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 { slog.Debug("filtering device which didn't fully initialize", "id", devices[i].ID, @@ -178,26 +152,28 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. 
wg.Wait() logutil.Trace("supported GPU library combinations before filtering", "supported", supported) - filterOutVulkanThatAreSupportedByOtherGPU(needsDelete) - // Mark for deletion any overlaps - favoring the library version that can cover all GPUs if possible filterOverlapByLibrary(supported, needsDelete) - // TODO if we ever support multiple ROCm library versions this algorithm will need to be adjusted to keep the rocmID numeric value correct - rocmID := 0 + // Any Libraries that utilize numeric IDs need adjusting based on any possible filtering taking place + postFilteredID := map[string]int{} for i := 0; i < len(needsDelete); i++ { if needsDelete[i] { logutil.Trace("removing unsupported or overlapping GPU combination", "libDir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], "description", devices[i].Description, "compute", devices[i].Compute(), "pci_id", devices[i].PCIID) devices = append(devices[:i], devices[i+1:]...) needsDelete = append(needsDelete[:i], needsDelete[i+1:]...) i-- - } else if devices[i].Library == "ROCm" { + } else { + if _, ok := postFilteredID[devices[i].Library]; !ok { + postFilteredID[devices[i].Library] = 0 + } if _, err := strconv.Atoi(devices[i].ID); err == nil { // Replace the numeric ID with the post-filtered IDs - devices[i].FilteredID = devices[i].ID - devices[i].ID = strconv.Itoa(rocmID) + slog.Debug("adjusting filtering IDs", "FilterID", devices[i].ID, "new_ID", strconv.Itoa(postFilteredID[devices[i].Library])) + devices[i].FilterID = devices[i].ID + devices[i].ID = strconv.Itoa(postFilteredID[devices[i].Library]) } - rocmID++ + postFilteredID[devices[i].Library]++ } } @@ -214,7 +190,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. 
case ml.DuplicateDevice: // Different library, choose based on priority var droppedDevice ml.DeviceInfo - if devices[i].Library == "CUDA" || devices[i].Library == "ROCm" { + if devices[i].PreferredLibrary(devices[j]) { droppedDevice = devices[j] } else { droppedDevice = devices[i] @@ -363,38 +339,6 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. return devices } -func filterOutVulkanThatAreSupportedByOtherGPU(needsDelete []bool) { - // Filter out Vulkan devices that share a PCI ID with a non-Vulkan device that is not marked for deletion - for i := range devices { - if devices[i].Library != "Vulkan" || needsDelete[i] { - continue - } - if devices[i].PCIID == "" { - continue - } - for j := range devices { - if i == j { - continue - } - if devices[j].PCIID == "" { - continue - } - if devices[j].PCIID == devices[i].PCIID && devices[j].Library != "Vulkan" && !needsDelete[j] { - needsDelete[i] = true - slog.Debug("filtering device with duplicate PCI ID", - "id", devices[i].ID, - "library", devices[i].Library, - "libdir", devices[i].LibraryPath[len(devices[i].LibraryPath)-1], - "pci_id", devices[i].PCIID, - "kept_id", devices[j].ID, - "kept_library", devices[j].Library, - ) - break - } - } - } -} - func filterOverlapByLibrary(supported map[string]map[string]map[string]int, needsDelete []bool) { // For multi-GPU systems, use the newest version that supports all the GPUs for _, byLibDirs := range supported { diff --git a/discover/types.go b/discover/types.go index b1f622f4c6..efc69ecfd9 100644 --- a/discover/types.go +++ b/discover/types.go @@ -41,7 +41,7 @@ func LogDetails(devices []ml.DeviceInfo) { } slog.Info("inference compute", "id", dev.ID, - "filtered_id", dev.FilteredID, + "filter_id", dev.FilterID, "library", dev.Library, "compute", dev.Compute(), "name", dev.Name, diff --git a/llama/patches/0026-GPU-discovery-enhancements.patch b/llama/patches/0026-GPU-discovery-enhancements.patch index 807a468901..e5e68f3188 100644 --- 
a/llama/patches/0026-GPU-discovery-enhancements.patch +++ b/llama/patches/0026-GPU-discovery-enhancements.patch @@ -14,24 +14,24 @@ Vulkan PCI and Memory fix vulkan PCI ID and ID handling --- - ggml/include/ggml-backend.h | 8 + + ggml/include/ggml-backend.h | 6 + ggml/src/CMakeLists.txt | 2 + ggml/src/ggml-cuda/ggml-cuda.cu | 65 ++++ ggml/src/ggml-cuda/vendors/hip.h | 3 + ggml/src/ggml-impl.h | 8 + ggml/src/ggml-metal/ggml-metal.cpp | 2 + - ggml/src/ggml-vulkan/ggml-vulkan.cpp | 212 +++++++++++-- + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 209 +++++++++++-- ggml/src/mem_hip.cpp | 452 +++++++++++++++++++++++++++ ggml/src/mem_nvml.cpp | 209 +++++++++++++ - 9 files changed, 931 insertions(+), 30 deletions(-) + 9 files changed, 926 insertions(+), 30 deletions(-) create mode 100644 ggml/src/mem_hip.cpp create mode 100644 ggml/src/mem_nvml.cpp diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index ba181d09d..809835243 100644 +index 69223c488..6510e0cba 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h -@@ -169,6 +169,14 @@ extern "C" { +@@ -169,6 +169,12 @@ extern "C" { const char * device_id; // device capabilities struct ggml_backend_dev_caps caps; @@ -41,8 +41,6 @@ index ba181d09d..809835243 100644 + int compute_minor; + int integrated; + const char *library; -+ // number with which the devices are accessed (Vulkan) -+ const char *numeric_id; }; GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); @@ -60,7 +58,7 @@ index 0609c6503..aefe43bdd 100644 target_include_directories(ggml-base PRIVATE .) 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 87c6c34a4..b075a18be 100644 +index 5787e8cd5..d232bf828 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -261,6 +261,16 @@ static ggml_cuda_device_info ggml_cuda_init() { @@ -92,7 +90,7 @@ index 87c6c34a4..b075a18be 100644 GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no", ggml_cuda_parse_uuid(prop, id).c_str()); -@@ -3484,6 +3499,11 @@ struct ggml_backend_cuda_device_context { +@@ -3476,6 +3491,11 @@ struct ggml_backend_cuda_device_context { std::string description; std::string pci_bus_id; std::string id; @@ -104,7 +102,7 @@ index 87c6c34a4..b075a18be 100644 }; static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { -@@ -3504,6 +3524,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { +@@ -3496,6 +3516,28 @@ static const char * ggml_backend_cuda_device_get_id(ggml_backend_dev_t dev) { static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; ggml_cuda_set_device(ctx->device); @@ -133,7 +131,7 @@ index 87c6c34a4..b075a18be 100644 CUDA_CHECK(cudaMemGetInfo(free, total)); } -@@ -3512,6 +3554,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend +@@ -3504,6 +3546,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend return GGML_BACKEND_DEVICE_TYPE_GPU; } @@ -141,7 +139,7 @@ index 87c6c34a4..b075a18be 100644 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; -@@ -3525,6 +3568,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back 
+@@ -3517,6 +3560,19 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back // If you need the memory data, call ggml_backend_dev_memory() explicitly. props->memory_total = props->memory_free = 0; @@ -161,7 +159,7 @@ index 87c6c34a4..b075a18be 100644 bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr; #ifdef GGML_CUDA_NO_PEER_COPY bool events = false; -@@ -4087,6 +4143,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { +@@ -4079,6 +4135,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { std::lock_guard lock(mutex); if (!initialized) { ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context; @@ -169,7 +167,7 @@ index 87c6c34a4..b075a18be 100644 for (int i = 0; i < ggml_cuda_info().device_count; i++) { ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context; -@@ -4102,6 +4159,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { +@@ -4094,6 +4151,14 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID); dev_ctx->pci_bus_id = pci_bus_id; @@ -225,10 +223,10 @@ index d0fb3bcca..b63edd0c1 100644 } #endif diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp -index f2ff9f322..f356e4a0a 100644 +index 05ff6a5a6..032dee76d 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp -@@ -535,6 +535,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen +@@ -537,6 +537,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen GGML_UNUSED(dev); } @@ -236,7 +234,7 @@ index f2ff9f322..f356e4a0a 100644 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { props->name = ggml_backend_metal_device_get_name(dev); props->description = ggml_backend_metal_device_get_description(dev); -@@ -543,6 +544,7 @@ static void 
ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac +@@ -545,6 +546,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); @@ -245,7 +243,7 @@ index f2ff9f322..f356e4a0a 100644 /* .async = */ true, /* .host_buffer = */ false, diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index ed83236f4..0bbcecd01 100644 +index bd3ece516..7cfb14a54 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -231,6 +231,7 @@ class vk_memory_logger; @@ -286,7 +284,7 @@ index ed83236f4..0bbcecd01 100644 // backend interface #define UNUSED GGML_UNUSED -@@ -12391,31 +12415,103 @@ void ggml_backend_vk_get_device_description(int device, char * description, size +@@ -12392,31 +12416,102 @@ void ggml_backend_vk_get_device_description(int device, char * description, size ggml_vk_get_device_description(dev_idx, description, description_size); } @@ -309,7 +307,6 @@ index ed83236f4..0bbcecd01 100644 + std::string pci_id; + std::string id; + std::string uuid; -+ std::string numeric_id; + int major; + int minor; + int driver_major; @@ -407,7 +404,7 @@ index ed83236f4..0bbcecd01 100644 break; } } -@@ -12448,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { +@@ -12449,8 +12544,13 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { } } @@ -422,7 +419,7 @@ index ed83236f4..0bbcecd01 100644 } vk::PhysicalDeviceProperties2 props = {}; -@@ -12466,19 +12567,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { +@@ -12467,19 +12567,24 @@ static std::string ggml_backend_vk_get_device_pci_id(int device_idx) { char pci_bus_id[16] = {}; snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function); @@ -456,7 +453,7 @@ index ed83236f4..0bbcecd01 100644 static const char * 
ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; -@@ -12490,9 +12596,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de +@@ -12491,9 +12596,14 @@ static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t de return ctx->description.c_str(); } @@ -472,7 +469,7 @@ index ed83236f4..0bbcecd01 100644 } static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) { -@@ -12516,8 +12627,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml +@@ -12517,8 +12627,9 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml props->name = ggml_backend_vk_device_get_name(dev); props->description = ggml_backend_vk_device_get_description(dev); @@ -483,7 +480,7 @@ index ed83236f4..0bbcecd01 100644 ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { /* .async = */ false, -@@ -12525,6 +12637,14 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml +@@ -12526,6 +12637,13 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml /* .buffer_from_host_ptr = */ false, /* .events = */ false, }; @@ -494,11 +491,10 @@ index ed83236f4..0bbcecd01 100644 + props->driver_minor = ctx->driver_minor; + props->integrated = ctx->is_integrated_gpu; + props->library = GGML_VK_NAME; -+ props->numeric_id = ctx->numeric_id.c_str(); } static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { -@@ -12953,6 +13073,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, +@@ -12954,6 +13072,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { @@ -507,7 +503,7 @@ index ed83236f4..0bbcecd01 100644 for 
(int i = 0; i < ggml_backend_vk_get_device_count(); i++) { ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; char desc[256]; -@@ -12961,12 +13083,42 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, +@@ -12962,12 +13082,41 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, ctx->name = GGML_VK_NAME + std::to_string(i); ctx->description = desc; ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu; @@ -547,7 +543,6 @@ index ed83236f4..0bbcecd01 100644 + // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string + ctx->driver_major = 0; + ctx->driver_minor = 0; -+ ctx->numeric_id = std::to_string(i); } initialized = true; } diff --git a/llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch b/llama/patches/0030-Add-memory-detection-using-DXGI-PDH.patch similarity index 98% rename from llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch rename to llama/patches/0030-Add-memory-detection-using-DXGI-PDH.patch index 8e5461cbb2..ebadc82b43 100644 --- a/llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch +++ b/llama/patches/0030-Add-memory-detection-using-DXGI-PDH.patch @@ -38,7 +38,7 @@ index b63edd0c1..81cad8cf3 100644 #ifdef __cplusplus } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp -index cc68e7968..27d6574da 100644 +index 7cfb14a54..a1c46d0b3 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher(); @@ -54,10 +54,10 @@ index cc68e7968..27d6574da 100644 std::string id; std::string uuid; + std::string luid; - std::string numeric_id; int major; int minor; -@@ -12449,8 +12451,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size + int driver_major; +@@ -12448,8 +12450,22 @@ void 
ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); vk::PhysicalDeviceProperties2 props2; vkdev.getProperties2(&props2); @@ -81,7 +81,7 @@ index cc68e7968..27d6574da 100644 { // Use vendor specific management libraries for best VRAM reporting if available switch (props2.properties.vendorID) { -@@ -12478,8 +12494,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size +@@ -12477,8 +12493,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size break; } } @@ -91,7 +91,7 @@ index cc68e7968..27d6574da 100644 *total = 0; *free = 0; vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props; -@@ -13091,7 +13107,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, +@@ -13089,7 +13105,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, /* .reg = */ reg, /* .context = */ ctx, }); @@ -99,7 +99,7 @@ index cc68e7968..27d6574da 100644 // Gather additional information about the device int dev_idx = vk_instance.device_indices[i]; vk::PhysicalDeviceProperties props1; -@@ -13114,6 +13129,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, +@@ -13112,6 +13127,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, } } ctx->uuid = oss.str(); diff --git a/llama/patches/0032-interleave-multi-rope.patch b/llama/patches/0031-interleave-multi-rope.patch similarity index 98% rename from llama/patches/0032-interleave-multi-rope.patch rename to llama/patches/0031-interleave-multi-rope.patch index eb41639e6a..6a8be51e74 100644 --- a/llama/patches/0032-interleave-multi-rope.patch +++ b/llama/patches/0031-interleave-multi-rope.patch @@ -1,6 +1,6 @@ From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Michael Yang -Date: Web, 16 Oct 2025 20:37:19 -0700 +Date: Thu, 16 Oct 2025 20:37:19 
-0700 Subject: [PATCH] interleave multi rope since ollama doesn't use mrope for anything else, change it to mean the @@ -85,7 +85,7 @@ index 375a0c7fd..9866c96b4 100644 // end of mrope diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp -index 111286b49..6fc2b42f8 100644 +index 111286b49..633dc20ff 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp @@ -31,19 +31,13 @@ void main() { diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index c02926b359..5fa0a9ec0e 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -730,10 +730,6 @@ func (b *Backend) BackendDevices() []ml.DeviceInfo { info.PCIID = C.GoString(props.device_id) } info.LibraryPath = ggml.LibPaths() - if props.numeric_id != nil { - info.FilteredID = C.GoString(props.numeric_id) - } - C.ggml_backend_dev_memory(dev, &props.memory_free, &props.memory_total) info.TotalMemory = (uint64)(props.memory_total) info.FreeMemory = (uint64)(props.memory_free) diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 1cab4bb3f2..6510e0cba1 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -175,8 +175,6 @@ extern "C" { int compute_minor; int integrated; const char *library; - // number with which the devices are accessed (Vulkan) - const char *numeric_id; }; GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 27d6574da6..a1c46d0b3d 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -12435,7 +12435,6 @@ struct ggml_backend_vk_device_context { std::string id; std::string uuid; std::string luid; - std::string numeric_id; int 
major; int minor; int driver_major; @@ -12661,7 +12660,6 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml props->driver_minor = ctx->driver_minor; props->integrated = ctx->is_integrated_gpu; props->library = GGML_VK_NAME; - props->numeric_id = ctx->numeric_id.c_str(); } static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { @@ -13142,7 +13140,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string ctx->driver_major = 0; ctx->driver_minor = 0; - ctx->numeric_id = std::to_string(i); } initialized = true; } diff --git a/ml/device.go b/ml/device.go index 70e0c6a3d7..dc91359f40 100644 --- a/ml/device.go +++ b/ml/device.go @@ -257,7 +257,7 @@ type DeviceInfo struct { // FilterID is populated with the unfiltered device ID if a numeric ID is used // so the device can be included. - FilteredID string `json:"filtered_id,omitempty"` + FilterID string `json:"filter_id,omitempty"` // Integrated is set true for integrated GPUs, false for Discrete GPUs Integrated bool `json:"integration,omitempty"` @@ -455,6 +455,35 @@ func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string { return env } +// NeedsInitValidation returns true if the device in question has the potential +// to crash at inference time and requires deeper validation before we include +// it in the supported devices list. +func (d DeviceInfo) NeedsInitValidation() bool { + // At this time the only library we know needs a 2nd pass is ROCm since + // rocblas will crash on unsupported devices. 
We want to find those crashes + // during bootstrap discovery so we can eliminate those GPUs before the user + // tries to run inference on them + return d.Library == "ROCm" +} + +// Set the init validation environment variable +func (d DeviceInfo) AddInitValidation(env map[string]string) { + env["GGML_CUDA_INIT"] = "1" // force deep initialization to trigger crash on unsupported GPUs +} + +// PreferredLibrary returns true if this library is preferred over the other input +// library +// Used to filter out Vulkan in favor of CUDA or ROCm +func (d DeviceInfo) PreferredLibrary(other DeviceInfo) bool { + // TODO in the future if we find Vulkan is better than ROCm on some devices + // that implementation can live here. + + if d.Library == "CUDA" || d.Library == "ROCm" { + return true + } + return false +} + func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) { var envVar string switch d.Library { @@ -472,8 +501,8 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) { if existing { v = v + "," } - if d.FilteredID != "" { - v = v + d.FilteredID + if d.FilterID != "" { + v = v + d.FilterID } else { v = v + d.ID }