From 220e133fca8fe128dbf8fecef96c8484f991e39c Mon Sep 17 00:00:00 2001 From: virajwad <84867530+virajwad@users.noreply.github.com> Date: Tue, 4 Nov 2025 15:11:55 -0700 Subject: [PATCH] vulkan: Add memory detection for Intel GPU using DXGI+PDH (#12664) * PDH free memory skeleton * Add PDH printing * Add LUID support for Vulkan * wire luid from ggml-vulkan to mem-dxgi-pdh file * Fix to ggml-impl * Continue skeleton * Implemented ggml_dxgi_pdh_get_device_memory * fix comments * Fix - change value GB to bytes * add ifdefs to only support windows and not linux * modify error codes * Finished ggml_dxgi_pdh_init() function * completed ggml_dxgi_pdh_release() * Formatting changes, add static to functions * fix build errors * fix go build error * fix luid - now should match between dxgi and vulkan * Fix the free memory reporting (was using copy by value, change to reference) * keep only dxgi1_2.h * Modifications based on PR feedback * fix merge conflicts (2) and fix desc1.description printout * move dxgi + pdh api calls to before the vendor specific library calls * change from 3 samples to 1 sample for PDH * modify when old_mode is set * add fix for building MacOS * fix release and returns for other vendors * add patch file --- ...-Add-memory-detection-using-DXGI-PDH.patch | 420 ++++++++++++++++++ ml/backend/ggml/ggml/src/CMakeLists.txt | 1 + ml/backend/ggml/ggml/src/ggml-impl.h | 3 + .../ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp | 29 +- ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp | 297 +++++++++++++ 5 files changed, 747 insertions(+), 3 deletions(-) create mode 100644 llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch create mode 100644 ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp diff --git a/llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch b/llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch new file mode 100644 index 0000000000..8e5461cbb2 --- /dev/null +++ b/llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch @@ -0,0 +1,420 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Viraj Wadhwa +Date: Tue, 4 Nov 2025 12:04:04 -0800 +Subject: [PATCH] Add memory detection using DXGI + PDH + +--- + ggml/src/CMakeLists.txt | 1 + + ggml/src/ggml-impl.h | 3 + + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 29 ++- + ggml/src/mem_dxgi_pdh.cpp | 297 +++++++++++++++++++++++++++ + 4 files changed, 327 insertions(+), 3 deletions(-) + create mode 100644 ggml/src/mem_dxgi_pdh.cpp + +diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt +index aefe43bdd..21fe4640c 100644 +--- a/ggml/src/CMakeLists.txt ++++ b/ggml/src/CMakeLists.txt +@@ -211,6 +211,7 @@ add_library(ggml-base + ggml-quants.h + mem_hip.cpp + mem_nvml.cpp ++ mem_dxgi_pdh.cpp + gguf.cpp) + + target_include_directories(ggml-base PRIVATE .) +diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h +index b63edd0c1..81cad8cf3 100644 +--- a/ggml/src/ggml-impl.h ++++ b/ggml/src/ggml-impl.h +@@ -645,6 +645,9 @@ GGML_API void ggml_nvml_release(); + GGML_API int ggml_hip_mgmt_init(); + GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total); + GGML_API void ggml_hip_mgmt_release(); ++GGML_API int ggml_dxgi_pdh_init(); ++GGML_API int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu); ++GGML_API void ggml_dxgi_pdh_release(); + + #ifdef __cplusplus + } +diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +index cc68e7968..27d6574da 100644 +--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp ++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +@@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher(); + #define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16" + #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000) + #define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000) ++#define VK_LUID_SIZE_KHR VK_LUID_SIZE + + typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR { + VkStructureType sType; +@@ -12433,6 +12434,7 @@ struct ggml_backend_vk_device_context { + std::string pci_id; + std::string id; + std::string uuid; ++ std::string luid; + std::string numeric_id; + int major; + int minor; +@@ -12449,8 +12451,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size + vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); + vk::PhysicalDeviceProperties2 props2; + vkdev.getProperties2(&props2); ++ GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: uuid %s\n", ctx->uuid.c_str()); ++ GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: luid %s\n", ctx->luid.c_str()); ++ ++ // Check VRAM reporting for Windows IGPU/DGPU using DXGI + PDH (vendor agnostic) ++ if (ggml_dxgi_pdh_init() == 0) { ++ GGML_LOG_DEBUG("DXGI + PDH Initialized. Getting GPU free memory info\n"); ++ int status = ggml_dxgi_pdh_get_device_memory(ctx->luid.c_str(), free, total, ctx->is_integrated_gpu); ++ if (status == 0) { ++ GGML_LOG_DEBUG("%s utilizing DXGI + PDH memory reporting free: %zu total: %zu\n", __func__, *free, *total); ++ ggml_dxgi_pdh_release(); ++ return; ++ } ++ ggml_dxgi_pdh_release(); ++ } + +- if (!ctx->is_integrated_gpu) ++ if (!ctx->is_integrated_gpu) + { + // Use vendor specific management libraries for best VRAM reporting if available + switch (props2.properties.vendorID) { +@@ -12478,8 +12494,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size + break; + } + } +- // else fallback to memory budget if supported + ++ // else fallback to memory budget if supported + *total = 0; + *free = 0; + vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props; +@@ -13091,7 +13107,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, + /* .reg = */ reg, + /* .context = */ ctx, + }); +- + // Gather additional information about the device + int dev_idx = vk_instance.device_indices[i]; + vk::PhysicalDeviceProperties props1; +@@ -13114,6 +13129,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, + } + } + ctx->uuid = oss.str(); ++ const auto& luid = device_id_props.deviceLUID; ++ char luid_str[32]; // "0x" + 16 hex digits + null terminator = 19 chars ++ snprintf(luid_str, sizeof(luid_str), // high part + low part ++ "0x%02x%02x%02x%02x%02x%02x%02x%02x", ++ luid[7], luid[6], luid[5], luid[4], ++ luid[3], luid[2], luid[1], luid[0] ++ ); ++ ctx->luid = std::string(luid_str); + ctx->major = 0; + ctx->minor = 0; + // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string +diff --git a/ggml/src/mem_dxgi_pdh.cpp b/ggml/src/mem_dxgi_pdh.cpp +new file mode 100644 +index 000000000..2f395761c +--- /dev/null ++++ b/ggml/src/mem_dxgi_pdh.cpp +@@ -0,0 +1,297 @@ ++// DXGI and PDH Performance Counters Library ++// This Windows-only (10/11) library provides accurate VRAM reporting ++#include "ggml.h" ++#include "ggml-impl.h" ++ ++#ifdef _WIN32 ++# define WIN32_LEAN_AND_MEAN ++# ifndef NOMINMAX ++# define NOMINMAX ++# endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++namespace fs = std::filesystem; ++ ++static std::mutex ggml_dxgi_pdh_lock; ++ ++/* ++Struct to keep track of GPU adapter information at runtime ++*/ ++struct GpuInfo { ++ std::wstring description; // debug field ++ LUID luid; ++ std::wstring pdhInstance; ++ double dedicatedTotal = 0.0; ++ double sharedTotal = 0.0; ++ double dedicatedUsage = 0.0; ++ double sharedUsage = 0.0; ++}; ++ ++/* ++DLL Function Pointers ++*/ ++struct { ++ void *dxgi_dll_handle; ++ void *pdh_dll_handle; ++ // DXGI Functions ++ HRESULT (*CreateDXGIFactory1)(REFIID riid, void **ppFactory); ++ // PDH functions ++ PDH_STATUS (*PdhOpenQueryW)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery); ++ PDH_STATUS (*PdhAddCounterW)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter); ++ PDH_STATUS (*PdhCollectQueryData)(PDH_HQUERY hQuery); ++ PDH_STATUS (*PdhGetFormattedCounterValue)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue); ++ PDH_STATUS (*PdhCloseQuery)(PDH_HQUERY hQuery); ++} dll_functions { ++ nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr ++}; ++ ++/* ++Create a PDH Instance name ++*/ ++static std::wstring generate_pdh_instance_name_from_luid(const LUID& luid) { ++ std::wstringstream ss; ++ ss << L"luid_0x" << std::hex << std::setw(8) << std::setfill(L'0') << std::uppercase << luid.HighPart ++ << L"_0x" << std::setw(8) << std::setfill(L'0') << luid.LowPart; ++ return ss.str(); ++} ++ ++/* ++Conversion from Bytes to GigaBytes ++*/ ++template ++static inline double b_to_gb(T n) ++{ ++ return (double(n) / (1024.0 * 1024 * 1024)); ++} ++ ++/* ++Fetch the GPU adapter 'dedicated memory' and 'shared memory' using DXGI ++*/ ++static void fetch_dxgi_adapter_desc1(const DXGI_ADAPTER_DESC1& desc, GpuInfo* info) { ++ auto dedicatedVideoMemory = desc.DedicatedVideoMemory; ++ auto sharedSystemMemory = desc.SharedSystemMemory; ++ GGML_LOG_DEBUG("[DXGI] Adapter Description: %ls, LUID: 0x%08X%08X, Dedicated: %.2f GB, Shared: %.2f GB\n", desc.Description, desc.AdapterLuid.HighPart, desc.AdapterLuid.LowPart, b_to_gb(dedicatedVideoMemory), b_to_gb(sharedSystemMemory)); ++ if (info) { ++ info->dedicatedTotal = dedicatedVideoMemory; // values in bytes ++ info->sharedTotal = sharedSystemMemory; ++ } ++} ++ ++/* ++Enumerate over the GPU adapters detected using DXGI and return their information ++*/ ++static std::vector get_dxgi_gpu_infos() { ++ std::vector infos; ++ IDXGIFactory1* pFactory = nullptr; ++ ++ if (SUCCEEDED(dll_functions.CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory))) { ++ UINT i = 0; ++ IDXGIAdapter1* pAdapter = nullptr; ++ while (pFactory->EnumAdapters1(i, &pAdapter) != DXGI_ERROR_NOT_FOUND) { ++ DXGI_ADAPTER_DESC1 desc; ++ pAdapter->GetDesc1(&desc); ++ ++ // Get all the GPU adapter info ++ GpuInfo info; ++ fetch_dxgi_adapter_desc1(desc, &info); ++ info.description = std::wstring(desc.Description); ++ info.luid = desc.AdapterLuid; ++ info.pdhInstance = generate_pdh_instance_name_from_luid(desc.AdapterLuid); ++ infos.push_back(info); ++ ++ pAdapter->Release(); ++ ++i; ++ } ++ pFactory->Release(); ++ } ++ return infos; ++} ++ ++static bool get_gpu_memory_usage(GpuInfo& gpu) { ++ PDH_HQUERY query; ++ if (dll_functions.PdhOpenQueryW(NULL, 0, &query) != ERROR_SUCCESS) { ++ return false; ++ } ++ ++ struct GpuCounters { ++ PDH_HCOUNTER dedicated; ++ PDH_HCOUNTER shared; ++ }; ++ ++ GpuCounters gpuCounter{}; ++ ++ std::wstring dedicatedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Dedicated Usage"; ++ std::wstring sharedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Shared Usage"; ++ ++ if (dll_functions.PdhAddCounterW(query, dedicatedPath.c_str(), 0, &gpuCounter.dedicated) != ERROR_SUCCESS || ++ dll_functions.PdhAddCounterW(query, sharedPath.c_str(), 0, &gpuCounter.shared) != ERROR_SUCCESS) { ++ GGML_LOG_ERROR("Failed to add PDH counters for GPU %s\n", std::string(gpu.pdhInstance.begin(), gpu.pdhInstance.end()).c_str()); ++ dll_functions.PdhCloseQuery(query); ++ return false; ++ } ++ ++ // Sample the data ++ if (dll_functions.PdhCollectQueryData(query) != ERROR_SUCCESS) { ++ dll_functions.PdhCloseQuery(query); ++ return false; ++ } ++ ++ // Read final values ++ PDH_FMT_COUNTERVALUE val; ++ ++ if (dll_functions.PdhGetFormattedCounterValue(gpuCounter.dedicated, PDH_FMT_DOUBLE, NULL, &val) == ERROR_SUCCESS) ++ gpu.dedicatedUsage = val.doubleValue; ++ ++ if (dll_functions.PdhGetFormattedCounterValue(gpuCounter.shared, PDH_FMT_DOUBLE, NULL, &val) == ERROR_SUCCESS) ++ gpu.sharedUsage = val.doubleValue; ++ ++ dll_functions.PdhCloseQuery(query); ++ return true; ++} ++ ++ ++extern "C" { ++ ++ int ggml_dxgi_pdh_init() { ++ GGML_LOG_DEBUG("%s called\n", __func__); ++ std::lock_guard lock(ggml_dxgi_pdh_lock); ++ if (dll_functions.dxgi_dll_handle != NULL && dll_functions.pdh_dll_handle != NULL) { ++ // Already initialized as we have both DLL handles ++ return ERROR_SUCCESS; ++ } ++ ++ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); ++ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); ++ fs::path libPath_dxgi = fs::path("\\Windows") / fs::path("System32") / fs::path("dxgi.dll"); ++ fs::path libPath_pdh = fs::path("\\Windows") / fs::path("System32") / fs::path("pdh.dll"); ++ ++ // Call LoadLibraryW on both DLLs to ensure they are loaded ++ void *dxgi = (void*)LoadLibraryW(libPath_dxgi.wstring().c_str()); ++ void *pdh = (void*)LoadLibraryW(libPath_pdh.wstring().c_str()); ++ if(dxgi == NULL || pdh == NULL) { ++ if (dxgi != NULL) { ++ FreeLibrary((HMODULE)(dxgi)); ++ } ++ if (pdh != NULL) { ++ FreeLibrary((HMODULE)(pdh)); ++ } ++ SetErrorMode(old_mode); ++ return ERROR_DLL_NOT_FOUND; ++ } ++ else { ++ // save the dll handles ++ dll_functions.dxgi_dll_handle = dxgi; ++ dll_functions.pdh_dll_handle = pdh; ++ } ++ ++ // Get pointers to the library functions loaded by the DLLs ++ dll_functions.CreateDXGIFactory1 = (HRESULT (*)(REFIID riid, void **ppFactory)) GetProcAddress((HMODULE)(dll_functions.dxgi_dll_handle), "CreateDXGIFactory1"); ++ dll_functions.PdhOpenQueryW = (PDH_STATUS (*)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhOpenQueryW"); ++ dll_functions.PdhAddCounterW = (PDH_STATUS (*)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhAddCounterW"); ++ dll_functions.PdhCollectQueryData = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCollectQueryData"); ++ dll_functions.PdhGetFormattedCounterValue = (PDH_STATUS (*)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhGetFormattedCounterValue"); ++ dll_functions.PdhCloseQuery = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCloseQuery"); ++ ++ SetErrorMode(old_mode); // set old mode before any return ++ ++ // Check if any function pointers are NULL (not found) ++ if (dll_functions.CreateDXGIFactory1 == NULL || dll_functions.PdhOpenQueryW == NULL || dll_functions.PdhAddCounterW == NULL || dll_functions.PdhCollectQueryData == NULL || dll_functions.PdhGetFormattedCounterValue == NULL || dll_functions.PdhCloseQuery == NULL) { ++ GGML_LOG_INFO("%s unable to locate required symbols in either dxgi.dll or pdh.dll", __func__); ++ FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle)); ++ FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle)); ++ dll_functions.dxgi_dll_handle = NULL; ++ dll_functions.pdh_dll_handle = NULL; ++ return ERROR_PROC_NOT_FOUND; ++ } ++ ++ // No other initializations needed, successfully loaded the libraries and functions! ++ return ERROR_SUCCESS; ++ } ++ ++ void ggml_dxgi_pdh_release() { ++ std::lock_guard lock(ggml_dxgi_pdh_lock); ++ if (dll_functions.dxgi_dll_handle == NULL && dll_functions.pdh_dll_handle == NULL) { ++ // Already freed the DLLs ++ return; ++ } ++ ++ // Call FreeLibrary on both DLLs ++ FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle)); ++ FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle)); ++ ++ dll_functions.dxgi_dll_handle = NULL; ++ dll_functions.pdh_dll_handle = NULL; ++ ++ return; // successfully released ++ } ++ ++ int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) { ++ ++ std::lock_guard lock(ggml_dxgi_pdh_lock); ++ ++ // Enumerate GPUs using DXGI and find the matching LUID ++ // This also fetches the total memory info for each of the enumerated GPUs ++ std::vector gpus = get_dxgi_gpu_infos(); ++ GpuInfo *targetGpu = nullptr; ++ for (auto& gpu : gpus) { ++ char luid_buffer[32]; // "0x" + 16 hex digits + null terminator ++ snprintf(luid_buffer, sizeof(luid_buffer), "0x%08x%08x", gpu.luid.HighPart, gpu.luid.LowPart); ++ std::string gpu_luid_str(luid_buffer); ++ if (gpu_luid_str == std::string(luid)) { ++ targetGpu = &gpu; ++ break; ++ } ++ } ++ if (!targetGpu) { ++ GGML_LOG_ERROR("GPU with LUID %s not found.\n", luid); ++ return ERROR_NOT_FOUND; ++ } ++ ++ // Get the current memory usage for the target GPU ++ int status = get_gpu_memory_usage(*targetGpu); ++ if (!status) { ++ GGML_LOG_ERROR("Failed to get GPU memory usage.\n"); ++ return ERROR_DEVICE_NOT_AVAILABLE; ++ } ++ ++ // Calculate the free memory based on whether it's an integrated or discrete GPU ++ if (is_integrated_gpu) { ++ // IGPU free = SharedTotal - SharedUsage ++ GGML_LOG_DEBUG("Integrated GPU (%ls) with LUID %s detected. Shared Total: %.2f bytes (%.2f GB), Shared Usage: %.2f bytes (%.2f GB), Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->sharedTotal, b_to_gb(targetGpu->sharedTotal), targetGpu->sharedUsage, b_to_gb(targetGpu->sharedUsage), targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage)); ++ *free = (targetGpu->sharedTotal - targetGpu->sharedUsage) + (targetGpu->dedicatedTotal - targetGpu->dedicatedUsage); // Some IGPUs also have dedicated memory, which can be used along with the IGPU's shared memory ++ *total = targetGpu->sharedTotal + targetGpu->dedicatedTotal; ++ } ++ else { ++ // DGPU free = DedicatedTotal - DedicatedUsage ++ GGML_LOG_DEBUG("Discrete GPU (%ls) with LUID %s detected. Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage)); ++ *free = targetGpu->dedicatedTotal - targetGpu->dedicatedUsage; ++ *total = targetGpu->dedicatedTotal; ++ } ++ ++ return ERROR_SUCCESS; ++ } ++ ++} // extern "C" ++ ++#else // #ifdef _WIN32 ++ ++extern "C" { ++ ++ // DXGI + PDH not available for Linux implementation ++ int ggml_dxgi_pdh_init() { ++ return -1; ++ } ++ void ggml_dxgi_pdh_release() {} ++ int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) { ++ return -1; ++ } ++ ++} // extern "C" ++ ++#endif // #ifdef _WIN32 +\ No newline at end of file diff --git a/ml/backend/ggml/ggml/src/CMakeLists.txt b/ml/backend/ggml/ggml/src/CMakeLists.txt index aefe43bdd5..21fe4640c1 100644 --- a/ml/backend/ggml/ggml/src/CMakeLists.txt +++ b/ml/backend/ggml/ggml/src/CMakeLists.txt @@ -211,6 +211,7 @@ add_library(ggml-base ggml-quants.h mem_hip.cpp mem_nvml.cpp + mem_dxgi_pdh.cpp gguf.cpp) target_include_directories(ggml-base PRIVATE .) diff --git a/ml/backend/ggml/ggml/src/ggml-impl.h b/ml/backend/ggml/ggml/src/ggml-impl.h index b63edd0c14..81cad8cf33 100644 --- a/ml/backend/ggml/ggml/src/ggml-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-impl.h @@ -645,6 +645,9 @@ GGML_API void ggml_nvml_release(); GGML_API int ggml_hip_mgmt_init(); GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total); GGML_API void ggml_hip_mgmt_release(); +GGML_API int ggml_dxgi_pdh_init(); +GGML_API int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu); +GGML_API void ggml_dxgi_pdh_release(); #ifdef __cplusplus } diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp index cc68e79686..27d6574da6 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher(); #define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME "VK_KHR_shader_bfloat16" #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000) #define VK_COMPONENT_TYPE_BFLOAT16_KHR ((VkComponentTypeKHR)1000141000) +#define VK_LUID_SIZE_KHR VK_LUID_SIZE typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR { VkStructureType sType; @@ -12433,6 +12434,7 @@ struct ggml_backend_vk_device_context { std::string pci_id; std::string id; std::string uuid; + std::string luid; std::string numeric_id; int major; int minor; @@ -12449,8 +12451,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); vk::PhysicalDeviceProperties2 props2; vkdev.getProperties2(&props2); + GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: uuid %s\n", ctx->uuid.c_str()); + GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: luid %s\n", ctx->luid.c_str()); - if (!ctx->is_integrated_gpu) + // Check VRAM reporting for Windows IGPU/DGPU using DXGI + PDH (vendor agnostic) + if (ggml_dxgi_pdh_init() == 0) { + GGML_LOG_DEBUG("DXGI + PDH Initialized. Getting GPU free memory info\n"); + int status = ggml_dxgi_pdh_get_device_memory(ctx->luid.c_str(), free, total, ctx->is_integrated_gpu); + if (status == 0) { + GGML_LOG_DEBUG("%s utilizing DXGI + PDH memory reporting free: %zu total: %zu\n", __func__, *free, *total); + ggml_dxgi_pdh_release(); + return; + } + ggml_dxgi_pdh_release(); + } + + if (!ctx->is_integrated_gpu) { // Use vendor specific management libraries for best VRAM reporting if available switch (props2.properties.vendorID) { @@ -12478,8 +12494,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size break; } } - // else fallback to memory budget if supported + // else fallback to memory budget if supported *total = 0; *free = 0; vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props; @@ -13091,7 +13107,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, /* .reg = */ reg, /* .context = */ ctx, }); - // Gather additional information about the device int dev_idx = vk_instance.device_indices[i]; vk::PhysicalDeviceProperties props1; @@ -13114,6 +13129,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, } } ctx->uuid = oss.str(); + const auto& luid = device_id_props.deviceLUID; + char luid_str[32]; // "0x" + 16 hex digits + null terminator = 19 chars + snprintf(luid_str, sizeof(luid_str), // high part + low part + "0x%02x%02x%02x%02x%02x%02x%02x%02x", + luid[7], luid[6], luid[5], luid[4], + luid[3], luid[2], luid[1], luid[0] + ); + ctx->luid = std::string(luid_str); ctx->major = 0; ctx->minor = 0; // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string diff --git a/ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp b/ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp new file mode 100644 index 0000000000..2f395761c5 --- /dev/null +++ b/ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp @@ -0,0 +1,297 @@ +// DXGI and PDH Performance Counters Library +// This Windows-only (10/11) library provides accurate VRAM reporting +#include "ggml.h" +#include "ggml-impl.h" + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +static std::mutex ggml_dxgi_pdh_lock; + +/* +Struct to keep track of GPU adapter information at runtime +*/ +struct GpuInfo { + std::wstring description; // debug field + LUID luid; + std::wstring pdhInstance; + double dedicatedTotal = 0.0; + double sharedTotal = 0.0; + double dedicatedUsage = 0.0; + double sharedUsage = 0.0; +}; + +/* +DLL Function Pointers +*/ +struct { + void *dxgi_dll_handle; + void *pdh_dll_handle; + // DXGI Functions + HRESULT (*CreateDXGIFactory1)(REFIID riid, void **ppFactory); + // PDH functions + PDH_STATUS (*PdhOpenQueryW)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery); + PDH_STATUS (*PdhAddCounterW)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter); + PDH_STATUS (*PdhCollectQueryData)(PDH_HQUERY hQuery); + PDH_STATUS (*PdhGetFormattedCounterValue)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue); + PDH_STATUS (*PdhCloseQuery)(PDH_HQUERY hQuery); +} dll_functions { + nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr +}; + +/* +Create a PDH Instance name +*/ +static std::wstring generate_pdh_instance_name_from_luid(const LUID& luid) { + std::wstringstream ss; + ss << L"luid_0x" << std::hex << std::setw(8) << std::setfill(L'0') << std::uppercase << luid.HighPart + << L"_0x" << std::setw(8) << std::setfill(L'0') << luid.LowPart; + return ss.str(); +} + +/* +Conversion from Bytes to GigaBytes +*/ +template +static inline double b_to_gb(T n) +{ + return (double(n) / (1024.0 * 1024 * 1024)); +} + +/* +Fetch the GPU adapter 'dedicated memory' and 'shared memory' using DXGI +*/ +static void fetch_dxgi_adapter_desc1(const DXGI_ADAPTER_DESC1& desc, GpuInfo* info) { + auto dedicatedVideoMemory = desc.DedicatedVideoMemory; + auto sharedSystemMemory = desc.SharedSystemMemory; + GGML_LOG_DEBUG("[DXGI] Adapter Description: %ls, LUID: 0x%08X%08X, Dedicated: %.2f GB, Shared: %.2f GB\n", desc.Description, desc.AdapterLuid.HighPart, desc.AdapterLuid.LowPart, b_to_gb(dedicatedVideoMemory), b_to_gb(sharedSystemMemory)); + if (info) { + info->dedicatedTotal = dedicatedVideoMemory; // values in bytes + info->sharedTotal = sharedSystemMemory; + } +} + +/* +Enumerate over the GPU adapters detected using DXGI and return their information +*/ +static std::vector get_dxgi_gpu_infos() { + std::vector infos; + IDXGIFactory1* pFactory = nullptr; + + if (SUCCEEDED(dll_functions.CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory))) { + UINT i = 0; + IDXGIAdapter1* pAdapter = nullptr; + while (pFactory->EnumAdapters1(i, &pAdapter) != DXGI_ERROR_NOT_FOUND) { + DXGI_ADAPTER_DESC1 desc; + pAdapter->GetDesc1(&desc); + + // Get all the GPU adapter info + GpuInfo info; + fetch_dxgi_adapter_desc1(desc, &info); + info.description = std::wstring(desc.Description); + info.luid = desc.AdapterLuid; + info.pdhInstance = generate_pdh_instance_name_from_luid(desc.AdapterLuid); + infos.push_back(info); + + pAdapter->Release(); + ++i; + } + pFactory->Release(); + } + return infos; +} + +static bool get_gpu_memory_usage(GpuInfo& gpu) { + PDH_HQUERY query; + if (dll_functions.PdhOpenQueryW(NULL, 0, &query) != ERROR_SUCCESS) { + return false; + } + + struct GpuCounters { + PDH_HCOUNTER dedicated; + PDH_HCOUNTER shared; + }; + + GpuCounters gpuCounter{}; + + std::wstring dedicatedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Dedicated Usage"; + std::wstring sharedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Shared Usage"; + + if (dll_functions.PdhAddCounterW(query, dedicatedPath.c_str(), 0, &gpuCounter.dedicated) != ERROR_SUCCESS || + dll_functions.PdhAddCounterW(query, sharedPath.c_str(), 0, &gpuCounter.shared) != ERROR_SUCCESS) { + GGML_LOG_ERROR("Failed to add PDH counters for GPU %s\n", std::string(gpu.pdhInstance.begin(), gpu.pdhInstance.end()).c_str()); + dll_functions.PdhCloseQuery(query); + return false; + } + + // Sample the data + if (dll_functions.PdhCollectQueryData(query) != ERROR_SUCCESS) { + dll_functions.PdhCloseQuery(query); + return false; + } + + // Read final values + PDH_FMT_COUNTERVALUE val; + + if (dll_functions.PdhGetFormattedCounterValue(gpuCounter.dedicated, PDH_FMT_DOUBLE, NULL, &val) == ERROR_SUCCESS) + gpu.dedicatedUsage = val.doubleValue; + + if (dll_functions.PdhGetFormattedCounterValue(gpuCounter.shared, PDH_FMT_DOUBLE, NULL, &val) == ERROR_SUCCESS) + gpu.sharedUsage = val.doubleValue; + + dll_functions.PdhCloseQuery(query); + return true; +} + + +extern "C" { + + int ggml_dxgi_pdh_init() { + GGML_LOG_DEBUG("%s called\n", __func__); + std::lock_guard lock(ggml_dxgi_pdh_lock); + if (dll_functions.dxgi_dll_handle != NULL && dll_functions.pdh_dll_handle != NULL) { + // Already initialized as we have both DLL handles + return ERROR_SUCCESS; + } + + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + fs::path libPath_dxgi = fs::path("\\Windows") / fs::path("System32") / fs::path("dxgi.dll"); + fs::path libPath_pdh = fs::path("\\Windows") / fs::path("System32") / fs::path("pdh.dll"); + + // Call LoadLibraryW on both DLLs to ensure they are loaded + void *dxgi = (void*)LoadLibraryW(libPath_dxgi.wstring().c_str()); + void *pdh = (void*)LoadLibraryW(libPath_pdh.wstring().c_str()); + if(dxgi == NULL || pdh == NULL) { + if (dxgi != NULL) { + FreeLibrary((HMODULE)(dxgi)); + } + if (pdh != NULL) { + FreeLibrary((HMODULE)(pdh)); + } + SetErrorMode(old_mode); + return ERROR_DLL_NOT_FOUND; + } + else { + // save the dll handles + dll_functions.dxgi_dll_handle = dxgi; + dll_functions.pdh_dll_handle = pdh; + } + + // Get pointers to the library functions loaded by the DLLs + dll_functions.CreateDXGIFactory1 = (HRESULT (*)(REFIID riid, void **ppFactory)) GetProcAddress((HMODULE)(dll_functions.dxgi_dll_handle), "CreateDXGIFactory1"); + dll_functions.PdhOpenQueryW = (PDH_STATUS (*)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhOpenQueryW"); + dll_functions.PdhAddCounterW = (PDH_STATUS (*)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhAddCounterW"); + dll_functions.PdhCollectQueryData = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCollectQueryData"); + dll_functions.PdhGetFormattedCounterValue = (PDH_STATUS (*)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhGetFormattedCounterValue"); + dll_functions.PdhCloseQuery = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCloseQuery"); + + SetErrorMode(old_mode); // set old mode before any return + + // Check if any function pointers are NULL (not found) + if (dll_functions.CreateDXGIFactory1 == NULL || dll_functions.PdhOpenQueryW == NULL || dll_functions.PdhAddCounterW == NULL || dll_functions.PdhCollectQueryData == NULL || dll_functions.PdhGetFormattedCounterValue == NULL || dll_functions.PdhCloseQuery == NULL) { + GGML_LOG_INFO("%s unable to locate required symbols in either dxgi.dll or pdh.dll", __func__); + FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle)); + FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle)); + dll_functions.dxgi_dll_handle = NULL; + dll_functions.pdh_dll_handle = NULL; + return ERROR_PROC_NOT_FOUND; + } + + // No other initializations needed, successfully loaded the libraries and functions! + return ERROR_SUCCESS; + } + + void ggml_dxgi_pdh_release() { + std::lock_guard lock(ggml_dxgi_pdh_lock); + if (dll_functions.dxgi_dll_handle == NULL && dll_functions.pdh_dll_handle == NULL) { + // Already freed the DLLs + return; + } + + // Call FreeLibrary on both DLLs + FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle)); + FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle)); + + dll_functions.dxgi_dll_handle = NULL; + dll_functions.pdh_dll_handle = NULL; + + return; // successfully released + } + + int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) { + + std::lock_guard lock(ggml_dxgi_pdh_lock); + + // Enumerate GPUs using DXGI and find the matching LUID + // This also fetches the total memory info for each of the enumerated GPUs + std::vector gpus = get_dxgi_gpu_infos(); + GpuInfo *targetGpu = nullptr; + for (auto& gpu : gpus) { + char luid_buffer[32]; // "0x" + 16 hex digits + null terminator + snprintf(luid_buffer, sizeof(luid_buffer), "0x%08x%08x", gpu.luid.HighPart, gpu.luid.LowPart); + std::string gpu_luid_str(luid_buffer); + if (gpu_luid_str == std::string(luid)) { + targetGpu = &gpu; + break; + } + } + if (!targetGpu) { + GGML_LOG_ERROR("GPU with LUID %s not found.\n", luid); + return ERROR_NOT_FOUND; + } + + // Get the current memory usage for the target GPU + int status = get_gpu_memory_usage(*targetGpu); + if (!status) { + GGML_LOG_ERROR("Failed to get GPU memory usage.\n"); + return ERROR_DEVICE_NOT_AVAILABLE; + } + + // Calculate the free memory based on whether it's an integrated or discrete GPU + if (is_integrated_gpu) { + // IGPU free = SharedTotal - SharedUsage + GGML_LOG_DEBUG("Integrated GPU (%ls) with LUID %s detected. Shared Total: %.2f bytes (%.2f GB), Shared Usage: %.2f bytes (%.2f GB), Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->sharedTotal, b_to_gb(targetGpu->sharedTotal), targetGpu->sharedUsage, b_to_gb(targetGpu->sharedUsage), targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage)); + *free = (targetGpu->sharedTotal - targetGpu->sharedUsage) + (targetGpu->dedicatedTotal - targetGpu->dedicatedUsage); // Some IGPUs also have dedicated memory, which can be used along with the IGPU's shared memory + *total = targetGpu->sharedTotal + targetGpu->dedicatedTotal; + } + else { + // DGPU free = DedicatedTotal - DedicatedUsage + GGML_LOG_DEBUG("Discrete GPU (%ls) with LUID %s detected. Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage)); + *free = targetGpu->dedicatedTotal - targetGpu->dedicatedUsage; + *total = targetGpu->dedicatedTotal; + } + + return ERROR_SUCCESS; + } + +} // extern "C" + +#else // #ifdef _WIN32 + +extern "C" { + + // DXGI + PDH not available for Linux implementation + int ggml_dxgi_pdh_init() { + return -1; + } + void ggml_dxgi_pdh_release() {} + int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) { + return -1; + } + +} // extern "C" + +#endif // #ifdef _WIN32 \ No newline at end of file