From 220e133fca8fe128dbf8fecef96c8484f991e39c Mon Sep 17 00:00:00 2001
From: virajwad <84867530+virajwad@users.noreply.github.com>
Date: Tue, 4 Nov 2025 15:11:55 -0700
Subject: [PATCH] vulkan: Add memory detection for Intel GPU using DXGI+PDH
 (#12664)

* PDH free memory skeleton

* Add PDH printing

* Add LUID support for Vulkan

* wire luid from ggml-vulkan to mem-dxgi-pdh file

* Fix to ggml-impl

* Continue skeleton

* Implemented ggml_dxgi_pdh_get_device_memory

* fix comments

* Fix - change value GB to bytes

* add ifdefs to only support windows and not linux

* modify error codes

* Finished ggml_dxgi_pdh_init() function

* completed ggml_dxgi_pdh_release()

* Formatting changes, add static to functions

* fix build errors

* fix go build error

* fix luid - now should match between dxgi and vulkan

* Fix the free memory reporting (was using copy by value, change to reference)

* keep only dxgi1_2.h

* Modifications based on PR feedback

* fix merge conflicts (2) and fix desc1.description printout

* move dxgi + pdh api calls to before the vendor specific library calls

* change from 3 samples to 1 sample for PDH

* modify when old_mode is set

* add fix for building MacOS

* fix release and returns for other vendors

* add patch file
---
 ...-Add-memory-detection-using-DXGI-PDH.patch | 420 ++++++++++++++++++
 ml/backend/ggml/ggml/src/CMakeLists.txt       |   1 +
 ml/backend/ggml/ggml/src/ggml-impl.h          |   3 +
 .../ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp |  29 +-
 ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp     | 297 +++++++++++++
 5 files changed, 747 insertions(+), 3 deletions(-)
 create mode 100644 llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch
 create mode 100644 ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp

diff --git a/llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch b/llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch
new file mode 100644
index 0000000000..8e5461cbb2
--- /dev/null
+++ b/llama/patches/0031-Add-memory-detection-using-DXGI-PDH.patch
@@ -0,0 +1,420 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Viraj Wadhwa <viraj.wadhwa@intel.com>
+Date: Tue, 4 Nov 2025 12:04:04 -0800
+Subject: [PATCH] Add memory detection using DXGI + PDH
+
+---
+ ggml/src/CMakeLists.txt              |   1 +
+ ggml/src/ggml-impl.h                 |   3 +
+ ggml/src/ggml-vulkan/ggml-vulkan.cpp |  29 ++-
+ ggml/src/mem_dxgi_pdh.cpp            | 297 +++++++++++++++++++++++++++
+ 4 files changed, 327 insertions(+), 3 deletions(-)
+ create mode 100644 ggml/src/mem_dxgi_pdh.cpp
+
+diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
+index aefe43bdd..21fe4640c 100644
+--- a/ggml/src/CMakeLists.txt
++++ b/ggml/src/CMakeLists.txt
+@@ -211,6 +211,7 @@ add_library(ggml-base
+             ggml-quants.h
+             mem_hip.cpp
+             mem_nvml.cpp
++            mem_dxgi_pdh.cpp
+             gguf.cpp)
+ 
+ target_include_directories(ggml-base PRIVATE .)
+diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
+index b63edd0c1..81cad8cf3 100644
+--- a/ggml/src/ggml-impl.h
++++ b/ggml/src/ggml-impl.h
+@@ -645,6 +645,9 @@ GGML_API void ggml_nvml_release();
+ GGML_API int ggml_hip_mgmt_init();
+ GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
+ GGML_API void ggml_hip_mgmt_release();
++GGML_API int ggml_dxgi_pdh_init();
++GGML_API int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu);
++GGML_API void ggml_dxgi_pdh_release();
+ 
+ #ifdef __cplusplus
+ }
+diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+index cc68e7968..27d6574da 100644
+--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+@@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
+ #define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME                        "VK_KHR_shader_bfloat16"
+ #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000)
+ #define VK_COMPONENT_TYPE_BFLOAT16_KHR                               ((VkComponentTypeKHR)1000141000)
++#define VK_LUID_SIZE_KHR                  VK_LUID_SIZE
+ 
+ typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
+     VkStructureType                       sType;
+@@ -12433,6 +12434,7 @@ struct ggml_backend_vk_device_context {
+     std::string pci_id;
+     std::string id;
+     std::string uuid;
++    std::string luid;
+     std::string numeric_id;
+     int major;
+     int minor;
+@@ -12449,8 +12451,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
+     vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
+     vk::PhysicalDeviceProperties2 props2;
+     vkdev.getProperties2(&props2);
++    GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: uuid %s\n", ctx->uuid.c_str());
++    GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: luid %s\n", ctx->luid.c_str());
++
++    // Check VRAM reporting for Windows IGPU/DGPU using DXGI + PDH (vendor agnostic)
++    if (ggml_dxgi_pdh_init() == 0) {
++        GGML_LOG_DEBUG("DXGI + PDH Initialized. Getting GPU free memory info\n");
++        int status = ggml_dxgi_pdh_get_device_memory(ctx->luid.c_str(), free, total, ctx->is_integrated_gpu);
++        if (status == 0) {
++            GGML_LOG_DEBUG("%s utilizing DXGI + PDH memory reporting free: %zu total: %zu\n", __func__, *free, *total);
++            ggml_dxgi_pdh_release();
++            return;
++        }
++        ggml_dxgi_pdh_release();
++    }
+ 
+-    if (!ctx->is_integrated_gpu)
++    if (!ctx->is_integrated_gpu) 
+     {
+         // Use vendor specific management libraries for best VRAM reporting if available
+         switch (props2.properties.vendorID) {
+@@ -12478,8 +12494,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
+             break;
+         }
+     }
+-    // else fallback to memory budget if supported
+ 
++    // else fallback to memory budget if supported
+     *total = 0;
+     *free = 0;
+     vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
+@@ -13091,7 +13107,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+                     /* .reg     = */ reg,
+                     /* .context = */ ctx,
+                 });
+-
+                 // Gather additional information about the device
+                 int dev_idx = vk_instance.device_indices[i];
+                 vk::PhysicalDeviceProperties props1;
+@@ -13114,6 +13129,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
+                     }
+                 }
+                 ctx->uuid = oss.str();
++                const auto& luid = device_id_props.deviceLUID;
++                char luid_str[32]; // "0x" + 16 hex digits + null terminator = 19 chars
++                snprintf(luid_str, sizeof(luid_str), // high part + low part
++                    "0x%02x%02x%02x%02x%02x%02x%02x%02x",
++                    luid[7], luid[6], luid[5], luid[4],
++                    luid[3], luid[2], luid[1], luid[0]
++                );
++                ctx->luid = std::string(luid_str);
+                 ctx->major = 0;
+                 ctx->minor = 0;
+                 // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
+diff --git a/ggml/src/mem_dxgi_pdh.cpp b/ggml/src/mem_dxgi_pdh.cpp
+new file mode 100644
+index 000000000..2f395761c
+--- /dev/null
++++ b/ggml/src/mem_dxgi_pdh.cpp
+@@ -0,0 +1,297 @@
++// DXGI and PDH Performance Counters Library
++// This Windows-only (10/11) library provides accurate VRAM reporting
++#include "ggml.h"
++#include "ggml-impl.h"
++
++#ifdef _WIN32
++#    define WIN32_LEAN_AND_MEAN
++#    ifndef NOMINMAX
++#        define NOMINMAX
++#    endif
++#include <windows.h>
++#include <pdh.h>
++#include <dxgi1_2.h>
++#include <sstream>
++#include <thread>
++#include <filesystem>
++#include <mutex>
++
++namespace fs = std::filesystem;
++
++static std::mutex ggml_dxgi_pdh_lock;
++
++/*
++Per-adapter record: totals are filled from DXGI_ADAPTER_DESC1, usages from PDH counters; the memory fields are doubles that the code treats as byte counts
++*/
++struct GpuInfo {
++    std::wstring description; // debug field (adapter description string from DXGI, used in log output)
++    LUID luid; // copied from DXGI_ADAPTER_DESC1::AdapterLuid; has no default initializer
++    std::wstring pdhInstance; // "luid_0x..._0x..." name spliced into PDH counter paths
++    double dedicatedTotal = 0.0; // from DXGI DedicatedVideoMemory
++    double sharedTotal = 0.0; // from DXGI SharedSystemMemory
++    double dedicatedUsage = 0.0; // from the PDH "Dedicated Usage" counter
++    double sharedUsage = 0.0; // from the PDH "Shared Usage" counter
++};
++
++/*
++DLL handles and function pointers resolved at runtime by ggml_dxgi_pdh_init(); all start as nullptr. NOTE(review): the pointer types carry no WINAPI/__stdcall qualifier - confirm whether 32-bit x86 builds matter
++*/
++struct {
++    void *dxgi_dll_handle; // module handle returned by LoadLibraryW for dxgi.dll
++    void *pdh_dll_handle; // module handle returned by LoadLibraryW for pdh.dll
++    // DXGI functions (resolved from dxgi.dll)
++    HRESULT (*CreateDXGIFactory1)(REFIID riid, void **ppFactory);
++    // PDH functions (resolved from pdh.dll)
++    PDH_STATUS (*PdhOpenQueryW)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery);
++    PDH_STATUS (*PdhAddCounterW)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter);
++    PDH_STATUS (*PdhCollectQueryData)(PDH_HQUERY hQuery);
++    PDH_STATUS (*PdhGetFormattedCounterValue)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue);
++    PDH_STATUS (*PdhCloseQuery)(PDH_HQUERY hQuery);
++} dll_functions {
++    nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr
++};
++
++/*
++Build the PDH instance-name prefix "luid_0xHHHHHHHH_0xLLLLLLLL" (each LUID half as 8 zero-padded uppercase hex digits). NOTE(review): std::setw/std::setfill live in <iomanip>, which this file does not include directly
++*/
++static std::wstring generate_pdh_instance_name_from_luid(const LUID& luid) {
++    std::wstringstream ss;
++    ss << L"luid_0x" << std::hex << std::setw(8) << std::setfill(L'0') << std::uppercase << luid.HighPart
++        << L"_0x" << std::setw(8) << std::setfill(L'0') << luid.LowPart;
++    return ss.str();
++}
++
++/*
++Convert a byte count to gigabytes using binary units (divides by 1024^3) and return it as a double
++*/
++template <typename T>
++static inline double b_to_gb(T n)
++{
++    return (double(n) / (1024.0 * 1024 * 1024));
++}
++
++/*
++Log the adapter's description, LUID and memory sizes, then (when info is non-null) copy DedicatedVideoMemory / SharedSystemMemory from the DXGI descriptor into info
++*/
++static void fetch_dxgi_adapter_desc1(const DXGI_ADAPTER_DESC1& desc, GpuInfo* info) {
++    auto dedicatedVideoMemory = desc.DedicatedVideoMemory;
++    auto sharedSystemMemory = desc.SharedSystemMemory;
++    GGML_LOG_DEBUG("[DXGI] Adapter Description: %ls, LUID: 0x%08X%08X, Dedicated: %.2f GB, Shared: %.2f GB\n", desc.Description, desc.AdapterLuid.HighPart, desc.AdapterLuid.LowPart, b_to_gb(dedicatedVideoMemory), b_to_gb(sharedSystemMemory));
++    if (info) {
++        info->dedicatedTotal = dedicatedVideoMemory; // values in bytes (stored as double)
++        info->sharedTotal = sharedSystemMemory;
++    }
++}
++
++/*
++Enumerate the adapters DXGI reports and return one GpuInfo (description, LUID, PDH instance name, memory totals) per adapter; the result is empty if the factory cannot be created
++*/
++static std::vector<GpuInfo> get_dxgi_gpu_infos() {
++    std::vector<GpuInfo> infos;
++    IDXGIFactory1* pFactory = nullptr;
++
++    if (SUCCEEDED(dll_functions.CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory))) {
++        UINT i = 0;
++        IDXGIAdapter1* pAdapter = nullptr;
++        // Stop on ANY failure, not only DXGI_ERROR_NOT_FOUND: after another error pAdapter is not a valid adapter and the loop would never end
++        while (SUCCEEDED(pFactory->EnumAdapters1(i, &pAdapter))) {
++            DXGI_ADAPTER_DESC1 desc = {};
++            if (SUCCEEDED(pAdapter->GetDesc1(&desc))) { // skip adapters whose descriptor cannot be read
++                GpuInfo info;
++                fetch_dxgi_adapter_desc1(desc, &info);
++                info.description = std::wstring(desc.Description);
++                info.luid = desc.AdapterLuid;
++                info.pdhInstance = generate_pdh_instance_name_from_luid(desc.AdapterLuid);
++                infos.push_back(info);
++            }
++
++            pAdapter->Release();
++            ++i;
++        }
++        pFactory->Release();
++    }
++    return infos;
++}
++
++// Sample the PDH "GPU Adapter Memory" counters for gpu.pdhInstance and store them in gpu.dedicatedUsage / gpu.sharedUsage.
++// Returns false (leaving both usage fields untouched) if the query cannot be opened, sampled or read.
++static bool get_gpu_memory_usage(GpuInfo& gpu) {
++    PDH_HQUERY query = NULL;
++    if (dll_functions.PdhOpenQueryW(NULL, 0, &query) != ERROR_SUCCESS) {
++        return false;
++    }
++
++    PDH_HCOUNTER dedicatedCounter = NULL;
++    PDH_HCOUNTER sharedCounter = NULL;
++    // The trailing '*' lets the path match whatever suffix follows the LUID prefix in the PDH instance name
++    std::wstring dedicatedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Dedicated Usage";
++    std::wstring sharedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Shared Usage";
++
++    if (dll_functions.PdhAddCounterW(query, dedicatedPath.c_str(), 0, &dedicatedCounter) != ERROR_SUCCESS ||
++        dll_functions.PdhAddCounterW(query, sharedPath.c_str(), 0, &sharedCounter) != ERROR_SUCCESS) {
++            GGML_LOG_ERROR("Failed to add PDH counters for GPU %s\n", std::string(gpu.pdhInstance.begin(), gpu.pdhInstance.end()).c_str());
++            dll_functions.PdhCloseQuery(query);
++            return false;
++    }
++
++    // Sample the data (a single sample is taken)
++    if (dll_functions.PdhCollectQueryData(query) != ERROR_SUCCESS) {
++            dll_functions.PdhCloseQuery(query);
++            return false;
++    }
++
++    // Read both values before committing either: a failed read must not be reported as success with "0 bytes in use"
++    PDH_FMT_COUNTERVALUE dedicatedVal = {};
++    PDH_FMT_COUNTERVALUE sharedVal = {};
++    const bool ok =
++        dll_functions.PdhGetFormattedCounterValue(dedicatedCounter, PDH_FMT_DOUBLE, NULL, &dedicatedVal) == ERROR_SUCCESS &&
++        dll_functions.PdhGetFormattedCounterValue(sharedCounter, PDH_FMT_DOUBLE, NULL, &sharedVal) == ERROR_SUCCESS;
++    if (ok) {
++        gpu.dedicatedUsage = dedicatedVal.doubleValue;
++        gpu.sharedUsage = sharedVal.doubleValue;
++    }
++
++    dll_functions.PdhCloseQuery(query);
++    return ok;
++}
++
++
++extern "C" {
++
++    int ggml_dxgi_pdh_init() {
++        GGML_LOG_DEBUG("%s called\n", __func__);
++        std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
++        if (dll_functions.dxgi_dll_handle != NULL && dll_functions.pdh_dll_handle != NULL) {
++            // Already initialized as we have both DLL handles
++            return ERROR_SUCCESS;
++        }
++
++        DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
++        SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
++        fs::path libPath_dxgi = fs::path("\\Windows") / fs::path("System32") / fs::path("dxgi.dll");
++        fs::path libPath_pdh = fs::path("\\Windows") / fs::path("System32") / fs::path("pdh.dll");
++
++        // Call LoadLibraryW on both DLLs to ensure they are loaded
++        void *dxgi = (void*)LoadLibraryW(libPath_dxgi.wstring().c_str());
++        void *pdh = (void*)LoadLibraryW(libPath_pdh.wstring().c_str());
++        if(dxgi == NULL || pdh == NULL) {
++            if (dxgi != NULL) {
++                FreeLibrary((HMODULE)(dxgi));
++            }
++            if (pdh != NULL) {
++                FreeLibrary((HMODULE)(pdh));
++            }
++            SetErrorMode(old_mode);
++            return ERROR_DLL_NOT_FOUND;
++        }
++        else {
++            // save the dll handles
++            dll_functions.dxgi_dll_handle = dxgi;
++            dll_functions.pdh_dll_handle = pdh;
++        }
++
++        // Get pointers to the library functions loaded by the DLLs
++        dll_functions.CreateDXGIFactory1 = (HRESULT (*)(REFIID riid, void **ppFactory)) GetProcAddress((HMODULE)(dll_functions.dxgi_dll_handle), "CreateDXGIFactory1");
++        dll_functions.PdhOpenQueryW = (PDH_STATUS (*)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhOpenQueryW");
++        dll_functions.PdhAddCounterW = (PDH_STATUS (*)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhAddCounterW");
++        dll_functions.PdhCollectQueryData = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCollectQueryData");
++        dll_functions.PdhGetFormattedCounterValue = (PDH_STATUS (*)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhGetFormattedCounterValue");
++        dll_functions.PdhCloseQuery = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCloseQuery");
++    
++        SetErrorMode(old_mode); // set old mode before any return
++
++        // Check if any function pointers are NULL (not found)
++        if (dll_functions.CreateDXGIFactory1 == NULL || dll_functions.PdhOpenQueryW == NULL || dll_functions.PdhAddCounterW == NULL || dll_functions.PdhCollectQueryData == NULL || dll_functions.PdhGetFormattedCounterValue == NULL || dll_functions.PdhCloseQuery == NULL) {
++            GGML_LOG_INFO("%s unable to locate required symbols in either dxgi.dll or pdh.dll", __func__);
++            FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
++            FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
++            dll_functions.dxgi_dll_handle = NULL;
++            dll_functions.pdh_dll_handle = NULL;
++            return ERROR_PROC_NOT_FOUND;
++        }
++    
++        // No other initializations needed, successfully loaded the libraries and functions!
++        return ERROR_SUCCESS;
++    }
++
++    void ggml_dxgi_pdh_release() { // Frees both DLL handles obtained by ggml_dxgi_pdh_init(); does nothing when neither handle is set
++        std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
++        if (dll_functions.dxgi_dll_handle == NULL && dll_functions.pdh_dll_handle == NULL) {
++            // Already freed the DLLs (or never initialized)
++            return;
++        }
++
++        // Call FreeLibrary on both DLLs (once each, regardless of how many times init returned success)
++        FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
++        FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
++
++        dll_functions.dxgi_dll_handle = NULL; // NULL handles make the next ggml_dxgi_pdh_init() reload the DLLs
++        dll_functions.pdh_dll_handle = NULL; // NOTE: the resolved function pointers in dll_functions are left unchanged
++
++        return; // successfully released
++    }
++
++    int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
++        // Fills *free / *total (bytes) for the adapter whose "0x%08x%08x" LUID string equals luid; returns ERROR_SUCCESS (0) or a Win32 error code
++        std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
++        if (luid == NULL || free == NULL || total == NULL) {
++            return ERROR_INVALID_PARAMETER;
++        }
++        if (dll_functions.dxgi_dll_handle == NULL || dll_functions.pdh_dll_handle == NULL) { // not initialized, or already released by another caller
++            return ERROR_DLL_NOT_FOUND;
++        }
++        // Enumerate GPUs using DXGI (this also fetches each GPU's total memory) and find the matching LUID
++        std::vector<GpuInfo> gpus = get_dxgi_gpu_infos();
++        GpuInfo *targetGpu = nullptr;
++        for (auto& gpu : gpus) {
++            char luid_buffer[32]; // "0x" + 16 hex digits + null terminator
++            snprintf(luid_buffer, sizeof(luid_buffer), "0x%08x%08x", gpu.luid.HighPart, gpu.luid.LowPart);
++            if (std::string(luid_buffer) == luid) {
++                targetGpu = &gpu;
++                break;
++            }
++        }
++        if (!targetGpu) {
++            GGML_LOG_ERROR("GPU with LUID %s not found.\n", luid);
++            return ERROR_NOT_FOUND;
++        }
++        // Get the current memory usage for the target GPU
++        if (!get_gpu_memory_usage(*targetGpu)) {
++            GGML_LOG_ERROR("Failed to get GPU memory usage.\n");
++            return ERROR_DEVICE_NOT_AVAILABLE;
++        }
++        // Clamp at zero: if reported usage ever exceeds the DXGI total, converting the negative double to size_t would be undefined
++        const double dedicatedFree = targetGpu->dedicatedTotal > targetGpu->dedicatedUsage ? targetGpu->dedicatedTotal - targetGpu->dedicatedUsage : 0.0;
++        const double sharedFree = targetGpu->sharedTotal > targetGpu->sharedUsage ? targetGpu->sharedTotal - targetGpu->sharedUsage : 0.0;
++        if (is_integrated_gpu) {
++            // IGPU free = shared free + dedicated free (some IGPUs also have dedicated memory, usable along with the shared memory)
++            GGML_LOG_DEBUG("Integrated GPU (%ls) with LUID %s detected. Shared Total: %.2f bytes (%.2f GB), Shared Usage: %.2f bytes (%.2f GB), Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->sharedTotal, b_to_gb(targetGpu->sharedTotal), targetGpu->sharedUsage, b_to_gb(targetGpu->sharedUsage), targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
++            *free = (size_t)(sharedFree + dedicatedFree);
++            *total = (size_t)(targetGpu->sharedTotal + targetGpu->dedicatedTotal);
++        } else {
++            // DGPU free = DedicatedTotal - DedicatedUsage
++            GGML_LOG_DEBUG("Discrete GPU (%ls) with LUID %s detected. Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
++            *free = (size_t)dedicatedFree;
++            *total = (size_t)targetGpu->dedicatedTotal;
++        }
++        return ERROR_SUCCESS;
++    }
++
++} // extern "C"
++
++#else // #ifdef _WIN32
++
++extern "C" {
++
++    // DXGI + PDH are Windows-only: on every non-Windows platform init and get_device_memory return -1 and release does nothing
++    int ggml_dxgi_pdh_init() {
++        return -1;
++    }
++    void ggml_dxgi_pdh_release() {}
++    int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
++        return -1;
++    }
++
++} // extern "C"
++
++#endif // #ifdef _WIN32
+\ No newline at end of file
diff --git a/ml/backend/ggml/ggml/src/CMakeLists.txt b/ml/backend/ggml/ggml/src/CMakeLists.txt
index aefe43bdd5..21fe4640c1 100644
--- a/ml/backend/ggml/ggml/src/CMakeLists.txt
+++ b/ml/backend/ggml/ggml/src/CMakeLists.txt
@@ -211,6 +211,7 @@ add_library(ggml-base
             ggml-quants.h
             mem_hip.cpp
             mem_nvml.cpp
+            mem_dxgi_pdh.cpp
             gguf.cpp)
 
 target_include_directories(ggml-base PRIVATE .)
diff --git a/ml/backend/ggml/ggml/src/ggml-impl.h b/ml/backend/ggml/ggml/src/ggml-impl.h
index b63edd0c14..81cad8cf33 100644
--- a/ml/backend/ggml/ggml/src/ggml-impl.h
+++ b/ml/backend/ggml/ggml/src/ggml-impl.h
@@ -645,6 +645,9 @@ GGML_API void ggml_nvml_release();
 GGML_API int ggml_hip_mgmt_init();
 GGML_API int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total);
 GGML_API void ggml_hip_mgmt_release();
+GGML_API int ggml_dxgi_pdh_init();
+GGML_API int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu);
+GGML_API void ggml_dxgi_pdh_release();
 
 #ifdef __cplusplus
 }
diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index cc68e79686..27d6574da6 100644
--- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 #define VK_KHR_SHADER_BFLOAT16_EXTENSION_NAME                        "VK_KHR_shader_bfloat16"
 #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR ((VkStructureType)1000141000)
 #define VK_COMPONENT_TYPE_BFLOAT16_KHR                               ((VkComponentTypeKHR)1000141000)
+#define VK_LUID_SIZE_KHR                  VK_LUID_SIZE
 
 typedef struct VkPhysicalDeviceShaderBfloat16FeaturesKHR {
     VkStructureType                       sType;
@@ -12433,6 +12434,7 @@ struct ggml_backend_vk_device_context {
     std::string pci_id;
     std::string id;
     std::string uuid;
+    std::string luid;
     std::string numeric_id;
     int major;
     int minor;
@@ -12449,8 +12451,22 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
     vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
     vk::PhysicalDeviceProperties2 props2;
     vkdev.getProperties2(&props2);
+    GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: uuid %s\n", ctx->uuid.c_str());
+    GGML_LOG_DEBUG("ggml_backend_vk_get_device_memory called: luid %s\n", ctx->luid.c_str());
 
-    if (!ctx->is_integrated_gpu)
+    // Check VRAM reporting for Windows IGPU/DGPU using DXGI + PDH (vendor agnostic)
+    if (ggml_dxgi_pdh_init() == 0) {
+        GGML_LOG_DEBUG("DXGI + PDH Initialized. Getting GPU free memory info\n");
+        int status = ggml_dxgi_pdh_get_device_memory(ctx->luid.c_str(), free, total, ctx->is_integrated_gpu);
+        if (status == 0) {
+            GGML_LOG_DEBUG("%s utilizing DXGI + PDH memory reporting free: %zu total: %zu\n", __func__, *free, *total);
+            ggml_dxgi_pdh_release();
+            return;
+        }
+        ggml_dxgi_pdh_release();
+    }
+
+    if (!ctx->is_integrated_gpu) 
     {
         // Use vendor specific management libraries for best VRAM reporting if available
         switch (props2.properties.vendorID) {
@@ -12478,8 +12494,8 @@ void ggml_backend_vk_get_device_memory(ggml_backend_vk_device_context *ctx, size
             break;
         }
     }
-    // else fallback to memory budget if supported
 
+    // else fallback to memory budget if supported
     *total = 0;
     *free = 0;
     vk::PhysicalDeviceMemoryBudgetPropertiesEXT mem_budget_props;
@@ -13091,7 +13107,6 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                     /* .reg     = */ reg,
                     /* .context = */ ctx,
                 });
-
                 // Gather additional information about the device
                 int dev_idx = vk_instance.device_indices[i];
                 vk::PhysicalDeviceProperties props1;
@@ -13114,6 +13129,14 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                     }
                 }
                 ctx->uuid = oss.str();
+                const auto& luid = device_id_props.deviceLUID;
+                char luid_str[32]; // "0x" + 16 hex digits + null terminator = 19 chars
+                snprintf(luid_str, sizeof(luid_str), // high part + low part
+                    "0x%02x%02x%02x%02x%02x%02x%02x%02x",
+                    luid[7], luid[6], luid[5], luid[4],
+                    luid[3], luid[2], luid[1], luid[0]
+                );
+                ctx->luid = std::string(luid_str);
                 ctx->major = 0;
                 ctx->minor = 0;
                 // TODO regex parse driver_props.driverInfo for a X.Y or X.Y.Z version string
diff --git a/ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp b/ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp
new file mode 100644
index 0000000000..2f395761c5
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/mem_dxgi_pdh.cpp
@@ -0,0 +1,297 @@
+// DXGI and PDH Performance Counters Library
+// This Windows-only (10/11) library provides accurate VRAM reporting
+#include "ggml.h"
+#include "ggml-impl.h"
+
+#ifdef _WIN32
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#include <windows.h>
+#include <pdh.h>
+#include <dxgi1_2.h>
+#include <sstream>
+#include <thread>
+#include <filesystem>
+#include <mutex>
+
+namespace fs = std::filesystem;
+
+static std::mutex ggml_dxgi_pdh_lock;
+
+/*
+Per-adapter record: totals are filled from DXGI_ADAPTER_DESC1, usages from PDH counters; the memory fields are doubles that the code treats as byte counts
+*/
+struct GpuInfo {
+    std::wstring description; // debug field (adapter description string from DXGI)
+    LUID luid; // copied from DXGI_ADAPTER_DESC1::AdapterLuid; has no default initializer
+    std::wstring pdhInstance; // "luid_0x..._0x..." name spliced into PDH counter paths
+    double dedicatedTotal = 0.0; // from DXGI DedicatedVideoMemory
+    double sharedTotal = 0.0; // from DXGI SharedSystemMemory
+    double dedicatedUsage = 0.0; // from the PDH "Dedicated Usage" counter
+    double sharedUsage = 0.0; // from the PDH "Shared Usage" counter
+};
+
+/*
+DLL handles and function pointers resolved at runtime by ggml_dxgi_pdh_init(); all start as nullptr. NOTE(review): the pointer types carry no WINAPI/__stdcall qualifier - confirm whether 32-bit x86 builds matter
+*/
+struct {
+    void *dxgi_dll_handle; // module handle returned by LoadLibraryW for dxgi.dll
+    void *pdh_dll_handle; // module handle returned by LoadLibraryW for pdh.dll
+    // DXGI functions (resolved from dxgi.dll)
+    HRESULT (*CreateDXGIFactory1)(REFIID riid, void **ppFactory);
+    // PDH functions (resolved from pdh.dll)
+    PDH_STATUS (*PdhOpenQueryW)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery);
+    PDH_STATUS (*PdhAddCounterW)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter);
+    PDH_STATUS (*PdhCollectQueryData)(PDH_HQUERY hQuery);
+    PDH_STATUS (*PdhGetFormattedCounterValue)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue);
+    PDH_STATUS (*PdhCloseQuery)(PDH_HQUERY hQuery);
+} dll_functions {
+    nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr
+};
+
+/*
+Build the PDH instance-name prefix "luid_0xHHHHHHHH_0xLLLLLLLL" (each LUID half as 8 zero-padded uppercase hex digits). NOTE(review): std::setw/std::setfill live in <iomanip>, which this file does not include directly
+*/
+static std::wstring generate_pdh_instance_name_from_luid(const LUID& luid) {
+    std::wstringstream ss;
+    ss << L"luid_0x" << std::hex << std::setw(8) << std::setfill(L'0') << std::uppercase << luid.HighPart
+        << L"_0x" << std::setw(8) << std::setfill(L'0') << luid.LowPart;
+    return ss.str();
+}
+
+/*
+Convert a byte count to gigabytes using binary units (divides by 1024^3) and return it as a double
+*/
+template <typename T>
+static inline double b_to_gb(T n)
+{
+    return (double(n) / (1024.0 * 1024 * 1024));
+}
+
+/*
+Log the adapter's description, LUID and memory sizes, then (when info is non-null) copy DedicatedVideoMemory / SharedSystemMemory from the DXGI descriptor into info
+*/
+static void fetch_dxgi_adapter_desc1(const DXGI_ADAPTER_DESC1& desc, GpuInfo* info) {
+    auto dedicatedVideoMemory = desc.DedicatedVideoMemory;
+    auto sharedSystemMemory = desc.SharedSystemMemory;
+    GGML_LOG_DEBUG("[DXGI] Adapter Description: %ls, LUID: 0x%08X%08X, Dedicated: %.2f GB, Shared: %.2f GB\n", desc.Description, desc.AdapterLuid.HighPart, desc.AdapterLuid.LowPart, b_to_gb(dedicatedVideoMemory), b_to_gb(sharedSystemMemory));
+    if (info) {
+        info->dedicatedTotal = dedicatedVideoMemory; // values in bytes (stored as double)
+        info->sharedTotal = sharedSystemMemory;
+    }
+}
+
+/*
+Enumerate the adapters DXGI reports and return one GpuInfo (description, LUID, PDH instance name, memory totals) per adapter; the result is empty if the factory cannot be created
+*/
+static std::vector<GpuInfo> get_dxgi_gpu_infos() {
+    std::vector<GpuInfo> infos;
+    IDXGIFactory1* pFactory = nullptr;
+
+    if (SUCCEEDED(dll_functions.CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&pFactory))) {
+        UINT i = 0;
+        IDXGIAdapter1* pAdapter = nullptr;
+        // Stop on ANY failure, not only DXGI_ERROR_NOT_FOUND: after another error pAdapter is not a valid adapter and the loop would never end
+        while (SUCCEEDED(pFactory->EnumAdapters1(i, &pAdapter))) {
+            DXGI_ADAPTER_DESC1 desc = {};
+            if (SUCCEEDED(pAdapter->GetDesc1(&desc))) { // skip adapters whose descriptor cannot be read
+                GpuInfo info;
+                fetch_dxgi_adapter_desc1(desc, &info);
+                info.description = std::wstring(desc.Description);
+                info.luid = desc.AdapterLuid;
+                info.pdhInstance = generate_pdh_instance_name_from_luid(desc.AdapterLuid);
+                infos.push_back(info);
+            }
+
+            pAdapter->Release();
+            ++i;
+        }
+        pFactory->Release();
+    }
+    return infos;
+}
+
+// Sample the PDH "GPU Adapter Memory" counters for gpu.pdhInstance and store them in gpu.dedicatedUsage / gpu.sharedUsage.
+// Returns false (leaving both usage fields untouched) if the query cannot be opened, sampled or read.
+static bool get_gpu_memory_usage(GpuInfo& gpu) {
+    PDH_HQUERY query = NULL;
+    if (dll_functions.PdhOpenQueryW(NULL, 0, &query) != ERROR_SUCCESS) {
+        return false;
+    }
+
+    PDH_HCOUNTER dedicatedCounter = NULL;
+    PDH_HCOUNTER sharedCounter = NULL;
+    // The trailing '*' lets the path match whatever suffix follows the LUID prefix in the PDH instance name
+    std::wstring dedicatedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Dedicated Usage";
+    std::wstring sharedPath = L"\\GPU Adapter Memory(" + gpu.pdhInstance + L"*)\\Shared Usage";
+
+    if (dll_functions.PdhAddCounterW(query, dedicatedPath.c_str(), 0, &dedicatedCounter) != ERROR_SUCCESS ||
+        dll_functions.PdhAddCounterW(query, sharedPath.c_str(), 0, &sharedCounter) != ERROR_SUCCESS) {
+            GGML_LOG_ERROR("Failed to add PDH counters for GPU %s\n", std::string(gpu.pdhInstance.begin(), gpu.pdhInstance.end()).c_str());
+            dll_functions.PdhCloseQuery(query);
+            return false;
+    }
+
+    // Sample the data (a single sample is taken)
+    if (dll_functions.PdhCollectQueryData(query) != ERROR_SUCCESS) {
+            dll_functions.PdhCloseQuery(query);
+            return false;
+    }
+
+    // Read both values before committing either: a failed read must not be reported as success with "0 bytes in use"
+    PDH_FMT_COUNTERVALUE dedicatedVal = {};
+    PDH_FMT_COUNTERVALUE sharedVal = {};
+    const bool ok =
+        dll_functions.PdhGetFormattedCounterValue(dedicatedCounter, PDH_FMT_DOUBLE, NULL, &dedicatedVal) == ERROR_SUCCESS &&
+        dll_functions.PdhGetFormattedCounterValue(sharedCounter, PDH_FMT_DOUBLE, NULL, &sharedVal) == ERROR_SUCCESS;
+    if (ok) {
+        gpu.dedicatedUsage = dedicatedVal.doubleValue;
+        gpu.sharedUsage = sharedVal.doubleValue;
+    }
+
+    dll_functions.PdhCloseQuery(query);
+    return ok;
+}
+
+
+extern "C" {
+
+    // Loads dxgi.dll and pdh.dll and resolves the entry points kept in dll_functions.
+    // Returns ERROR_SUCCESS (also when both DLLs are already loaded), ERROR_DLL_NOT_FOUND if either
+    // DLL fails to load, or ERROR_PROC_NOT_FOUND if any required symbol is missing.
+    int ggml_dxgi_pdh_init() {
+        GGML_LOG_DEBUG("%s called\n", __func__);
+        std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
+        if (dll_functions.dxgi_dll_handle != NULL && dll_functions.pdh_dll_handle != NULL) {
+            // Already initialized as we have both DLL handles
+            return ERROR_SUCCESS;
+        }
+
+        // Suppress critical-error dialogs while loading; the previous mode is restored before returning
+        DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+        SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+        // Ask Windows for the real system directory: a rooted "\Windows\System32" is relative to the current
+        // drive and breaks on other volumes or non-default installs. The old path is kept as a fallback.
+        wchar_t sys_dir[MAX_PATH];
+        const UINT sys_len = GetSystemDirectoryW(sys_dir, MAX_PATH);
+        const fs::path lib_dir = (sys_len > 0 && sys_len < MAX_PATH) ? fs::path(sys_dir) : fs::path("\\Windows") / fs::path("System32");
+        const fs::path libPath_dxgi = lib_dir / fs::path("dxgi.dll");
+        const fs::path libPath_pdh = lib_dir / fs::path("pdh.dll");
+
+        // Call LoadLibraryW on both DLLs to ensure they are loaded
+        void *dxgi = (void*)LoadLibraryW(libPath_dxgi.wstring().c_str());
+        void *pdh = (void*)LoadLibraryW(libPath_pdh.wstring().c_str());
+        if (dxgi == NULL || pdh == NULL) {
+            // Release whichever DLL did load so a failed init leaves nothing behind
+            if (dxgi != NULL) { FreeLibrary((HMODULE)(dxgi)); }
+            if (pdh != NULL) { FreeLibrary((HMODULE)(pdh)); }
+            SetErrorMode(old_mode);
+            return ERROR_DLL_NOT_FOUND;
+        }
+        dll_functions.dxgi_dll_handle = dxgi; // save the dll handles
+        dll_functions.pdh_dll_handle = pdh;
+
+        // Get pointers to the library functions loaded by the DLLs
+        dll_functions.CreateDXGIFactory1 = (HRESULT (*)(REFIID riid, void **ppFactory)) GetProcAddress((HMODULE)(dll_functions.dxgi_dll_handle), "CreateDXGIFactory1");
+        dll_functions.PdhOpenQueryW = (PDH_STATUS (*)(LPCWSTR szDataSource, DWORD_PTR dwUserData, PDH_HQUERY *phQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhOpenQueryW");
+        dll_functions.PdhAddCounterW = (PDH_STATUS (*)(PDH_HQUERY hQuery, LPCWSTR szFullCounterPath, DWORD_PTR dwUserData, PDH_HCOUNTER *phCounter)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhAddCounterW");
+        dll_functions.PdhCollectQueryData = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCollectQueryData");
+        dll_functions.PdhGetFormattedCounterValue = (PDH_STATUS (*)(PDH_HCOUNTER hCounter, DWORD dwFormat, LPDWORD lpdwType, PPDH_FMT_COUNTERVALUE pValue)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhGetFormattedCounterValue");
+        dll_functions.PdhCloseQuery = (PDH_STATUS (*)(PDH_HQUERY hQuery)) GetProcAddress((HMODULE)(dll_functions.pdh_dll_handle), "PdhCloseQuery");
+        SetErrorMode(old_mode); // set old mode before any return
+
+        // Check if any function pointers are NULL (not found); if so unload both DLLs and clear the handles
+        if (dll_functions.CreateDXGIFactory1 == NULL || dll_functions.PdhOpenQueryW == NULL || dll_functions.PdhAddCounterW == NULL || dll_functions.PdhCollectQueryData == NULL || dll_functions.PdhGetFormattedCounterValue == NULL || dll_functions.PdhCloseQuery == NULL) {
+            GGML_LOG_INFO("%s unable to locate required symbols in either dxgi.dll or pdh.dll\n", __func__);
+            FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
+            FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
+            dll_functions.dxgi_dll_handle = NULL;
+            dll_functions.pdh_dll_handle = NULL;
+            return ERROR_PROC_NOT_FOUND;
+        }
+        // No other initializations needed, successfully loaded the libraries and functions!
+        return ERROR_SUCCESS;
+    }
+
+    // Unloads dxgi.dll and pdh.dll and clears the stored handles.
+    // Does nothing when both handles are already NULL, so repeated calls are harmless.
+    void ggml_dxgi_pdh_release() {
+        std::lock_guard<std::mutex> guard(ggml_dxgi_pdh_lock);
+
+        const bool anything_loaded = dll_functions.dxgi_dll_handle != NULL || dll_functions.pdh_dll_handle != NULL;
+        if (anything_loaded) {
+            // Drop our reference to each DLL
+            FreeLibrary((HMODULE)(dll_functions.dxgi_dll_handle));
+            FreeLibrary((HMODULE)(dll_functions.pdh_dll_handle));
+
+            // Forget the handles so a later init reloads the libraries
+            dll_functions.dxgi_dll_handle = NULL;
+            dll_functions.pdh_dll_handle = NULL;
+        }
+    }
+
+    // Reports free and total memory, in bytes, for the DXGI adapter whose LUID formatted as "0x%08x%08x"
+    // (HighPart, LowPart) equals `luid`. Integrated GPUs count shared plus dedicated memory; discrete GPUs
+    // count dedicated memory only. Returns ERROR_SUCCESS, ERROR_INVALID_PARAMETER for a null argument,
+    // ERROR_NOT_FOUND when no adapter matches, or ERROR_DEVICE_NOT_AVAILABLE when usage cannot be read.
+    int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
+        if (luid == nullptr || free == nullptr || total == nullptr) { return ERROR_INVALID_PARAMETER; } // avoid UB on null arguments
+        std::lock_guard<std::mutex> lock(ggml_dxgi_pdh_lock);
+
+        // Enumerate GPUs using DXGI (this also fetches their total memory) and find the matching LUID
+        std::vector<GpuInfo> gpus = get_dxgi_gpu_infos();
+        GpuInfo *targetGpu = nullptr;
+        for (auto& gpu : gpus) {
+            char luid_buffer[32]; // "0x" + 16 hex digits + null terminator
+            snprintf(luid_buffer, sizeof(luid_buffer), "0x%08x%08x", (unsigned int)gpu.luid.HighPart, (unsigned int)gpu.luid.LowPart); // casts match %x; HighPart is signed
+            if (std::string(luid_buffer) == luid) {
+                targetGpu = &gpu;
+                break;
+            }
+        }
+        if (!targetGpu) {
+            GGML_LOG_ERROR("GPU with LUID %s not found.\n", luid);
+            return ERROR_NOT_FOUND;
+        }
+
+        // Get the current memory usage for the target GPU
+        if (!get_gpu_memory_usage(*targetGpu)) {
+            GGML_LOG_ERROR("Failed to get GPU memory usage.\n");
+            return ERROR_DEVICE_NOT_AVAILABLE;
+        }
+
+        // Clamp at zero: usage above the reported total goes negative, and converting a negative value to size_t is undefined
+        const auto to_bytes = [](double value) { return value > 0.0 ? static_cast<size_t>(value) : static_cast<size_t>(0); };
+        if (is_integrated_gpu) {
+            // IGPU free = (SharedTotal - SharedUsage) + (DedicatedTotal - DedicatedUsage); some IGPUs also have dedicated memory
+            GGML_LOG_DEBUG("Integrated GPU (%ls) with LUID %s detected. Shared Total: %.2f bytes (%.2f GB), Shared Usage: %.2f bytes (%.2f GB), Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->sharedTotal, b_to_gb(targetGpu->sharedTotal), targetGpu->sharedUsage, b_to_gb(targetGpu->sharedUsage), targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
+            *free = to_bytes((targetGpu->sharedTotal - targetGpu->sharedUsage) + (targetGpu->dedicatedTotal - targetGpu->dedicatedUsage));
+            *total = to_bytes(targetGpu->sharedTotal + targetGpu->dedicatedTotal);
+        } else {
+            // DGPU free = DedicatedTotal - DedicatedUsage
+            GGML_LOG_DEBUG("Discrete GPU (%ls) with LUID %s detected. Dedicated Total: %.2f bytes (%.2f GB), Dedicated Usage: %.2f bytes (%.2f GB)\n", targetGpu->description.c_str(), luid, targetGpu->dedicatedTotal, b_to_gb(targetGpu->dedicatedTotal), targetGpu->dedicatedUsage, b_to_gb(targetGpu->dedicatedUsage));
+            *free = to_bytes(targetGpu->dedicatedTotal - targetGpu->dedicatedUsage);
+            *total = to_bytes(targetGpu->dedicatedTotal);
+        }
+        return ERROR_SUCCESS;
+    }
+
+} // extern "C"
+
+#else // #ifdef _WIN32
+
+extern "C" {
+
+    // DXGI + PDH are Windows-only; every non-Windows build gets these stubs, which report failure
+    int ggml_dxgi_pdh_init() {
+        return -1; // always fails: there is nothing to load on this platform
+    }
+    void ggml_dxgi_pdh_release() {} // no-op: the stub init never acquires anything
+    int ggml_dxgi_pdh_get_device_memory(const char* luid, size_t *free, size_t *total, bool is_integrated_gpu) {
+        return -1; // always fails; *free and *total are left untouched
+    }
+
+} // extern "C"
+
+#endif // #ifdef _WIN32
\ No newline at end of file