DirectMLNpuInference fails to run on the ARM64 NPU #640

Open

xiaoweiChen opened this issue Aug 28, 2024 · 11 comments

@xiaoweiChen

This is based on the DirectMLNpuInference sample and #625 (updating the SDK to the Windows 11 SDK, 10.0.26100.0).

With that change, DirectML runs on the NPU of an Intel Lunar Lake client platform: NPU usage is above 0% in Windows Task Manager while the program runs.

However, when I run the same sample on my Windows ARM64 machine, the program reports "No NPU device found".

Does anyone know the reason?
Does DirectML not support the NPU on the ARM64 platform?

My device info:
CPU: Snapdragon(R) X 12-core X1E80100 @ 3.40 GHz
GPU: Snapdragon(R) X Elite - X1E80100 - Qualcomm(R) Adreno(TM) GPU
NPU: Snapdragon(R) X Elite - X1E80100 - Qualcomm(R) Hexagon(TM) NPU

My test code:

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "pch.h"

#include <dxcore_interface.h>
#include <dxcore.h>

#include "onnxruntime_cxx_api.h"
#include "dml_provider_factory.h"

#include "TensorHelper.h"

using Microsoft::WRL::ComPtr;

void InitializeDirectML(ID3D12Device1** d3dDeviceOut, ID3D12CommandQueue** commandQueueOut, IDMLDevice** dmlDeviceOut) {
    // Whether to skip adapters which support Graphics in order to target NPU for testing
    bool forceComputeOnlyDevice = true;
    bool forceGenericMLDevice = false;
    
    ComPtr<IDXCoreAdapterFactory> factory;
    HMODULE dxCoreModule = LoadLibraryW(L"DXCore.dll");
    if (dxCoreModule)
    {
        auto dxcoreCreateAdapterFactory = reinterpret_cast<HRESULT(WINAPI*)(REFIID, void**)>(
            GetProcAddress(dxCoreModule, "DXCoreCreateAdapterFactory")
            );
        if (dxcoreCreateAdapterFactory)
        {
            dxcoreCreateAdapterFactory(IID_PPV_ARGS(&factory));
        }
    }
    // Create the DXCore Adapter
    ComPtr<IDXCoreAdapter> adapter;
    if (factory)
    {
#if 1
        const GUID dxGUIDs[] = { 
            DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE,
            DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU
        };
        ComPtr<IDXCoreAdapterList> adapterList;
        THROW_IF_FAILED(factory->CreateAdapterList(ARRAYSIZE(dxGUIDs), dxGUIDs, IID_PPV_ARGS(&adapterList)));
        for (uint32_t i = 0, adapterCount = adapterList->GetAdapterCount(); i < adapterCount; i++)
        {
            ComPtr<IDXCoreAdapter> currentGpuAdapter;
            THROW_IF_FAILED(adapterList->GetAdapter(static_cast<uint32_t>(i), IID_PPV_ARGS(&currentGpuAdapter)));

            if (!forceComputeOnlyDevice && !forceGenericMLDevice)
            {
                // No device restrictions
                adapter = std::move(currentGpuAdapter);
                break;
            }
            else if (forceComputeOnlyDevice && currentGpuAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE))
            {
                adapter = std::move(currentGpuAdapter);
                break;
            }
            else if (forceGenericMLDevice && currentGpuAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_GENERIC_ML))
            {
                adapter = std::move(currentGpuAdapter);
                break;
            }
        }
#else
        const GUID dxGUIDs[] = { DXCORE_ADAPTER_ATTRIBUTE_D3D12_GENERIC_ML };
        ComPtr<IDXCoreAdapterList> adapterList;
        THROW_IF_FAILED(factory->CreateAdapterList(ARRAYSIZE(dxGUIDs), dxGUIDs, IID_PPV_ARGS(&adapterList)));
        for (uint32_t i = 0, adapterCount = adapterList->GetAdapterCount(); i < adapterCount; i++)
        {
            ComPtr<IDXCoreAdapter> nextGpuAdapter;
            THROW_IF_FAILED(adapterList->GetAdapter(static_cast<uint32_t>(i), IID_PPV_ARGS(&nextGpuAdapter)));
            if (nextGpuAdapter->IsAttributeSupported(DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU))
            {
                adapter = std::move(nextGpuAdapter);
                break;
            }
        }
#endif
    }
    // Create the D3D12 Device
    ComPtr<ID3D12Device1> d3dDevice;
    if (adapter)
    {
        HMODULE d3d12Module = LoadLibraryW(L"d3d12.dll");
        if (d3d12Module)
        {
            auto d3d12CreateDevice = reinterpret_cast<HRESULT(WINAPI*)(IUnknown*, D3D_FEATURE_LEVEL, REFIID, void**)>(
                GetProcAddress(d3d12Module, "D3D12CreateDevice")
                );
            if (d3d12CreateDevice)
            {
                THROW_IF_FAILED(d3d12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_1_0_CORE, IID_PPV_ARGS(&d3dDevice)));
            }
        }
    }
    // Create the DML Device and D3D12 Command Queue
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    if (d3dDevice)
    {
        D3D12_COMMAND_QUEUE_DESC queueDesc = {};
        queueDesc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
        THROW_IF_FAILED(d3dDevice->CreateCommandQueue(
            &queueDesc,
            IID_PPV_ARGS(commandQueue.ReleaseAndGetAddressOf())));
        HMODULE dmlModule = LoadLibraryW(L"DirectML.dll");
        if (dmlModule)
        {
            auto dmlCreateDevice = reinterpret_cast<HRESULT(WINAPI*)(ID3D12Device*, DML_CREATE_DEVICE_FLAGS, DML_FEATURE_LEVEL, REFIID, void**)>(
                GetProcAddress(dmlModule, "DMLCreateDevice1")
                );
            if (dmlCreateDevice)
            {
                THROW_IF_FAILED(dmlCreateDevice(d3dDevice.Get(), DML_CREATE_DEVICE_FLAG_NONE, DML_FEATURE_LEVEL_5_0, IID_PPV_ARGS(dmlDevice.ReleaseAndGetAddressOf())));
            }
        }
    }

    d3dDevice.CopyTo(d3dDeviceOut);
    commandQueue.CopyTo(commandQueueOut);
    dmlDevice.CopyTo(dmlDeviceOut);
}

int main()
{
    ComPtr<ID3D12Device1> d3dDevice;
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    InitializeDirectML(d3dDevice.GetAddressOf(), commandQueue.GetAddressOf(), dmlDevice.GetAddressOf());

    // Add the DML execution provider to ORT using the DML Device and D3D12 Command Queue created above.
    if (!dmlDevice)
    {
        printf("No NPU device found\n");
        return 1;
    }

    const OrtApi& ortApi = Ort::GetApi();
    static Ort::Env s_OrtEnv{ nullptr };
    s_OrtEnv = Ort::Env(Ort::ThreadingOptions{});
    s_OrtEnv.DisableTelemetryEvents();

    auto sessionOptions = Ort::SessionOptions{};
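    // The DML execution provider does not support memory patterns or parallel
    // graph execution, so both are disabled on the session options below.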
    sessionOptions.DisableMemPattern();
    sessionOptions.DisablePerSessionThreads();
    sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
    const OrtDmlApi* ortDmlApi = nullptr;
    Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));
    Ort::ThrowOnError(ortDmlApi->SessionOptionsAppendExecutionProvider_DML1(sessionOptions, dmlDevice.Get(), commandQueue.Get()));

    // Create the session
    auto session = Ort::Session(s_OrtEnv, L"mobilenetv2-7-fp16.onnx", sessionOptions);
    const char* inputName = "input";
    const char* outputName = "output";

    // Create input tensor
    Ort::TypeInfo type_info = session.GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    auto input = CreateDmlValue(tensor_info, commandQueue.Get());
    auto inputTensor = std::move(input.first);
    
    const auto memoryInfo = inputTensor.GetTensorMemoryInfo();
    Ort::Allocator allocator(session, memoryInfo);
    
    // Get the inputResource and populate!
    ComPtr<ID3D12Resource> inputResource;
    Ort::ThrowOnError(ortDmlApi->GetD3D12ResourceFromAllocation(allocator, inputTensor.GetTensorMutableData<void*>(), &inputResource));

    // Create output tensor
    type_info = session.GetOutputTypeInfo(0);
    tensor_info = type_info.GetTensorTypeAndShapeInfo();
    auto output = CreateDmlValue(tensor_info, commandQueue.Get());
    auto outputTensor = std::move(output.first);

    // Run warmup
    session.Run(Ort::RunOptions{ nullptr }, &inputName, &inputTensor, 1, &outputName, &outputTensor, 1);

    // Queue fence, and wait for completion
    ComPtr<ID3D12Fence> fence;
    THROW_IF_FAILED(d3dDevice->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(fence.GetAddressOf())));
    THROW_IF_FAILED(commandQueue->Signal(fence.Get(), 1));

    wil::unique_handle fenceEvent(CreateEvent(nullptr, FALSE, FALSE, nullptr));
    THROW_IF_FAILED(fence->SetEventOnCompletion(1, fenceEvent.get()));
    THROW_HR_IF(E_FAIL, WaitForSingleObject(fenceEvent.get(), INFINITE) != WAIT_OBJECT_0);

    // Record start
    auto start = std::chrono::high_resolution_clock::now();

    // Run performance test
    constexpr int fenceValueStart = 2;
    constexpr int numIterations = 100;
    for (int i = fenceValueStart; i < (numIterations + fenceValueStart); i++)
    {
        session.Run(Ort::RunOptions{ nullptr }, &inputName, &inputTensor, 1, &outputName, &outputTensor, 1);

        {
            // Synchronize with CPU before queuing more inference runs
            THROW_IF_FAILED(commandQueue->Signal(fence.Get(), i));
            THROW_HR_IF(E_FAIL, ResetEvent(fenceEvent.get()) == 0);
            THROW_IF_FAILED(fence->SetEventOnCompletion(i, fenceEvent.get()));
            THROW_HR_IF(E_FAIL, WaitForSingleObject(fenceEvent.get(), INFINITE) != WAIT_OBJECT_0);
        }
    }

    // Record end and calculate duration
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::micro> duration = end - start;
    printf("Evaluate Took: %fus\n", float(duration.count())/100);

    // Read results
    ComPtr<ID3D12Resource> outputResource;
    Ort::ThrowOnError(ortDmlApi->GetD3D12ResourceFromAllocation(allocator, outputTensor.GetTensorMutableData<void*>(), &outputResource));

    return 0;
}
xiaoweiChen changed the title from "DirectMLNpuInference fails to run on the ARM NPU" to "DirectMLNpuInference fails to run on the ARM64 NPU" on Aug 28, 2024
@xiaoweiChen
Author

Also, could you add a build script (such as a CMakeLists.txt) to the sample project?

@xiaoweiChen
Author

xiaoweiChen commented Sep 2, 2024

Based on this blog, the DirectML version needs to be 1.15.2 and onnxruntime needs to be 1.18.

However, the native demo uses onnxruntime 1.17:

https://github.com/microsoft/DirectML/blob/master/Samples/DirectMLNpuInference/packages.config

<?xml version="1.0" encoding="utf-8"?>
<packages>
  <package id="Microsoft.AI.DirectML" version="1.15.2" targetFramework="native" />
  <package id="Microsoft.AI.MachineLearning" version="1.17.0" targetFramework="native" />
  <package id="Microsoft.Windows.ImplementationLibrary" version="1.0.220914.1" targetFramework="native" />
</packages>
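
If it helps others compare setups, here is a minimal sketch (not part of the sample; ReportDirectMLVersion is a made-up helper name) that prints which DirectML.dll the process actually loaded and its file version, so a stale system copy is not silently used instead of the NuGet one. It assumes linking against version.lib:

// Sketch only: report the DirectML.dll that this process loaded and its file
// version. Assumes version.lib is linked (the #pragma below handles MSVC).
#include <windows.h>
#include <cstdio>
#include <vector>
#pragma comment(lib, "version.lib")

void ReportDirectMLVersion()
{
    HMODULE mod = GetModuleHandleW(L"DirectML.dll");
    if (!mod)
    {
        printf("DirectML.dll is not loaded in this process\n");
        return;
    }

    wchar_t path[MAX_PATH] = {};
    GetModuleFileNameW(mod, path, MAX_PATH);
    wprintf(L"DirectML.dll loaded from: %ls\n", path);

    DWORD handle = 0;
    DWORD size = GetFileVersionInfoSizeW(path, &handle);
    if (size == 0)
        return;

    std::vector<BYTE> data(size);
    if (!GetFileVersionInfoW(path, 0, size, data.data()))
        return;

    // The root block of the version resource holds the fixed file version.
    VS_FIXEDFILEINFO* info = nullptr;
    UINT len = 0;
    if (VerQueryValueW(data.data(), L"\\", reinterpret_cast<void**>(&info), &len) && info)
    {
        printf("DirectML.dll file version: %u.%u.%u.%u\n",
            HIWORD(info->dwFileVersionMS), LOWORD(info->dwFileVersionMS),
            HIWORD(info->dwFileVersionLS), LOWORD(info->dwFileVersionLS));
    }
}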

@mrsabhar

mrsabhar commented Sep 3, 2024

I'm seeing a similar issue on an ASUS X Elite. I updated the drivers based on the blog, but WebNN reports that the driver is not installed:
[10:05:43] [Config] Demo config updated · resnet-50 · webnn · npu
[10:05:43] [Error] UnknownError: Failed to execute 'createContext' on 'ML': DirectML: Failed to create a WebNN context.
[10:05:43] [Error] Your device probably doesn't have an AI processor (NPU) or the NPU driver is not successfully installed

@fobrs

fobrs commented Oct 19, 2024

It compiles and runs OK now! (I can compile and run this sample on a Snapdragon Dev Kit, but it only uses the GPU and not the NPU. What's wrong?)

@ashumish-QCOM

Hi @xiaoweiChen

The Snapdragon X Elite NPU should be supported. However, there might be an issue with the DirectML setup on ARM64 platforms. Ensuring that you have the correct drivers and that your environment paths are set correctly is crucial.

If the NPU is not being detected, please ensure that all necessary libraries, such as libQnnHtp.so, libQnnHtpV73Stub.so, libQnnHtpV73Skel.so, and others, are correctly placed in the appropriate directories. Additionally, updating to the latest versions of DirectML and ONNX Runtime as specified in the documentation might resolve some compatibility issues.

Thank you

@xiaoweiChen
Author

Hi @ashumish-QCOM

Thanks for your reply!

My NPU driver version is 30.0.31.50, updated on 2024/5/20.

Before reading your comment, I assumed that if Windows on ARM with a Qualcomm chip has working drivers (e.g. the GPU and NPU show up in Windows Task Manager), then the necessary libraries (whether static or shared) would already be installed correctly for developers and users.

I have since read the onnxruntime-qnn page and downloaded the Qualcomm® AI Engine Direct SDK from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct. After installing it, I can see the libraries you mentioned.

Now the question is: what are the "appropriate directories" to place them in? Could you give me an example, or a link to the documentation? Please also let us know which versions of DirectML and ONNX Runtime work well with the Snapdragon NPU.

@ashumish-QCOM

Hi @xiaoweiChen

Thank you for providing the detailed information. Here are a few steps to help troubleshoot and potentially resolve the issue:

  1. Driver and SDK compatibility:
     • Verify that you are using the correct versions of DirectML and ONNX Runtime. Based on the documentation, DirectML version 1.15.2 and ONNX Runtime version 1.18 are recommended: https://blogs.windows.com/windowsdeveloper/2024/08/29/directml-expands-npu-support-to-copilot-pcs-and-webnn/

  2. Library placement:
     • Make sure that the necessary libraries (libQnnHtp.so, libQnnHtpV73Stub.so, libQnnHtpV73Skel.so, etc.) are correctly placed in the directories specified by your development environment. These libraries should typically be in the same directory as your executable or in a directory included in your system's library path.

  3. Environment variables:
     • Verify that your environment variables are set correctly. You might need to add the library paths to the LD_LIBRARY_PATH or equivalent environment variable on your system.

  4. Adapter selection:
     • In your code, ensure that the adapter selection logic correctly identifies and selects the NPU. The DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU attribute should be used to filter and select the NPU adapter.

  5. Debugging and logging:
     • Add additional logging to your code to verify that the adapter enumeration and selection process works as expected (see the sketch after this list). This can help identify whether the NPU is detected but not selected correctly.

  6. Sample code adjustments:
     • Ensure that the sample code is correctly configured to target the NPU. You might need to adjust the forceComputeOnlyDevice and forceGenericMLDevice flags based on your specific requirements.
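
To illustrate point 5, here is a rough, stand-alone sketch (not part of the sample) that enumerates every D3D12 core-compute adapter DXCore reports and logs whether it carries the NPU hardware-type attribute. It links dxcore.lib directly for brevity, whereas the sample loads DXCore.dll dynamically, and it assumes the Windows 11 SDK (10.0.26100.0) headers that define DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU:

// Sketch only: log each core-compute adapter and whether DXCore flags it as an NPU.
#include <windows.h>
#include <dxcore_interface.h>
#include <dxcore.h>
#include <wrl/client.h>
#include <cstdio>
#include <vector>

using Microsoft::WRL::ComPtr;

void LogDxCoreAdapters()
{
    ComPtr<IDXCoreAdapterFactory> factory;
    if (FAILED(DXCoreCreateAdapterFactory(IID_PPV_ARGS(&factory)))) return;

    // Filter only on core compute so that GPUs and NPUs both appear in the list.
    const GUID filter[] = { DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE };
    ComPtr<IDXCoreAdapterList> list;
    if (FAILED(factory->CreateAdapterList(ARRAYSIZE(filter), filter, IID_PPV_ARGS(&list)))) return;

    for (uint32_t i = 0; i < list->GetAdapterCount(); ++i)
    {
        ComPtr<IDXCoreAdapter> adapter;
        if (FAILED(list->GetAdapter(i, IID_PPV_ARGS(&adapter)))) continue;

        // Read the driver description (a narrow string) for logging.
        std::vector<char> desc(1, '\0');
        size_t descSize = 0;
        if (SUCCEEDED(adapter->GetPropertySize(DXCoreAdapterProperty::DriverDescription, &descSize)) && descSize > 0)
        {
            desc.resize(descSize);
            adapter->GetProperty(DXCoreAdapterProperty::DriverDescription, descSize, desc.data());
        }

        const bool isNpu = adapter->IsAttributeSupported(DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU);
        printf("Adapter %u: %s (NPU attribute: %s)\n", i, desc.data(), isNpu ? "yes" : "no");
    }
}

If the NPU shows up here with the attribute reported as "yes" but the sample still falls back to the GPU, the problem is likely in the selection flags rather than in driver installation.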

@fobrs

fobrs commented Oct 28, 2024

Without doing all of the steps above, this DirectML sample runs OK on the Snapdragon Dev Kit. See the image below.

Could you please investigate the issue with the other sample, DirectML_ESRGAN?

Image

@xiaoweiChen
Author

xiaoweiChen commented Oct 29, 2024

Without doing all of the steps above, this DirectML sample runs OK on the Snapdragon Dev Kit. See the image below.

Could you please investigate the issue with the other sample, DirectML_ESRGAN?

Image

Hi @fobrs, could you share the NPU driver, DirectML, and ONNX Runtime versions with us? I want the program to find the NPU and show non-zero NPU usage in Windows Task Manager.

@xiaoweiChen
Author

Hi @ashumish-QCOM,

Thank you for the steps. I will try them on my side. If there is any progress, I will update here.

@fobrs

fobrs commented Oct 29, 2024

Hi @fobrs, could you share the NPU driver, DirectML, and ONNX Runtime versions with us? I want the program to find the NPU and show non-zero NPU usage in Windows Task Manager.

Qualcomm(R) Hexagon(TM) NPU Version 30.0.31.8

These modules are loaded into the process:
qcnspdx12arm64xum.dll C:\Windows\System32\DriverStore\FileRepository\qcnspmcdm8380.inf_arm64_a9e8953bd7d0d89b\qcnspdx12arm64xum.dll N/A N/A Symbol loading disabled by Include/Exclude setting. 28 30.0.0031.0008 5/17/2024 1:20 00007FFB05FC0000-00007FFB064CF000 [17232] DirectMLNpuInference.exe
QnnHtp.dll C:\Windows\System32\DriverStore\FileRepository\qcnspmcdm8380.inf_arm64_a9e8953bd7d0d89b\QnnHtp.dll N/A N/A Symbol loading disabled by Include/Exclude setting. 35 4/1/2024 8:59 00007FFB05CD0000-00007FFB05FC0000 [17232] DirectMLNpuInference.exe
QnnHtpPrepareDrv.dll C:\Windows\System32\DriverStore\FileRepository\qcnspmcdm8380.inf_arm64_a9e8953bd7d0d89b\HTP\QnnHtpPrepareDrv.dll N/A N/A Symbol loading disabled by Include/Exclude setting. 59 4/1/2024 9:04 00007FFAFD220000-00007FFB01C6B000 [17232] DirectMLNpuInference.exe
QnnHtpV73StubDrv.dll C:\Windows\System32\DriverStore\FileRepository\qcnspmcdm8380.inf_arm64_a9e8953bd7d0d89b\HTP\QnnHtpV73StubDrv.dll N/A N/A Symbol loading disabled by Include/Exclude setting. 49 4/1/2024 8:58 00007FFB114F0000-00007FFB11529000 [17232] DirectMLNpuInference.exe
QnnSystem.dll C:\Windows\System32\DriverStore\FileRepository\qcnspmcdm8380.inf_arm64_a9e8953bd7d0d89b\QnnSystem.dll N/A N/A Symbol loading disabled by Include/Exclude setting. 36 4/1/2024 7:54 00007FFB2FBF0000-00007FFB2FC20000 [17232] DirectMLNpuInference.exe

Microsoft.AI.DirectML 1.15.4
Microsoft.AI.MachineLearning 1.19.2

Image
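
For anyone who wants to double-check the same thing without attaching a debugger, a rough sketch along these lines (not part of the sample; the "qnn"/"qcnsp" filter strings are only illustrative) could print the Qualcomm NPU modules loaded into the process:

// Sketch only: print loaded modules whose names contain "qnn" or "qcnsp",
// to confirm the Qualcomm NPU driver stack was pulled into the process.
#include <windows.h>
#include <psapi.h>
#include <cwctype>
#include <cstdio>
#include <string>
#pragma comment(lib, "psapi.lib")

void ListQnnModules()
{
    HANDLE process = GetCurrentProcess();
    HMODULE modules[1024] = {};
    DWORD bytesNeeded = 0;
    if (!EnumProcessModules(process, modules, sizeof(modules), &bytesNeeded)) return;

    size_t count = bytesNeeded / sizeof(HMODULE);
    if (count > ARRAYSIZE(modules)) count = ARRAYSIZE(modules);

    for (size_t i = 0; i < count; ++i)
    {
        wchar_t path[MAX_PATH] = {};
        if (GetModuleFileNameExW(process, modules[i], path, MAX_PATH) == 0) continue;

        // Case-insensitive substring match on the full module path.
        std::wstring lower(path);
        for (auto& ch : lower) ch = static_cast<wchar_t>(towlower(ch));
        if (lower.find(L"qnn") != std::wstring::npos || lower.find(L"qcnsp") != std::wstring::npos)
        {
            wprintf(L"%ls\n", path);
        }
    }
}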
