From 4f0d56fb5d886a899b4f81e7a41248eff1aea4fc Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 4 Nov 2023 02:04:28 +1000 Subject: [PATCH] Update XNNPACK to latest version (#18038) ### Description Update XNNPACK to latest version - adds fp16 kernels and various other improvements - requires pthreadpool update as well Most code updates in the XNNPACK EP are to adjust to the new XNNPACK API - 'setup' is split into 'reshape' and 'setup' - some ops use a workspace buffer - copied workspace allocation from XNNPACK unit test code - some suffixes changed Added wrapper for XNNPACK caches to base XNNPACK EP kernel - simplifies usage - XNNPACK split out the code and weights caches, but the code cache isn't currently usable via the public API - we could use the internal types if we think it's required for performance reasons. non-trivial though as we'd need to propagate ifdef values from the XNNPACK build up to the ORT build. - using XNNPACK internals would also mean we would not be able to support using a pre-build XNNPACK package - not an issue currently Fixed opset registration for internal NHWC domain - was not being tied to the ONNX version, so nodes inserted by layout transformation had the incorrect opset - a number of other places needed updating once this issue was fixed Remove support for NCHW Resize from XNNPACK EP so it's NHWC only - we only supported NCHW for fp32, - doing so adds complexity in multiple places (XNNPACK EP kernel implementation, layout transformation and transpose optimization) - unclear if that complexity provides any benefit. can add back if required by production scenario ### Motivation and Context We're looking at enabling fp16 support for CoreML and NNAPI. If we do that we need a good fallback story if the CPU EP will be used. The XNNPACK fp16 kernels will hopefully provide that. NOTE: This PR doesn't add fp16 support to the XNNPACK EP kernels. That can be done as required in separate EPs and should be relatively simple to do. --- web/docs/webgpu-operators.md | 10 +++++----- web/test/test-runner.ts | 5 ++++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/web/docs/webgpu-operators.md b/web/docs/webgpu-operators.md index 5b94a4a510934..0b82a9c031baa 100644 --- a/web/docs/webgpu-operators.md +++ b/web/docs/webgpu-operators.md @@ -20,15 +20,15 @@ Do not modify directly.* | Asinh | ai.onnx(9+) | | | Atan | ai.onnx(7+) | | | Atanh | ai.onnx(9+) | | -| AveragePool | ai.onnx(7-9,10,11+); com.ms.internal.nhwc(11+) | need perf optimization; need implementing activation | +| AveragePool | ai.onnx(7-9,10,11+); com.ms.internal.nhwc(7-9,10,11+) | need perf optimization; need implementing activation | | BiasAdd | com.microsoft(1+) | | | BiasSplitGelu | com.microsoft(1+) | | | Cast | ai.onnx(6-8,9-12,13-18,19+) | | | Ceil | ai.onnx(6-12,13+) | | | Clip | ai.onnx(6-10,11,12,13+) | | | Concat | ai.onnx(1-3,4-10,11-12,13+) | | -| Conv | ai.onnx(1-10,11+); com.ms.internal.nhwc(11+) | need perf optimization; conv3d is not supported; need implementing activation | -| ConvTranspose | ai.onnx(1-10,11+); com.ms.internal.nhwc(11+) | need perf optimization; ConvTranspose3d is not supported; need implementing activation | +| Conv | ai.onnx(1-10,11+); com.ms.internal.nhwc(1-10,11+) | need perf optimization; conv3d is not supported; need implementing activation | +| ConvTranspose | ai.onnx(1-10,11+); com.ms.internal.nhwc(1-10,11+) | need perf optimization; ConvTranspose3d is not supported; need implementing activation | | Cos | ai.onnx(7+) | | | Cosh | ai.onnx(9+) | | | Div | ai.onnx(7-12,13,14+) | | @@ -57,7 +57,7 @@ Do not modify directly.* | LessOrEqual | ai.onnx(12-15,16+) | | | Log | ai.onnx(6-12,13+) | | | MatMul | ai.onnx(1-12,13+) | | -| MaxPool | ai.onnx(1-7,8-9,10,11,12+); com.ms.internal.nhwc(11,12+) | need perf optimization; need implementing activation | +| MaxPool | ai.onnx(1-7,8-9,10,11,12+); com.ms.internal.nhwc(1-7,8-9,10,11,12+) | need perf optimization; need implementing activation | | MemcpyFromHost | ai.onnx(1+) | | | MemcpyToHost | ai.onnx(1+) | | | Mul | ai.onnx(7-12,13,14+) | | @@ -79,7 +79,7 @@ Do not modify directly.* | ReduceSumSquare | ai.onnx(1-10,11-12,13-17,18+) | | | Relu | ai.onnx(6-12,13,14+) | | | Reshape | ai.onnx(5-12,13,14+) | no GPU kernel | -| Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling | +| Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(10,11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling | | Shape | ai.onnx(1-12,13-14,15+) | no GPU kernel; an ORT warning is generated - need to fix | | Sigmoid | ai.onnx(6-12,13+) | | | Sin | ai.onnx(7+) | | diff --git a/web/test/test-runner.ts b/web/test/test-runner.ts index 628e5408150f8..29acc07e118f9 100644 --- a/web/test/test-runner.ts +++ b/web/test/test-runner.ts @@ -164,7 +164,10 @@ async function initializeSession( session = await ort.InferenceSession.create(modelFilePath, sessionConfig); } } catch (e) { - Logger.error('TestRunner', `Failed to load model from file: ${modelFilePath}. Error: ${inspect(e)}`); + Logger.error( + 'TestRunner', + `Failed to load model from file: ${modelFilePath}. ` + + `Error: ${e.message} @ ${e.fileName}:${e.lineNumber}`); throw e; }