From 5de58db9892c6d5c87c02cb9fbdcf2d8ec398ac6 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet
Date: Wed, 25 Sep 2024 00:47:29 -0700
Subject: [PATCH] Parameterized API for unary operators

PiperOrigin-RevId: 678574731
---
 BUILD.bazel | 6 +-
 CMakeLists.txt | 103 +-
 bench/BUILD.bazel | 24 -
 bench/abs.cc | 8 +-
 bench/bankers-rounding.cc | 6 +-
 bench/ceiling.cc | 8 +-
 bench/convert.cc | 48 +-
 bench/elu.cc | 30 +-
 bench/f16-f32-vcvt.cc | 38 -
 bench/f16-qs8-vcvt.cc | 46 -
 bench/f32-f16-vcvt.cc | 38 -
 bench/f32-qs8-vcvt.cc | 40 -
 bench/f32-qu8-vcvt.cc | 41 -
 bench/floor.cc | 8 +-
 bench/hardswish.cc | 8 +-
 bench/leaky-relu.cc | 42 +-
 bench/models/fp32-attention.cc | 4 +-
 bench/models/fp32-mobilenet-v3-large.cc | 84 +-
 bench/models/fp32-mobilenet-v3-small.cc | 76 +-
 bench/models/qd8-attention.cc | 4 +-
 bench/models/qs8-mobilenet-v2.cc | 8 +-
 bench/negate.cc | 8 +-
 bench/qs16-qs8-vcvt.cc | 39 -
 bench/qs8-f16-vcvt.cc | 43 -
 bench/qs8-f32-vcvt.cc | 41 -
 bench/qs8-vcvt.cc | 40 -
 bench/qu8-f32-vcvt.cc | 41 -
 bench/qu8-vcvt.cc | 39 -
 bench/reciprocal-square-root.cc | 4 +-
 bench/s32-f32-vcvt.cc | 39 -
 bench/sigmoid.cc | 28 +-
 bench/square-root.cc | 7 +-
 bench/square.cc | 8 +-
 bench/tanh.cc | 26 +-
 bench/truncation.cc | 27 -
 bench/u32-f32-vcvt.cc | 44 -
 bench/unary_operator.h | 23 +-
 bench/vcvt-benchmark.h | 67 -
 bench/vunary.cc | 219 +-
 build_srcs.bzl | 20 +-
 include/xnnpack.h | 1404 +---
 scripts/generate-tests.sh | 88 +-
 src/bf16-vabs/bf16-vabs.h | 6 +-
 src/configs/unary-elementwise-config.c | 419 +-
 src/f16-vabs/f16-vabs.h | 8 +-
 src/f16-vclamp/f16-vclamp.h | 16 +-
 src/f16-vhswish/f16-vhswish.h | 8 +-
 src/f16-vneg/f16-vneg.h | 8 +-
 src/f16-vrnd/f16-vrndd.h | 8 +-
 src/f16-vrnd/f16-vrndne.h | 8 +-
 src/f16-vrnd/f16-vrndu.h | 8 +-
 src/f16-vrnd/f16-vrndz.h | 8 +-
 src/f16-vrsqrt/f16-vrsqrt.h | 12 +-
 src/f16-vsigmoid/f16-vsigmoid.h | 80 +-
 src/f16-vsqr/f16-vsqr.h | 8 +-
 src/f16-vsqrt/f16-vsqrt.h | 42 +-
 src/f32-vabs/f32-vabs.h | 50 +-
 src/f32-vclamp/f32-vclamp.h | 46 +-
 src/f32-vexp/f32-vexp.h | 6 +-
 src/f32-vgelu/f32-vgelu.h | 76 +-
 src/f32-vhswish/f32-vhswish.h | 48 +-
 src/f32-vlog/f32-vlog.h | 78 +-
 src/f32-vneg/f32-vneg.h | 50 +-
 src/f32-vrelu/f32-vrelu.h | 52 +-
 src/f32-vrnd/f32-vrndd.h | 42 +-
 src/f32-vrnd/f32-vrndne.h | 42 +-
 src/f32-vrnd/f32-vrndu.h | 42 +-
 src/f32-vrnd/f32-vrndz.h | 42 +-
 src/f32-vrsqrt/f32-vrsqrt.h | 42 +-
 src/f32-vsigmoid/f32-vsigmoid.h | 538 +-
 src/f32-vsqr/f32-vsqr.h | 50 +-
 src/f32-vsqrt/f32-vsqrt.h | 62 +-
 src/f32-vtanh/f32-vtanh.h | 96 +-
 src/microparams-init.c | 408 +-
 src/operator-delete.c | 28 +-
 src/operator-run.c | 46 +-
 src/operator-utils.c | 75 +
 src/operators/lut-elementwise-nc.c | 559 --
 .../scaled-dot-product-attention-nhtc.c | 16 +-
 src/operators/unary-elementwise-nc.c | 4531 ++++-------------
 src/s8-vclamp/s8-vclamp.h | 22 +-
 src/subgraph.c | 101 +-
 src/subgraph/abs.c | 207 -
 src/subgraph/bankers-rounding.c | 210 -
 src/subgraph/ceiling.c | 209 -
 src/subgraph/clamp.c | 292 --
 src/subgraph/deprecated.c | 123 +
 src/subgraph/elu.c | 254 -
 src/subgraph/exp.c | 195 -
 src/subgraph/floor.c | 208 -
 src/subgraph/gelu.c | 191 -
 src/subgraph/hardswish.c | 210 -
 src/subgraph/leaky-relu.c | 305 --
 src/subgraph/log.c | 195 -
 src/subgraph/negate.c | 209 -
 src/subgraph/reciprocal-square-root.c | 203 -
 src/subgraph/sigmoid.c | 277 -
 src/subgraph/square-root.c | 212 -
 src/subgraph/square.c | 206 -
 src/subgraph/tanh.c | 276 -
 src/subgraph/{convert.c => unary.c} | 505 +-
 src/u8-vclamp/u8-vclamp.h | 20 +-
 src/xnnpack/buffer.h | 57 +-
 src/xnnpack/compute.h | 70 +-
 src/xnnpack/config-types.h | 42 +-
 src/xnnpack/microfnptr.h | 154 +-
 src/xnnpack/microparams-init.h | 163 +-
 src/xnnpack/microparams.h | 28 +
 src/xnnpack/operator-type-defs.h | 71 +-
 src/xnnpack/operator-utils.h | 2 +
 src/xnnpack/operator.h | 46 +-
 src/xnnpack/subgraph.h | 10 +-
 test/BUILD.bazel | 195 +-
 test/abs-nc.cc | 35 -
 test/abs-reshape.cc | 68 -
 test/bankers-rounding-nc.cc | 35 -
 test/bankers-rounding.cc | 200 -
 test/batch-matrix-multiply.cc | 2 +-
 test/bf16-vabs.cc | 16 +-
 test/ceiling-nc.cc | 35 -
 test/ceiling.cc | 198 -
 test/clamp-nc.cc | 296 --
 test/clamp.cc | 424 --
 test/convert-nc-eager.cc | 535 --
 test/convert-nc.cc | 877 ++--
 test/convert-operator-tester.h | 991 ----
 test/convert.cc | 945 ----
 test/convolution-2d.cc | 4 +-
 test/deconvolution-2d.cc | 2 +-
 test/elu-nc.cc | 156 -
 test/elu.cc | 309 --
 test/exp-nc.cc | 30 -
 test/exp.cc | 109 -
 test/f16-f32-vcvt.cc | 18 +-
 test/f16-qs8-vcvt.cc | 27 +-
 test/f16-vabs.cc | 16 +-
 test/f16-vclamp.cc | 55 +-
 test/f16-velu.cc | 73 +-
 test/f16-vhswish.cc | 16 +-
 test/f16-vlrelu.cc | 43 +-
 test/f16-vneg.cc | 16 +-
 test/f16-vrndd.cc | 16 +-
 test/f16-vrndne.cc | 16 +-
 test/f16-vrndu.cc | 16 +-
 test/f16-vrndz.cc | 16 +-
 test/f16-vrsqrt.cc | 16 +-
 test/f16-vsigmoid.cc | 16 +-
 test/f16-vsqr.cc | 16 +-
 test/f16-vsqrt.cc | 16 +-
 test/f16-vtanh.cc | 16 +-
 test/f32-f16-vcvt.cc | 18 +-
 test/f32-qs8-vcvt.cc | 26 +-
 test/f32-qu8-vcvt.cc | 26 +-
 test/f32-vabs.cc | 16 +-
 test/f32-vclamp.cc | 55 +-
 test/f32-velu.cc | 73 +-
 test/f32-vexp.cc | 52 +-
 test/f32-vgelu.cc | 52 +-
 test/f32-vhswish.cc | 16 +-
 test/f32-vlog.cc | 52 +-
 test/f32-vlrelu.cc | 43 +-
 test/f32-vneg.cc | 16 +-
 test/f32-vrelu.cc | 16 +-
 test/f32-vrndd.cc | 16 +-
 test/f32-vrndne.cc | 16 +-
 test/f32-vrndu.cc | 16 +-
 test/f32-vrndz.cc | 16 +-
 test/f32-vrsqrt.cc | 16 +-
 test/f32-vsigmoid.cc | 16 +-
 test/f32-vsqr.cc | 16 +-
 test/f32-vsqrt.cc | 52 +-
 test/f32-vtanh.cc | 52 +-
 test/floor-nc.cc | 35 -
 test/floor.cc | 198 -
 test/fully-connected.cc | 24 +-
 test/gelu-nc.cc | 35 -
 test/gelu.cc | 125 -
 test/hardswish-nc.cc | 50 -
 test/hardswish.cc | 211 -
 test/leaky-relu-nc.cc | 231 -
 test/leaky-relu.cc | 417 --
 test/log-nc.cc | 38 -
 test/log.cc | 109 -
 test/negate-nc.cc | 33 -
 test/negate.cc | 198 -
 test/qs16-qs8-vcvt.cc | 26 +-
 test/qs8-f16-vcvt.cc | 24 +-
 test/qs8-f32-vcvt.cc | 24 +-
 test/qs8-vcvt.cc | 26 +-
 test/qs8-vhswish.cc | 69 +-
 test/qs8-vlrelu.cc | 60 +-
 test/qu8-f32-vcvt.cc | 24 +-
 test/qu8-vcvt.cc | 26 +-
 test/qu8-vhswish.cc | 75 +-
 test/qu8-vlrelu.cc | 60 +-
 test/reciprocal-square-root-nc.cc | 44 -
 test/reciprocal-square-root.cc | 239 -
 test/s32-f32-vcvt.cc | 20 +-
 test/s8-vclamp.cc | 55 +-
 test/sigmoid-nc.cc | 60 -
 test/sigmoid.cc | 404 --
 test/square-nc.cc | 46 -
 test/square-root-nc.cc | 48 -
 test/square-root.cc | 200 -
 test/square.cc | 200 -
 test/subgraph-tester.h | 17 +-
 test/tanh-nc.cc | 53 -
 test/tanh-operator-tester.h | 381 --
 test/tanh.cc | 404 --
 test/truncation-nc.cc | 35 -
 test/u32-f32-vcvt.cc | 20 +-
 test/u8-vclamp.cc | 55 +-
 test/unary-elementwise-nc.cc | 440 ++
 test/unary-operator-tester.cc | 280 -
 test/unary-operator-tester.h | 719 ---
 test/unary-ops.cc | 70 +
 test/unary-ops.h | 472 ++
 test/unary.cc | 251 +
 test/vcvt-microkernel-tester.cc | 550 --
 test/vcvt-microkernel-tester.h | 256 -
 test/vhswish-microkernel-tester.h | 266 -
 test/vlrelu-microkernel-tester.h | 267 -
 test/vunary-microkernel-tester.cc | 430 --
 test/vunary-microkernel-tester.h | 640 ++-
 test/workspace.cc | 2 +-
 tools/generate-vunary-test.py | 253 +-
 226 files changed, 6204 insertions(+), 25500 deletions(-)
 delete mode 100644 bench/f16-f32-vcvt.cc
 delete mode 100644 bench/f16-qs8-vcvt.cc
 delete mode 100644 bench/f32-f16-vcvt.cc
 delete mode 100644 bench/f32-qs8-vcvt.cc
 delete mode 100644 bench/f32-qu8-vcvt.cc
 delete mode 100644 bench/qs16-qs8-vcvt.cc
 delete mode 100644 bench/qs8-f16-vcvt.cc
 delete mode 100644 bench/qs8-f32-vcvt.cc
 delete mode 100644 bench/qs8-vcvt.cc
 delete mode 100644 bench/qu8-f32-vcvt.cc
 delete mode 100644 bench/qu8-vcvt.cc
 delete mode 100644 bench/s32-f32-vcvt.cc
 delete mode 100644 bench/truncation.cc
 delete mode 100644 bench/u32-f32-vcvt.cc
 delete mode 100644 bench/vcvt-benchmark.h
 delete mode 100644 src/operators/lut-elementwise-nc.c
 delete mode 100644 src/subgraph/abs.c
 delete mode 100644 src/subgraph/bankers-rounding.c
 delete mode 100644 src/subgraph/ceiling.c
 delete mode 100644 src/subgraph/clamp.c
 delete mode 100644 src/subgraph/elu.c
 delete mode 100644 src/subgraph/exp.c
 delete mode 100644 src/subgraph/floor.c
 delete mode 100644 src/subgraph/gelu.c
 delete mode 100644 src/subgraph/hardswish.c
 delete mode 100644 src/subgraph/leaky-relu.c
 delete mode 100644 src/subgraph/log.c
 delete mode 100644 src/subgraph/negate.c
 delete mode 100644 src/subgraph/reciprocal-square-root.c
 delete mode 100644 src/subgraph/sigmoid.c
 delete mode 100644 src/subgraph/square-root.c
 delete mode 100644 src/subgraph/square.c
 delete mode 100644 src/subgraph/tanh.c
 rename src/subgraph/{convert.c => unary.c} (50%)
 delete mode 100644 test/abs-nc.cc
 delete mode 100644 test/abs-reshape.cc
 delete mode 100644 test/bankers-rounding-nc.cc
 delete mode 100644 test/bankers-rounding.cc
 delete mode 100644 test/ceiling-nc.cc
 delete mode 100644 test/ceiling.cc
 delete mode 100644 test/clamp-nc.cc
 delete mode 100644 test/clamp.cc
 delete mode 100644 test/convert-nc-eager.cc
 delete mode 100644 test/convert.cc
 delete mode 100644 test/elu-nc.cc
 delete mode 100644 test/elu.cc
 delete mode 100644 test/exp-nc.cc
 delete mode 100644 test/exp.cc
 delete mode 100644 test/floor-nc.cc
 delete mode 100644 test/floor.cc
 delete mode 100644 test/gelu-nc.cc
 delete mode 100644 test/gelu.cc
 delete mode 100644 test/hardswish-nc.cc
 delete mode 100644 test/hardswish.cc
 delete mode 100644 test/leaky-relu-nc.cc
 delete mode 100644 test/leaky-relu.cc
 delete mode 100644 test/log-nc.cc
 delete mode 100644 test/log.cc
 delete mode 100644 test/negate-nc.cc
 delete mode 100644 test/negate.cc
 delete mode 100644 test/reciprocal-square-root-nc.cc
 delete mode 100644 test/reciprocal-square-root.cc
 delete mode 100644 test/sigmoid-nc.cc
 delete mode 100644 test/sigmoid.cc
 delete mode 100644 test/square-nc.cc
 delete mode 100644 test/square-root-nc.cc
 delete mode 100644 test/square-root.cc
 delete mode 100644 test/square.cc
 delete mode 100644 test/tanh-nc.cc
 delete mode 100644 test/tanh-operator-tester.h
 delete mode 100644 test/tanh.cc
 delete mode 100644 test/truncation-nc.cc
 create mode 100644 test/unary-elementwise-nc.cc
 delete mode 100644 test/unary-operator-tester.cc
 delete mode 100644 test/unary-operator-tester.h
 create mode 100644 test/unary-ops.cc
 create mode 100644 test/unary-ops.h
 create mode 100644 test/unary.cc
 delete mode 100644 test/vcvt-microkernel-tester.cc
 delete mode 100644 test/vcvt-microkernel-tester.h
 delete mode 100644 test/vhswish-microkernel-tester.h
 delete mode 100644 test/vlrelu-microkernel-tester.h
 delete mode 100644 test/vunary-microkernel-tester.cc

diff --git a/BUILD.bazel b/BUILD.bazel
index 12ed5539f196..77c411d928ed 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -487,6 +487,7 @@ xnnpack_cc_library(
         ":logging",
         ":math",
         ":microparams",
+
":requantization", ":unaligned", ":xnnpack_h", ], @@ -973,6 +974,7 @@ xnnpack_cc_library( ":microkernel_utils", ":microkernels_h", ":microparams_init", + ":node_type", ":normalization", ":operator_type", ":operator_utils", @@ -981,9 +983,7 @@ xnnpack_cc_library( ":quantization", ":xnnpack_h", "@pthreadpool", - ] + select({ - "//conditions:default": [], - }), + ], ) xnnpack_cc_library( diff --git a/CMakeLists.txt b/CMakeLists.txt index 5694c7ac6ca9..a0bcd0f92951 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -425,7 +425,6 @@ SET(OPERATOR_SRCS src/operators/deconvolution-nhwc.c src/operators/dynamic-fully-connected-nc.c src/operators/fully-connected-nc.c - src/operators/lut-elementwise-nc.c src/operators/max-pooling-nhwc.c src/operators/pack-lh.c src/operators/reduce-nd.c @@ -443,50 +442,32 @@ SET(SUBGRAPH_SRCS src/memory-planner.c src/runtime.c src/subgraph.c - src/subgraph/abs.c src/subgraph/argmax-pooling-2d.c src/subgraph/average-pooling-2d.c - src/subgraph/bankers-rounding.c src/subgraph/batch-matrix-multiply.c src/subgraph/binary.c - src/subgraph/ceiling.c - src/subgraph/clamp.c src/subgraph/concatenate.c - src/subgraph/convert.c src/subgraph/convolution-2d.c src/subgraph/copy.c src/subgraph/deconvolution-2d.c src/subgraph/deprecated.c src/subgraph/depth-to-space-2d.c src/subgraph/depthwise-convolution-2d.c - src/subgraph/elu.c src/subgraph/even-split.c - src/subgraph/exp.c - src/subgraph/floor.c src/subgraph/fully-connected-sparse.c src/subgraph/fully-connected.c - src/subgraph/gelu.c - src/subgraph/hardswish.c - src/subgraph/leaky-relu.c - src/subgraph/log.c src/subgraph/max-pooling-2d.c - src/subgraph/negate.c src/subgraph/pack-lh.c - src/subgraph/reciprocal-square-root.c src/subgraph/reshape-helpers.c src/subgraph/scaled-dot-product-attention.c - src/subgraph/sigmoid.c - src/subgraph/softmax.c src/subgraph/space-to-depth-2d.c - src/subgraph/square-root.c - src/subgraph/square.c src/subgraph/static-constant-pad.c src/subgraph/static-reduce.c src/subgraph/static-resize-bilinear-2d.c src/subgraph/static-slice.c src/subgraph/static-transpose.c - src/subgraph/tanh.c src/subgraph/unpooling-2d.c + src/subgraph/unary.c src/subgraph/validation.c src/tensor.c) @@ -1231,10 +1212,6 @@ IF(XNNPACK_BUILD_TESTS) ENDIF() TARGET_LINK_LIBRARIES(gemm-microkernel-tester PUBLIC next-prime) - ADD_LIBRARY(unary-operator-tester STATIC test/unary-operator-tester.cc) - TARGET_INCLUDE_DIRECTORIES(unary-operator-tester PRIVATE include src test) - TARGET_LINK_LIBRARIES(unary-operator-tester PRIVATE XNNPACK pthreadpool GTest::gtest) - ADD_LIBRARY(dwconv-microkernel-tester STATIC test/dwconv-microkernel-tester.cc) TARGET_INCLUDE_DIRECTORIES(dwconv-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(dwconv-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) @@ -1245,15 +1222,6 @@ IF(XNNPACK_BUILD_TESTS) TARGET_INCLUDE_DIRECTORIES(vbinary-microkernel-tester PRIVATE include src test) TARGET_LINK_LIBRARIES(vbinary-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) - ADD_LIBRARY(vcvt-microkernel-tester STATIC test/vcvt-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(vcvt-microkernel-tester PRIVATE include src test) - TARGET_LINK_LIBRARIES(vcvt-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) - - ADD_LIBRARY(vunary-microkernel-tester STATIC test/vunary-microkernel-tester.cc) - TARGET_INCLUDE_DIRECTORIES(vunary-microkernel-tester PRIVATE include src test) - TARGET_LINK_LIBRARIES(vunary-microkernel-tester PRIVATE XNNPACK pthreadpool GTest::gtest) - 
TARGET_LINK_LIBRARIES(vunary-microkernel-tester PUBLIC next-prime) - ADD_LIBRARY(convolution-test-helpers OBJECT test/convolution-test-helpers.cc) TARGET_INCLUDE_DIRECTORIES(convolution-test-helpers PRIVATE include src) TARGET_LINK_LIBRARIES(convolution-test-helpers PRIVATE xnnpack-base) @@ -1288,7 +1256,6 @@ IF(XNNPACK_BUILD_TESTS) microparams-init next-prime pthreadpool - vunary-microkernel-tester XNNPACK) ADD_SHARDED_TEST(${TEST}-test 10) ENDFOREACH() @@ -1318,34 +1285,13 @@ IF(XNNPACK_BUILD_TESTS) TARGET_LINK_LIBRARIES(subgraph-size-test PRIVATE XNNPACK) # ---[ Build operator-level unit tests - SET(LIBRARY_UNARY_UNIT_TESTS - abs-nc - bankers-rounding-nc - ceiling-nc - clamp-nc - elu-nc - exp-nc - floor-nc - hardswish-nc - leaky-relu-nc - log-nc - negate-nc - reciprocal-square-root-nc - sigmoid-nc - square-nc - square-root-nc - tanh-nc - truncation-nc) - FOREACH(TEST ${LIBRARY_UNARY_UNIT_TESTS}) - ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) - TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE src test) - TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE - GTest::gtest - GTest::gtest_main - unary-operator-tester - XNNPACK) - ADD_TEST(NAME ${TEST}-test COMMAND ${TEST}-test) - ENDFOREACH() + ADD_EXECUTABLE(unary-elementwise-nc-test test/unary-elementwise-nc.cc) + TARGET_INCLUDE_DIRECTORIES(unary-elementwise-nc-test PRIVATE src test) + TARGET_LINK_LIBRARIES(unary-elementwise-nc-test PRIVATE + GTest::gtest + GTest::gtest_main + XNNPACK) + ADD_TEST(NAME unary-elementwise-nc-test COMMAND unary-elementwise-nc-test) ADD_EXECUTABLE(binary-elementwise-nd-test test/binary-elementwise-nd.cc) TARGET_INCLUDE_DIRECTORIES(binary-elementwise-nd-test PRIVATE src test) @@ -1376,53 +1322,35 @@ IF(XNNPACK_BUILD_TESTS) # ---[ Build subgraph-level unit tests SET(LIBRARY_SUBGRAPH_UNIT_TESTS - abs - abs-reshape argmax-pooling-2d average-pooling-2d average-pooling-2d-reshape - bankers-rounding binary - ceiling - clamp concatenate2 concatenate3 concatenate4 concatenate5 - convert copy depth-to-space-2d - elu - gelu - exp even-split2 even-split3 even-split4 - floor global-average-pooling-1d global-average-pooling-2d global-sum-pooling-1d global-sum-pooling-2d - hardswish - leaky-relu - log max-pooling-2d - negate - reciprocal-square-root reshape-helpers - sigmoid static-slice softmax space-to-depth-2d - square - square-root static-constant-pad static-reduce static-reshape static-resize-bilinear-2d static-transpose - tanh transpose-reshape + unary unpooling-2d) FOREACH(TEST ${LIBRARY_SUBGRAPH_UNIT_TESTS}) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) @@ -1747,7 +1675,6 @@ IF(XNNPACK_BUILD_TESTS) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE - vcvt-microkernel-tester GTest::gmock GTest::gtest GTest::gtest_main @@ -1800,7 +1727,6 @@ IF(XNNPACK_BUILD_TESTS) ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc) TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test) TARGET_LINK_LIBRARIES(${TEST}-test PRIVATE - vunary-microkernel-tester GTest::gmock GTest::gtest GTest::gtest_main @@ -1945,7 +1871,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) softmax square square-root - truncation tanh) FOREACH(BENCH ${LIBRARY_OPERATOR_BENCHMARKS}) ADD_EXECUTABLE(${BENCH}-bench bench/${BENCH}.cc) @@ -1963,7 +1888,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) f16-conv-hwc2chw f16-dwconv f16-dwconv2d-chw - f16-f32-vcvt f16-f32acc-gemm f16-f32acc-igemm f16-f32acc-rdsum @@ -1982,7 +1906,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) f32-conv-hwc2chw f32-dwconv f32-dwconv2d-chw - 
f32-f16-vcvt f32-gemm f32-gemm-goi-minmax f32-gemm-minmax @@ -1990,8 +1913,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) f32-im2col-gemm f32-qc4w-gemm f32-qc8w-gemm - f32-qs8-vcvt - f32-qu8-vcvt f32-raddexpminusmax f32-raddextexp f32-raddstoreexpminusmax @@ -2014,21 +1935,15 @@ IF(XNNPACK_BUILD_BENCHMARKS) qd8-f32-qc8w-gemm qp8-f32-qc4w-gemm qp8-f32-qb4w-gemm - qs16-qs8-vcvt qs8-dwconv - qs8-f16-vcvt - qs8-f32-vcvt qs8-gemm qs8-qc8w-gemm-fp32 qu8-rdsum qs8-rsum qu8-rsum - qs8-vcvt - qu8-f32-vcvt qu8-gemm qu8-gemm-fp32 qu8-gemm-rndnu - qu8-vcvt x16-packw x32-packw x8-lut diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel index 8efbd1f459c9..921b26dbdf8b 100644 --- a/bench/BUILD.bazel +++ b/bench/BUILD.bazel @@ -228,29 +228,6 @@ xnnpack_benchmark( ]), ) -[xnnpack_benchmark( - name = "%s_bench" % kernel, - srcs = [ - "%s.cc" % kernel.replace("_", "-"), - "vcvt-benchmark.h", - ], - deps = MICROKERNEL_BENCHMARK_DEPS, -) for kernel in [ - "qs8_f16_vcvt", - "qs8_f32_vcvt", - "qs8_vcvt", - "qs16_qs8_vcvt", - "qu8_f32_vcvt", - "qu8_vcvt", - "f16_f32_vcvt", - "f16_qs8_vcvt", - "f32_f16_vcvt", - "f32_qs8_vcvt", - "f32_qu8_vcvt", - "s32_f32_vcvt", - "u32_f32_vcvt", -]] - [xnnpack_benchmark( name = "%s_bench" % kernel, srcs = [ @@ -590,7 +567,6 @@ xnnpack_benchmark( "square", "square_root", "tanh", - "truncation", ]] xnnpack_benchmark( diff --git a/bench/abs.cc b/bench/abs.cc index 03864613091d..f4635c6e2b7e 100644 --- a/bench/abs.cc +++ b/bench/abs.cc @@ -14,15 +14,11 @@ static void xnnpack_abs_f16(benchmark::State& state) { - benchmark_unary_operator( - xnn_create_abs_nc_f16, xnn_reshape_abs_nc_f16, xnn_setup_abs_nc_f16, - state); + benchmark_unary_operator(state, xnn_unary_abs); } static void xnnpack_abs_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_abs_nc_f32, - xnn_reshape_abs_nc_f32, - xnn_setup_abs_nc_f32, state); + benchmark_unary_operator(state, xnn_unary_abs); } BENCHMARK(xnnpack_abs_f16) diff --git a/bench/bankers-rounding.cc b/bench/bankers-rounding.cc index 870de8370617..4eb7926d98f0 100644 --- a/bench/bankers-rounding.cc +++ b/bench/bankers-rounding.cc @@ -14,14 +14,12 @@ static void xnnpack_bankers_rounding_f16(benchmark::State& state) { benchmark_unary_operator( - xnn_create_bankers_rounding_nc_f16, xnn_reshape_bankers_rounding_nc_f16, - xnn_setup_bankers_rounding_nc_f16, state); + state, xnn_unary_bankers_rounding); } static void xnnpack_bankers_rounding_f32(benchmark::State& state) { benchmark_unary_operator( - xnn_create_bankers_rounding_nc_f32, xnn_reshape_bankers_rounding_nc_f32, - xnn_setup_bankers_rounding_nc_f32, state); + state, xnn_unary_bankers_rounding); } BENCHMARK(xnnpack_bankers_rounding_f16) diff --git a/bench/ceiling.cc b/bench/ceiling.cc index da08f7bb6d79..3f20bb99b5a0 100644 --- a/bench/ceiling.cc +++ b/bench/ceiling.cc @@ -14,15 +14,11 @@ static void xnnpack_ceiling_f16(benchmark::State& state) { - benchmark_unary_operator(xnn_create_ceiling_nc_f16, - xnn_reshape_ceiling_nc_f16, - xnn_setup_ceiling_nc_f16, state); + benchmark_unary_operator(state, xnn_unary_ceiling); } static void xnnpack_ceiling_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_ceiling_nc_f32, - xnn_reshape_ceiling_nc_f32, - xnn_setup_ceiling_nc_f32, state); + benchmark_unary_operator(state, xnn_unary_ceiling); } BENCHMARK(xnnpack_ceiling_f16) diff --git a/bench/convert.cc b/bench/convert.cc index 7c1725dc875a..4e49f2ad957f 100644 --- a/bench/convert.cc +++ b/bench/convert.cc @@ -3,7 +3,7 @@ // This source code is licensed under the BSD-style license found in the // 
LICENSE file in the root directory of this source tree. -#include +#include #include "unary_operator.h" #include "utils.h" @@ -18,71 +18,41 @@ void xnnpack_convert_f16_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_convert_nc_f16_f32, - xnn_reshape_convert_nc_f16_f32, - xnn_setup_convert_nc_f16_f32, state); + benchmark_unary_operator(state, xnn_unary_convert); } void xnnpack_convert_f32_f16(benchmark::State& state) { - benchmark_unary_operator(xnn_create_convert_nc_f32_f16, - xnn_reshape_convert_nc_f32_f16, - xnn_setup_convert_nc_f32_f16, state); + benchmark_unary_operator(state, xnn_unary_convert); } void xnnpack_convert_f32_qs8(benchmark::State& state) { benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_convert_nc_f32_qs8( - 1.0f / 128.0f /* scale */, 1 /* zero point */, flags, op); - }, - xnn_reshape_convert_nc_f32_qs8, xnn_setup_convert_nc_f32_qs8, state); + state, xnn_unary_convert, nullptr, {0, 1.0f}, {1, 1.0f / 128.0f}); } void xnnpack_convert_f32_qu8(benchmark::State& state) { benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_convert_nc_f32_qu8( - 1.0f / 128.0f /* scale */, 127 /* zero point */, flags, op); - }, - xnn_reshape_convert_nc_f32_qu8, xnn_setup_convert_nc_f32_qu8, state); + state, xnn_unary_convert, nullptr, {0, 1.0f}, {127, 1.0f / 128.0f}); } void xnnpack_convert_qs8(benchmark::State& state) { benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_convert_nc_qs8( - 0.75f /* input scale */, -1 /* input zero point */, - 0.5f /* output scale */, 1 /* output zero point */, flags, op); - }, - xnn_reshape_convert_nc_qs8, xnn_setup_convert_nc_qs8, state); + state, xnn_unary_convert, nullptr, {-1, 0.75f}, {1, 0.5f}); } void xnnpack_convert_qs8_f32(benchmark::State& state) { benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_convert_nc_qs8_f32(1.0f / 255.0f /* scale */, - -128 /* zero point */, flags, op); - }, - xnn_reshape_convert_nc_qs8_f32, xnn_setup_convert_nc_qs8_f32, state); + state, xnn_unary_convert, nullptr, {-128, 1.0f / 255.0f}); } void xnnpack_convert_qu8(benchmark::State& state) { benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_convert_nc_qu8(0.75f /* scale */, - 125 /* zero point */, 0.5f /* scale */, - 130 /* zero point */, flags, op); - }, - xnn_reshape_convert_nc_qu8, xnn_setup_convert_nc_qu8, state); + state, xnn_unary_convert, nullptr, {125, 0.75f}, {130, 0.5f}); } void xnnpack_convert_qu8_f32(benchmark::State& state) { benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_convert_nc_qu8_f32(1.0f / 128.0f /* scale */, - 128 /* zero point */, flags, op); - }, - xnn_reshape_convert_nc_qu8_f32, xnn_setup_convert_nc_qu8_f32, state); + state, xnn_unary_convert, nullptr, {128, 1.0f / 128.0f}); } #ifdef BENCHMARK_TENSORFLOW_LITE diff --git a/bench/elu.cc b/bench/elu.cc index 7912502bc91c..27909fe031ec 100644 --- a/bench/elu.cc +++ b/bench/elu.cc @@ -18,33 +18,21 @@ static void xnnpack_elu_f16(benchmark::State& state) { - benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_elu_nc_f16( - /*alpha=*/1.0f, flags, op); - }, - xnn_reshape_elu_nc_f16, xnn_setup_elu_nc_f16, state); + xnn_unary_params params; + params.elu.alpha = 1.0f; + benchmark_unary_operator(state, xnn_unary_elu, ¶ms); } static void xnnpack_elu_f32(benchmark::State& state) { - benchmark_unary_operator( - [](uint32_t flags, 
xnn_operator_t* op) { - return xnn_create_elu_nc_f32( - /*alpha=*/1.0f, flags, op); - }, - xnn_reshape_elu_nc_f32, xnn_setup_elu_nc_f32, state); + xnn_unary_params params; + params.elu.alpha = 1.0f; + benchmark_unary_operator(state, xnn_unary_elu, ¶ms); } static void xnnpack_elu_qs8(benchmark::State& state) { - benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_elu_nc_qs8( - 1.0f /* alpha */, 0 /* input zero point */, 1.0f /* input scale */, - 0 /* output zero point */, 1.0f /* output scale */, - std::numeric_limits::min(), - std::numeric_limits::max(), flags, op); - }, - xnn_reshape_elu_nc_qs8, xnn_setup_elu_nc_qs8, state); + xnn_unary_params params; + params.elu.alpha = 1.0f; + benchmark_unary_operator(state, xnn_unary_elu, ¶ms, {0, 1.0f}, {0, 1.0f}); } BENCHMARK(xnnpack_elu_f16) diff --git a/bench/f16-f32-vcvt.cc b/bench/f16-f32-vcvt.cc deleted file mode 100644 index 07dacd35854b..000000000000 --- a/bench/f16-f32-vcvt.cc +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/microparams.h" -#include "xnnpack/vcvt.h" -#include - -static void f16_f32_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_f16_f32_vcvt_ukernel_fn cvt, - const void* /*init_params*/ = nullptr) -{ - cvt_benchmark(state, arch_flags, cvt, nullptr); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(f16_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "f16-f32-vcvt/f16-f32-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/f16-qs8-vcvt.cc b/bench/f16-qs8-vcvt.cc deleted file mode 100644 index 71f00b7f807b..000000000000 --- a/bench/f16-qs8-vcvt.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include - -#include -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void f16_qs8_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_f16_qs8_vcvt_ukernel_fn cvt, - xnn_init_f16_qs8_cvt_params_fn init_params) -{ - xnn_f16_qs8_cvt_params params; - init_params(¶ms, - 1.0f /* scale */, - 1 /* output zero point */); - - cvt_benchmark(state, arch_flags, cvt, ¶ms); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(f16_qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "f16-qs8-vcvt/f16-qs8-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/f32-f16-vcvt.cc b/bench/f32-f16-vcvt.cc deleted file mode 100644 index b5a16001e873..000000000000 --- a/bench/f32-f16-vcvt.cc +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/microparams.h" -#include "xnnpack/vcvt.h" -#include - -static void f32_f16_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_f32_f16_vcvt_ukernel_fn cvt, - const void* /*init_params*/ = nullptr) -{ - cvt_benchmark(state, arch_flags, cvt, nullptr); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(f32_f16_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "f32-f16-vcvt/f32-f16-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/f32-qs8-vcvt.cc b/bench/f32-qs8-vcvt.cc deleted file mode 100644 index e622edb2a09a..000000000000 --- a/bench/f32-qs8-vcvt.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void f32_qs8_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_f32_qs8_vcvt_ukernel_fn cvt, - xnn_init_f32_qs8_cvt_params_fn init_params) -{ - xnn_f32_qs8_cvt_params params; - init_params(¶ms, - 25.0f /* scale */, - 1 /* output zero point */); - - cvt_benchmark(state, arch_flags, cvt, ¶ms); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(f32_qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "f32-qs8-vcvt/f32-qs8-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/f32-qu8-vcvt.cc b/bench/f32-qu8-vcvt.cc deleted file mode 100644 index 7110d9f14436..000000000000 --- a/bench/f32-qu8-vcvt.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void f32_qu8_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_f32_qu8_vcvt_ukernel_fn cvt, - xnn_init_f32_qu8_cvt_params_fn init_params) -{ - xnn_f32_qu8_cvt_params params; - init_params(¶ms, - 25.0f /* scale */, - 127 /* output zero point */); - - cvt_benchmark(state, arch_flags, cvt, ¶ms); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(f32_qu8_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "f32-qu8-vcvt/f32-qu8-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/floor.cc b/bench/floor.cc index 06752a0b85cd..f05983e11e44 100644 --- a/bench/floor.cc +++ b/bench/floor.cc @@ -14,15 +14,11 @@ static void xnnpack_floor_f16(benchmark::State& state) { - benchmark_unary_operator(xnn_create_floor_nc_f16, - xnn_reshape_floor_nc_f16, - xnn_setup_floor_nc_f16, state); + benchmark_unary_operator(state, xnn_unary_floor); } static void xnnpack_floor_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_floor_nc_f32, - xnn_reshape_floor_nc_f32, - xnn_setup_floor_nc_f32, state); + benchmark_unary_operator(state, xnn_unary_floor); } BENCHMARK(xnnpack_floor_f16) diff --git a/bench/hardswish.cc b/bench/hardswish.cc index a666346f4738..28bc92b82de6 100644 --- a/bench/hardswish.cc +++ b/bench/hardswish.cc @@ -14,15 +14,11 @@ static void xnnpack_hardswish_f16(benchmark::State& state) { - benchmark_unary_operator(xnn_create_hardswish_nc_f16, - xnn_reshape_hardswish_nc_f16, - xnn_setup_hardswish_nc_f16, state); + benchmark_unary_operator(state, xnn_unary_hardswish); } static void xnnpack_hardswish_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_hardswish_nc_f32, - xnn_reshape_hardswish_nc_f32, - 
xnn_setup_hardswish_nc_f32, state); + benchmark_unary_operator(state, xnn_unary_hardswish); } BENCHMARK(xnnpack_hardswish_f16) diff --git a/bench/leaky-relu.cc b/bench/leaky-relu.cc index 4c4192868483..403bd711c163 100644 --- a/bench/leaky-relu.cc +++ b/bench/leaky-relu.cc @@ -19,45 +19,27 @@ static void xnnpack_leaky_relu_f16(benchmark::State& state) { - benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_leaky_relu_nc_f16( - - 0.01f /* negative slope */, flags, op); - }, - xnn_reshape_leaky_relu_nc_f16, xnn_setup_leaky_relu_nc_f16, state); + xnn_unary_params params; + params.leaky_relu.negative_slope = 0.1f; + benchmark_unary_operator(state, xnn_unary_leaky_relu, ¶ms); } static void xnnpack_leaky_relu_f32(benchmark::State& state) { - benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_leaky_relu_nc_f32( - - 0.01f /* negative slope */, flags, op); - }, - xnn_reshape_leaky_relu_nc_f32, xnn_setup_leaky_relu_nc_f32, state); + xnn_unary_params params; + params.leaky_relu.negative_slope = 0.1f; + benchmark_unary_operator(state, xnn_unary_leaky_relu, ¶ms); } static void xnnpack_leaky_relu_qs8(benchmark::State& state) { - benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_leaky_relu_nc_qs8( - 0.1f /* negative slope */, 5 /* input zero point */, - 0.75f /* input scale */, -5 /* output zero point */, - 0.5f /* output scale */, flags, op); - }, - xnn_reshape_leaky_relu_nc_qs8, xnn_setup_leaky_relu_nc_qs8, state); + xnn_unary_params params; + params.leaky_relu.negative_slope = 0.1f; + benchmark_unary_operator(state, xnn_unary_leaky_relu, ¶ms, {5, 0.75f}, {-5, 0.5f}); } static void xnnpack_leaky_relu_qu8(benchmark::State& state) { - benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_leaky_relu_nc_qu8( - 0.1f /* negative slope */, 5 /* input zero point */, - 0.75f /* input scale */, -5 /* output zero point */, - 0.5f /* output scale */, flags, op); - }, - xnn_reshape_leaky_relu_nc_qu8, xnn_setup_leaky_relu_nc_qu8, state); + xnn_unary_params params; + params.leaky_relu.negative_slope = 0.1f; + benchmark_unary_operator(state, xnn_unary_leaky_relu, ¶ms, {125, 0.75f}, {128, 0.5f}); } BENCHMARK(xnnpack_leaky_relu_f16) diff --git a/bench/models/fp32-attention.cc b/bench/models/fp32-attention.cc index 2af45db5c40f..8a6546ff4ed4 100644 --- a/bench/models/fp32-attention.cc +++ b/bench/models/fp32-attention.cc @@ -314,8 +314,10 @@ xnn_subgraph_t FP32Attention(size_t b, size_t t, size_t h, size_t n, size_t s) { return nullptr; } - status = xnn_define_tanh( + status = xnn_define_unary( subgraph, + xnn_unary_tanh, + /*params=*/nullptr, v7, v8, 0); diff --git a/bench/models/fp32-mobilenet-v3-large.cc b/bench/models/fp32-mobilenet-v3-large.cc index 1dff3f8a40df..ed86ce7fd95e 100644 --- a/bench/models/fp32-mobilenet-v3-large.cc +++ b/bench/models/fp32-mobilenet-v3-large.cc @@ -3449,8 +3449,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v1, v2, /*flags=*/0); @@ -4115,8 +4117,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v39, v40, /*flags=*/0); @@ -4144,8 +4148,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + 
xnn_unary_hardswish, + /*params=*/nullptr, v41, v42, /*flags=*/0); @@ -4194,8 +4200,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v44, v45, /*flags=*/0); @@ -4223,8 +4231,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v46, v47, /*flags=*/0); @@ -4287,8 +4297,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v50, v51, /*flags=*/0); @@ -4316,8 +4328,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v52, v53, /*flags=*/0); @@ -4380,8 +4394,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v56, v57, /*flags=*/0); @@ -4409,8 +4425,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v58, v59, /*flags=*/0); @@ -4473,8 +4491,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v62, v63, /*flags=*/0); @@ -4502,8 +4522,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v64, v65, /*flags=*/0); @@ -4634,8 +4656,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v72, v73, /*flags=*/0); @@ -4663,8 +4687,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v74, v75, /*flags=*/0); @@ -4809,8 +4835,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v83, v84, /*flags=*/0); @@ -4838,8 +4866,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v85, v86, /*flags=*/0); @@ -4970,8 +5000,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v93, v94, /*flags=*/0); @@ -4999,8 +5031,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v95, v96, /*flags=*/0); @@ -5145,8 +5179,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v104, v105, /*flags=*/0); @@ -5174,8 +5210,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + 
xnn_unary_hardswish, + /*params=*/nullptr, v106, v107, /*flags=*/0); @@ -5320,8 +5358,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v115, v116, /*flags=*/0); @@ -5364,8 +5404,10 @@ xnn_subgraph_t FP32MobileNetV3Large() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v118, v119, /*flags=*/0); diff --git a/bench/models/fp32-mobilenet-v3-small.cc b/bench/models/fp32-mobilenet-v3-small.cc index e5d2d9a11ccd..2c4188296e59 100644 --- a/bench/models/fp32-mobilenet-v3-small.cc +++ b/bench/models/fp32-mobilenet-v3-small.cc @@ -3027,8 +3027,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v1, v2, /*flags=*/0); @@ -3310,8 +3312,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v17, v18, /*flags=*/0); @@ -3339,8 +3343,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v19, v20, /*flags=*/0); @@ -3471,8 +3477,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v27, v28, /*flags=*/0); @@ -3500,8 +3508,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v29, v30, /*flags=*/0); @@ -3646,8 +3656,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v38, v39, /*flags=*/0); @@ -3675,8 +3687,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v40, v41, /*flags=*/0); @@ -3821,8 +3835,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v49, v50, /*flags=*/0); @@ -3850,8 +3866,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v51, v52, /*flags=*/0); @@ -3982,8 +4000,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v59, v60, /*flags=*/0); @@ -4011,8 +4031,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v61, v62, /*flags=*/0); @@ -4157,8 +4179,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v70, v71, /*flags=*/0); @@ -4186,8 +4210,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( 
subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v72, v73, /*flags=*/0); @@ -4318,8 +4344,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v80, v81, /*flags=*/0); @@ -4347,8 +4375,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v82, v83, /*flags=*/0); @@ -4493,8 +4523,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v91, v92, /*flags=*/0); @@ -4522,8 +4554,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v93, v94, /*flags=*/0); @@ -4668,8 +4702,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v102, v103, /*flags=*/0); @@ -4712,8 +4748,10 @@ xnn_subgraph_t FP32MobileNetV3Small() { return nullptr; } - status = xnn_define_hardswish( + status = xnn_define_unary( subgraph, + xnn_unary_hardswish, + /*params=*/nullptr, v105, v106, /*flags=*/0); diff --git a/bench/models/qd8-attention.cc b/bench/models/qd8-attention.cc index 2ce1a7162672..4176b311c282 100644 --- a/bench/models/qd8-attention.cc +++ b/bench/models/qd8-attention.cc @@ -57,7 +57,7 @@ xnn_subgraph_t QD8Attention(size_t batch_size, size_t seq_len, return nullptr; } - status = xnn_define_convert(subgraph, input_id, quantized_input_id, + status = xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, quantized_input_id, /*XNN_FLAG_MAYBE_PACK_FOR_GEMM=*/0x00000080); if (status != xnn_status_success) { std::cerr << "failed to create create convert " << std::endl; @@ -335,7 +335,7 @@ xnn_subgraph_t QD8Attention(size_t batch_size, size_t seq_len, } status = - xnn_define_convert(subgraph, outcome_reshaped_id, quantized_outcome_id, + xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, outcome_reshaped_id, quantized_outcome_id, /*XNN_FLAG_MAYBE_PACK_FOR_GEMM=*/0x00000080); if (status != xnn_status_success) { std::cerr << "failed to create create convert " << std::endl; diff --git a/bench/models/qs8-mobilenet-v2.cc b/bench/models/qs8-mobilenet-v2.cc index ae2aac572402..6de769b31a80 100644 --- a/bench/models/qs8-mobilenet-v2.cc +++ b/bench/models/qs8-mobilenet-v2.cc @@ -3086,8 +3086,10 @@ xnn_subgraph_t QS8MobileNetV2() { std::generate(w171_data.begin(), w171_data.end(), std::ref(qs8rng)); std::generate(w172_data.begin(), w172_data.end(), std::ref(qs32rng)); - status = xnn_define_convert( + status = xnn_define_unary( subgraph, + xnn_unary_convert, + /*params=*/nullptr, v0, v1, 0); @@ -4284,8 +4286,10 @@ xnn_subgraph_t QS8MobileNetV2() { return nullptr; } - status = xnn_define_convert( + status = xnn_define_unary( subgraph, + xnn_unary_convert, + /*params=*/nullptr, v65, v66, 0); diff --git a/bench/negate.cc b/bench/negate.cc index 4ca421d48872..caa5603b9231 100644 --- a/bench/negate.cc +++ b/bench/negate.cc @@ -13,15 +13,11 @@ #endif // BENCHMARK_TENSORFLOW_LITE static void xnnpack_negate_f16(benchmark::State& state) { - benchmark_unary_operator(xnn_create_negate_nc_f16, - xnn_reshape_negate_nc_f16, - xnn_setup_negate_nc_f16, state); + 
benchmark_unary_operator(state, xnn_unary_negate); } static void xnnpack_negate_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_negate_nc_f32, - xnn_reshape_negate_nc_f32, - xnn_setup_negate_nc_f32, state); + benchmark_unary_operator(state, xnn_unary_negate); } BENCHMARK(xnnpack_negate_f16) diff --git a/bench/qs16-qs8-vcvt.cc b/bench/qs16-qs8-vcvt.cc deleted file mode 100644 index 4ea6126a68d0..000000000000 --- a/bench/qs16-qs8-vcvt.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void qs16_qs8_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_qs16_qs8_vcvt_ukernel_fn cvt, - xnn_init_qs16_qs8_cvt_params_fn init_params) -{ - xnn_qs16_qs8_cvt_params params; - init_params(¶ms, 1.25f /* scale */, 1 /* output zero point */); - - cvt_benchmark(state, arch_flags, cvt, ¶ms); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(qs16_qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "qs16-qs8-vcvt/qs16-qs8-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qs8-f16-vcvt.cc b/bench/qs8-f16-vcvt.cc deleted file mode 100644 index bcf73e25966a..000000000000 --- a/bench/qs8-f16-vcvt.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/microparams.h" -#include "xnnpack/vcvt.h" -#include - -static void qs8_f16_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_qs8_f16_vcvt_ukernel_fn cvt, - xnn_init_qs8_f16_cvt_params_fn init_params) -{ - xnn_qs8_f16_cvt_params params; - init_params(¶ms, - 0.25f /* scale */, - 1 /* output zero point */); - - cvt_benchmark(state, arch_flags, cvt, ¶ms); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(qs8_f16_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "qs8-f16-vcvt/qs8-f16-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qs8-f32-vcvt.cc b/bench/qs8-f32-vcvt.cc deleted file mode 100644 index 5dc917aa950f..000000000000 --- a/bench/qs8-f32-vcvt.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void qs8_f32_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_qs8_f32_vcvt_ukernel_fn cvt, - xnn_init_qs8_f32_cvt_params_fn init_params) -{ - xnn_qs8_f32_cvt_params params; - init_params(¶ms, - 0.25f /* scale */, - 1 /* output zero point */); - - cvt_benchmark(state, arch_flags, cvt, ¶ms); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(qs8_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "qs8-f32-vcvt/qs8-f32-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qs8-vcvt.cc b/bench/qs8-vcvt.cc deleted file mode 100644 index 765c08b107bb..000000000000 --- a/bench/qs8-vcvt.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void qs8_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_qs8_vcvt_ukernel_fn cvt, - xnn_init_qs8_cvt_params_fn init_params) -{ - xnn_qs8_cvt_params params; - init_params(¶ms, 1.25f /* scale */, -1 /* input zero point */, 1 /* output zero point */); - - cvt_benchmark(state, arch_flags, cvt, ¶ms); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(qs8_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "qs8-vcvt/qs8-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qu8-f32-vcvt.cc b/bench/qu8-f32-vcvt.cc deleted file mode 100644 index f90bf47d2038..000000000000 --- a/bench/qu8-f32-vcvt.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void qu8_f32_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_qu8_f32_vcvt_ukernel_fn cvt, - xnn_init_qu8_f32_cvt_params_fn init_params) -{ - xnn_qu8_f32_cvt_params params; - init_params(¶ms, - 0.25f /* scale */, - 127 /* output zero point */); - - cvt_benchmark(state, arch_flags, cvt, ¶ms); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(qu8_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "qu8-f32-vcvt/qu8-f32-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/qu8-vcvt.cc b/bench/qu8-vcvt.cc deleted file mode 100644 index f43be113cd2b..000000000000 --- a/bench/qu8-vcvt.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void qu8_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_qu8_vcvt_ukernel_fn cvt, - xnn_init_qu8_cvt_params_fn init_params) -{ - xnn_qu8_cvt_params params; - init_params(¶ms, 1.25f /* scale */, 127 /* input zero point */, 129 /* output zero point */); - - cvt_benchmark(state, arch_flags, cvt, ¶ms); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(qu8_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "qu8-vcvt/qu8-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/reciprocal-square-root.cc b/bench/reciprocal-square-root.cc index 2ce8ebe1003e..c04e420821ec 100644 --- a/bench/reciprocal-square-root.cc +++ b/bench/reciprocal-square-root.cc @@ -14,9 +14,7 @@ static void xnnpack_reciprocal_square_root_f32(benchmark::State& state) { benchmark_unary_operator( - xnn_create_reciprocal_square_root_nc_f32, - xnn_reshape_reciprocal_square_root_nc_f32, - xnn_setup_reciprocal_square_root_nc_f32, state); + state, xnn_unary_reciprocal_square_root); } BENCHMARK(xnnpack_reciprocal_square_root_f32) diff --git a/bench/s32-f32-vcvt.cc b/bench/s32-f32-vcvt.cc deleted file mode 100644 index a90c238e95f9..000000000000 --- a/bench/s32-f32-vcvt.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void s32_f32_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_s32_f32_vcvt_ukernel_fn cvt, - xnn_init_s32_f32_cvt_params_fn init_params) -{ - xnn_s32_f32_cvt_params params; - init_params(&params, /*zero_point=*/0); - - cvt_benchmark(state, arch_flags, cvt, &params); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(s32_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "s32-f32-vcvt/s32-f32-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/sigmoid.cc b/bench/sigmoid.cc index 1c34fcdb52e8..4368b88da816 100644 --- a/bench/sigmoid.cc +++ b/bench/sigmoid.cc @@ -18,39 +18,21 @@ static void xnnpack_sigmoid_f16(benchmark::State& state) { - benchmark_unary_operator(xnn_create_sigmoid_nc_f16, - xnn_reshape_sigmoid_nc_f16, - xnn_setup_sigmoid_nc_f16, state); + benchmark_unary_operator(state, xnn_unary_sigmoid); } static void xnnpack_sigmoid_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_sigmoid_nc_f32, - xnn_reshape_sigmoid_nc_f32, - xnn_setup_sigmoid_nc_f32, state); + benchmark_unary_operator(state, xnn_unary_sigmoid); } static void xnnpack_sigmoid_qs8(benchmark::State& state) { benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_sigmoid_nc_qs8( - 1 /* input zero point */, 1.0f /* input scale */, - -128 /* output zero point */, 1.0f / 256.0f /* output scale */, - std::numeric_limits::min() /* output min */, - std::numeric_limits::max() /* output max */, flags, op); - }, - xnn_reshape_sigmoid_nc_qs8, xnn_setup_sigmoid_nc_qs8, state); + state, xnn_unary_sigmoid, nullptr, {1, 1.0f}, {-128, 1.0f / 256.0f}); } static void xnnpack_sigmoid_qu8(benchmark::State& state) { - benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_sigmoid_nc_qu8( - 128 /* input zero point */, 1.0f /* input scale */, - 0 /* output zero point */, 1.0f / 256.0f /* output scale */, - std::numeric_limits::min() /* output min */, - std::numeric_limits::max() /* output max */, flags, op); - }, - xnn_reshape_sigmoid_nc_qu8, xnn_setup_sigmoid_nc_qu8, state); + benchmark_unary_operator( + state, xnn_unary_sigmoid, nullptr, {128, 1.0f}, {0, 1.0f / 256.0f}); } BENCHMARK(xnnpack_sigmoid_f16) diff --git a/bench/square-root.cc b/bench/square-root.cc index e7d56c97a060..522adb8ddc34 100644 --- a/bench/square-root.cc +++ b/bench/square-root.cc @@ -14,14 +14,11 @@ static void xnnpack_square_root_f16(benchmark::State& state) { benchmark_unary_operator( - xnn_create_square_root_nc_f16, xnn_reshape_square_root_nc_f16, - xnn_setup_square_root_nc_f16, state); + state, xnn_unary_square_root); } static void xnnpack_square_root_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_square_root_nc_f32, - xnn_reshape_square_root_nc_f32, - xnn_setup_square_root_nc_f32, state); + benchmark_unary_operator(state, xnn_unary_square_root); } BENCHMARK(xnnpack_square_root_f16) diff --git a/bench/square.cc b/bench/square.cc index 17d1c4fbc90a..5be549fe4d07 100644 --- a/bench/square.cc +++ b/bench/square.cc @@ 
-14,15 +14,11 @@ static void xnnpack_square_f16(benchmark::State& state) { - benchmark_unary_operator(xnn_create_square_nc_f16, - xnn_reshape_square_nc_f16, - xnn_setup_square_nc_f16, state); + benchmark_unary_operator(state, xnn_unary_square); } static void xnnpack_square_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_square_nc_f32, - xnn_reshape_square_nc_f32, - xnn_setup_square_nc_f32, state); + benchmark_unary_operator(state, xnn_unary_square); } BENCHMARK(xnnpack_square_f16) diff --git a/bench/tanh.cc b/bench/tanh.cc index 02b8f8113b02..c6f385c42c48 100644 --- a/bench/tanh.cc +++ b/bench/tanh.cc @@ -18,39 +18,21 @@ static void xnnpack_tanh_f16(benchmark::State& state) { - benchmark_unary_operator(xnn_create_tanh_nc_f16, - xnn_reshape_tanh_nc_f16, - xnn_setup_tanh_nc_f16, state); + benchmark_unary_operator(state, xnn_unary_tanh); } static void xnnpack_tanh_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_tanh_nc_f32, - xnn_reshape_tanh_nc_f32, - xnn_setup_tanh_nc_f32, state); + benchmark_unary_operator(state, xnn_unary_tanh); } static void xnnpack_tanh_qs8(benchmark::State& state) { benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_tanh_nc_qs8( - 1 /* input zero point */, 1.0f /* input scale */, - 0 /* output zero point */, 1.0f / 128.0f /* output scale */, - std::numeric_limits::min() /* output min */, - std::numeric_limits::max() /* output max */, flags, op); - }, - xnn_reshape_tanh_nc_qs8, xnn_setup_tanh_nc_qs8, state); + state, xnn_unary_tanh, nullptr, {1, 1.0f}, {0, 1.0f / 128.0f}); } static void xnnpack_tanh_qu8(benchmark::State& state) { benchmark_unary_operator( - [](uint32_t flags, xnn_operator_t* op) { - return xnn_create_tanh_nc_qu8( - 128 /* input zero point */, 1.0f /* input scale */, - 128 /* output zero point */, 1.0f / 128.0f /* output scale */, - std::numeric_limits::min() /* output min */, - std::numeric_limits::max() /* output max */, flags, op); - }, - xnn_reshape_tanh_nc_qu8, xnn_setup_tanh_nc_qu8, state); + state, xnn_unary_tanh, nullptr, {128, 1.0f}, {128, 1.0f / 128.0f}); } BENCHMARK(xnnpack_tanh_f16) diff --git a/bench/truncation.cc b/bench/truncation.cc deleted file mode 100644 index 4b5b3d780d99..000000000000 --- a/bench/truncation.cc +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "xnnpack.h" - -#include "unary_operator.h" -#include "utils.h" -#include -#ifdef BENCHMARK_TENSORFLOW_LITE -#include "tensorflow/lite/schema/schema_generated.h" -#endif // BENCHMARK_TENSORFLOW_LITE - -static void xnnpack_truncation_f32(benchmark::State& state) { - benchmark_unary_operator(xnn_create_truncation_nc_f32, - xnn_reshape_truncation_nc_f32, - xnn_setup_truncation_nc_f32, state); -} - -BENCHMARK(xnnpack_truncation_f32) - ->Apply(benchmark::utils::UnaryElementwiseParameters) - ->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/u32-f32-vcvt.cc b/bench/u32-f32-vcvt.cc deleted file mode 100644 index 4f570cf1cad1..000000000000 --- a/bench/u32-f32-vcvt.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/u32-f32-vcvt.yaml -// Generator: tools/generate-vcvt-test.py - - -#include -#include "utils.h" -#include "vcvt-benchmark.h" -#include "xnnpack.h" -#include "xnnpack/hardware-config.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/microparams-init.h" -#include "xnnpack/vcvt.h" - -static void u32_f32_vcvt( - benchmark::State& state, - uint64_t arch_flags, - xnn_u32_f32_vcvt_ukernel_fn cvt, - xnn_init_u32_f32_cvt_params_fn init_params) -{ - xnn_u32_f32_cvt_params params; - init_params(&params, /*zero_point=*/0); - - cvt_benchmark(state, arch_flags, cvt, &params); -} - -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params)\ -BENCHMARK_CAPTURE(u32_f32_vcvt, ukernel, arch_flags, ukernel, init_params) \ - ->Apply(benchmark::utils::UnaryElementwiseParameters) \ - ->UseRealTime(); -#include "u32-f32-vcvt/u32-f32-vcvt.h" -#undef XNN_CVT_UKERNEL_WITH_PARAMS - - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/bench/unary_operator.h b/bench/unary_operator.h index 2ad458acdbed..c3e0846e688c 100644 --- a/bench/unary_operator.h +++ b/bench/unary_operator.h @@ -31,10 +31,12 @@ #include "tensorflow/lite/version.h" #endif // BENCHMARK_TENSORFLOW_LITE -template -static void benchmark_unary_operator(Create create, Reshape reshape, - Setup setup, benchmark::State& state) { +template +static void benchmark_unary_operator( + benchmark::State& state, xnn_unary_operator op_type, + const xnn_unary_params* params = nullptr, + const xnn_quantization_params& input_quantization = {0, 1.0f}, + const xnn_quantization_params& output_quantization = {0, 1.0f}) { const size_t batch_size = state.range(0); std::random_device random_device; @@ -52,21 +54,24 @@ static void benchmark_unary_operator(Create create, Reshape reshape, } xnn_operator_t op = nullptr; - status = create(0 /* flags */, &op); + status = xnn_create_unary_elementwise_nc( + op_type, xnnpack::datatype_of(), xnnpack::datatype_of(), params, + &input_quantization, &output_quantization, 0 /* flags */, &op); if (status != xnn_status_success || op == nullptr) { state.SkipWithError("failed to create Abs operator"); return; } - status = reshape(op, batch_size, - /*channels=*/1, /*input_stride=*/1, /*output_stride=*/1, - /*threadpool=*/nullptr); + status = xnn_reshape_unary_elementwise_nc(op, batch_size, + /*channels=*/1, /*input_stride=*/1, + /*output_stride=*/1, + /*threadpool=*/nullptr); if (status != xnn_status_success) { state.SkipWithError("failed to reshape Abs operator"); return; } - status = setup(op, input.data(), output.data()); + status = xnn_setup_unary_elementwise_nc(op, input.data(), output.data()); if (status != xnn_status_success) { state.SkipWithError("failed to setup Abs operator"); return; diff --git a/bench/vcvt-benchmark.h b/bench/vcvt-benchmark.h deleted file mode 100644 index a97a235fc66e..000000000000 --- a/bench/vcvt-benchmark.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
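Under the hood the helper above still follows the create → reshape → setup lifecycle, now through the single set of entry points declared in the include/xnnpack.h hunk further down. A sketch of the sequence for an f32 abs operator, assuming xnn_datatype_fp32 as the datatype tag, the default {0, 1.0f} quantization, and caller-provided batch_size, input, and output:

xnn_operator_t op = nullptr;
const struct xnn_quantization_params input_quantization = {0, 1.0f};
const struct xnn_quantization_params output_quantization = {0, 1.0f};
// Create: the operator enum plus the datatype pair select the kernel.
enum xnn_status status = xnn_create_unary_elementwise_nc(
    xnn_unary_abs, xnn_datatype_fp32, xnn_datatype_fp32,
    /*params=*/nullptr, &input_quantization, &output_quantization,
    /*flags=*/0, &op);
if (status == xnn_status_success) {
  // Reshape: same batch/channel/stride arguments as the old per-op calls.
  status = xnn_reshape_unary_elementwise_nc(
      op, batch_size, /*channels=*/1, /*input_stride=*/1,
      /*output_stride=*/1, /*threadpool=*/nullptr);
}
if (status == xnn_status_success) {
  // Setup: bind the input and output buffers.
  status = xnn_setup_unary_elementwise_nc(op, input, output);
}

Because the datatype pair is an argument, conversions need no dedicated operator either: xnn_unary_convert with differing input and output datatypes covers what the removed convert_nc_* entry points did.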
- -#include -#include -#include -#include - -#include "utils.h" -#include "xnnpack.h" -#include "xnnpack/buffer.h" -#include - -namespace { - -template -void cvt_benchmark( - benchmark::State& state, - uint64_t arch_flags, - UKernelFn cvt, - const Params* params) -{ - if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { - return; - } - - const size_t num_elements = state.range(0); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(0.0f, 10.0f), std::ref(rng)); - - xnnpack::Buffer x(num_elements + - XNN_EXTRA_BYTES / sizeof(In)); - xnnpack::Buffer y(num_elements); - std::generate(x.begin(), x.end(), f32rng); - - for (auto _ : state) { - cvt(num_elements * sizeof(In), x.data(), y.data(), params); - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t elements_per_iteration = num_elements; - state.counters["elements"] = - benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = num_elements * (sizeof(In) + sizeof(Out)); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -template -void cvt_benchmark( - benchmark::State& state, - uint64_t arch_flags, - UKernelFn cvt, - std::nullptr_t) { - cvt_benchmark(state, arch_flags, cvt, nullptr); -} - -} // namespace diff --git a/bench/vunary.cc b/bench/vunary.cc index 30ef008f441e..0e336c0cf1e8 100644 --- a/bench/vunary.cc +++ b/bench/vunary.cc @@ -23,6 +23,7 @@ #include "xnnpack/microfnptr.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" +#include "xnnpack/vcvt.h" #include "xnnpack/vunary.h" #include @@ -48,7 +49,7 @@ struct UniformDistribution { template <> struct UniformDistribution { - std::uniform_int_distribution dist{ + std::uniform_int_distribution dist{ std::numeric_limits::lowest(), std::numeric_limits::max()}; @@ -60,7 +61,7 @@ struct UniformDistribution { template <> struct UniformDistribution { - std::uniform_int_distribution dist{ + std::uniform_int_distribution dist{ std::numeric_limits::lowest(), std::numeric_limits::max()}; @@ -70,81 +71,54 @@ struct UniformDistribution { } }; -template -T make_params(InitFn init_fn, Args... 
args) { - T result; - init_fn(&result, args...); - return result; -} - -template -struct Config { - Params params; -}; - -template <> -struct Config { - xnn_f16_minmax_params params = {{-1.0f, 1.0f}}; -}; - -template <> -struct Config { - xnn_f32_minmax_params params = {{-1.0f, 1.0f}}; -}; - template <> -struct Config { - xnn_f16_elu_params params = {{1.0f, 1.0f, 1.0f}}; -}; +struct UniformDistribution { + std::uniform_int_distribution dist{ + std::numeric_limits::lowest(), + std::numeric_limits::max()}; -template <> -struct Config { - xnn_f32_elu_params params = {{1.0f, 1.0f, 1.0f}}; -}; - -template <> -struct Config { - xnn_f16_lrelu_params params = {{0.01f}}; -}; - -template <> -struct Config { - xnn_f32_lrelu_params params = {{0.01f}}; -}; - -template <> -struct Config { - xnn_s8_minmax_params params = {{-100, 100}}; + template + int16_t operator()(Generator& g) { + return dist(g); + } }; template <> -struct Config { - xnn_u8_minmax_params params = {{0, 200}}; -}; +struct UniformDistribution { + std::uniform_int_distribution dist{ + std::numeric_limits::lowest(), + std::numeric_limits::max()}; -template <> -struct Config { - xnn_qs8_lrelu_params params = make_params( - xnn_init_qs8_lrelu_scalar_params, 0.1f, 1.0f, 1, 1); + template + int32_t operator()(Generator& g) { + return dist(g); + } }; template <> -struct Config { - xnn_qu8_lrelu_params params = make_params( - xnn_init_qu8_lrelu_scalar_params, 0.1f, 1.0f, 1, 1); -}; +struct UniformDistribution { + std::uniform_int_distribution dist{ + std::numeric_limits::lowest(), + std::numeric_limits::max()}; -template <> -struct Config { - xnn_qs8_hswish_params params = make_params( - xnn_init_qs8_hswish_scalar_params, 0, 0, 1.0f, 1.0f); + template + uint32_t operator()(Generator& g) { + return dist(g); + } }; -template <> -struct Config { - xnn_qu8_hswish_params params = make_params( - xnn_init_qu8_hswish_scalar_params, 0, 0, 1.0f, 1.0f); -}; +template +xnn_quantization_params InputQuantization(T) { + return {0, 1.0f}; +} +template +xnn_quantization_params OutputQuantization(T) { + return {0, 1.0f}; +} +xnn_quantization_params InputQuantization(int8_t) { return {1, 0.5f}; } +xnn_quantization_params OutputQuantization(int8_t) { return {-1, 0.7f}; } +xnn_quantization_params InputQuantization(uint8_t) { return {129, 0.5f}; } +xnn_quantization_params OutputQuantization(uint8_t) { return {127, 0.7f}; } // Microkernel function, templated on the `params` type. 
template @@ -153,25 +127,36 @@ using UKernelFn = void (*)(size_t, const TIn*, TOut*, template void vunary(benchmark::State& state, uint64_t arch_flags, - UKernelFn ukernel) { + UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, + const xnn_unary_params* params = nullptr, + const xnn_quantization_params& input_quantization = + InputQuantization(TIn()), + const xnn_quantization_params& output_quantization = + OutputQuantization(TOut())) { if (!benchmark::utils::CheckArchFlags(state, arch_flags)) { return; } const size_t num_elements = state.range(0); - Config config; + xnn_unary_uparams uparams; + if (init_params) { + init_params(&uparams, params, &input_quantization, &output_quantization); + } std::random_device random_device; auto rng = std::mt19937(random_device()); UniformDistribution dist; - xnnpack::Buffer x(num_elements); + xnnpack::Buffer x( + num_elements + XNN_EXTRA_BYTES / sizeof(TIn)); xnnpack::Buffer y(num_elements); std::generate(x.begin(), x.end(), [&]() { return dist(rng); }); for (auto _ : state) { - ukernel(num_elements * sizeof(TOut), x.data(), y.data(), &config.params); + ukernel(num_elements * sizeof(TIn), x.data(), y.data(), + (UKernelParams*)&uparams); } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); @@ -193,15 +178,12 @@ void vunary(benchmark::State& state, uint64_t arch_flags, #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ datatype, params_type, init_params) \ - BENCHMARK_CAPTURE(vunary, ukernel, arch_flags, ukernel) \ + BENCHMARK_CAPTURE(vunary, ukernel, arch_flags, ukernel, init_params) \ ->Apply( \ benchmark::utils::UnaryElementwiseParameters) \ ->UseRealTime(); #include "f16-vabs/f16-vabs.h" -#include "f16-vclamp/f16-vclamp.h" -#include "f16-velu/f16-velu.h" #include "f16-vhswish/f16-vhswish.h" -#include "f16-vlrelu/f16-vlrelu.h" #include "f16-vneg/f16-vneg.h" #include "f16-vrnd/f16-vrndd.h" #include "f16-vrnd/f16-vrndne.h" @@ -213,12 +195,9 @@ void vunary(benchmark::State& state, uint64_t arch_flags, #include "f16-vsqrt/f16-vsqrt.h" #include "f16-vtanh/f16-vtanh.h" #include "f32-vabs/f32-vabs.h" -#include "f32-vclamp/f32-vclamp.h" -#include "f32-velu/f32-velu.h" #include "f32-vgelu/f32-vgelu.h" #include "f32-vhswish/f32-vhswish.h" #include "f32-vlog/f32-vlog.h" -#include "f32-vlrelu/f32-vlrelu.h" #include "f32-vneg/f32-vneg.h" #include "f32-vrelu/f32-vrelu.h" #include "f32-vrnd/f32-vrndd.h" @@ -231,13 +210,97 @@ void vunary(benchmark::State& state, uint64_t arch_flags, #include "f32-vsqrt/f32-vsqrt.h" #include "f32-vtanh/f32-vtanh.h" #include "qs8-vhswish/qs8-vhswish.h" -#include "qs8-vlrelu/qs8-vlrelu.h" #include "qu8-vhswish/qu8-vhswish.h" -#include "qu8-vlrelu/qu8-vlrelu.h" +#undef XNN_UKERNEL_WITH_PARAMS + +template +void velu(benchmark::State& state, uint64_t arch_flags, + UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params) { + xnn_unary_params params; + params.elu.alpha = 1.0f; + vunary(state, arch_flags, ukernel, init_params, ¶ms); +} + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype, params_type, init_params) \ + BENCHMARK_CAPTURE(velu, ukernel, arch_flags, ukernel, init_params) \ + ->Apply( \ + benchmark::utils::UnaryElementwiseParameters) \ + ->UseRealTime(); +#include "f16-velu/f16-velu.h" +#include "f32-velu/f32-velu.h" +#undef XNN_UKERNEL_WITH_PARAMS + +template +void vclamp(benchmark::State& state, uint64_t arch_flags, + UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params) { + xnn_unary_params params; + params.clamp.min = 
-1.0f; + params.clamp.max = 1.0f; + // These kernels cannot handle changing quantization parameters. + xnn_quantization_params input_quantization = {0, 1.0f}; + xnn_quantization_params output_quantization = {0, 1.0f}; + vunary(state, arch_flags, ukernel, init_params, ¶ms, input_quantization, + output_quantization); +} + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype, params_type, init_params) \ + BENCHMARK_CAPTURE(vclamp, ukernel, arch_flags, ukernel, init_params) \ + ->Apply( \ + benchmark::utils::UnaryElementwiseParameters) \ + ->UseRealTime(); +#include "f16-vclamp/f16-vclamp.h" +#include "f32-vclamp/f32-vclamp.h" #include "s8-vclamp/s8-vclamp.h" #include "u8-vclamp/u8-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS +template +void vlrelu(benchmark::State& state, uint64_t arch_flags, + UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params) { + xnn_unary_params params; + params.leaky_relu.negative_slope = 0.5f; + vunary(state, arch_flags, ukernel, init_params, ¶ms); +} + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype, params_type, init_params) \ + BENCHMARK_CAPTURE(vlrelu, ukernel, arch_flags, ukernel, init_params) \ + ->Apply( \ + benchmark::utils::UnaryElementwiseParameters) \ + ->UseRealTime(); +#include "f16-vlrelu/f16-vlrelu.h" +#include "f32-vlrelu/f32-vlrelu.h" +#include "qs8-vlrelu/qs8-vlrelu.h" +#include "qu8-vlrelu/qu8-vlrelu.h" +#undef XNN_UKERNEL_WITH_PARAMS + +#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, \ + vector_tile, datatype_in, datatype_out, \ + params_type, init_params) \ + BENCHMARK_CAPTURE(vunary, ukernel, arch_flags, ukernel, init_params) \ + ->Apply(benchmark::utils::UnaryElementwiseParameters) \ + ->UseRealTime(); +#include "f16-f32-vcvt/f16-f32-vcvt.h" +#include "f16-qs8-vcvt/f16-qs8-vcvt.h" +#include "f32-f16-vcvt/f32-f16-vcvt.h" +#include "f32-qs8-vcvt/f32-qs8-vcvt.h" +#include "f32-qu8-vcvt/f32-qu8-vcvt.h" +#include "qs16-qs8-vcvt/qs16-qs8-vcvt.h" +#include "qs8-f16-vcvt/qs8-f16-vcvt.h" +#include "qs8-f32-vcvt/qs8-f32-vcvt.h" +#include "qs8-vcvt/qs8-vcvt.h" +#include "qu8-f32-vcvt/qu8-f32-vcvt.h" +#include "qu8-vcvt/qu8-vcvt.h" +#include "s32-f32-vcvt/s32-f32-vcvt.h" +#include "u32-f32-vcvt/u32-f32-vcvt.h" +#undef XNN_CVT_UKERNEL_WITH_PARAMS + #ifndef XNNPACK_BENCHMARK_NO_MAIN BENCHMARK_MAIN(); #endif diff --git a/build_srcs.bzl b/build_srcs.bzl index 407d13b01863..bd75d4b55b4d 100644 --- a/build_srcs.bzl +++ b/build_srcs.bzl @@ -20,7 +20,6 @@ OPERATOR_SRCS = [ "src/operators/deconvolution-nhwc.c", "src/operators/dynamic-fully-connected-nc.c", "src/operators/fully-connected-nc.c", - "src/operators/lut-elementwise-nc.c", "src/operators/max-pooling-nhwc.c", "src/operators/pack-lh.c", "src/operators/reduce-nd.c", @@ -39,50 +38,33 @@ SUBGRAPH_SRCS = [ "src/memory-planner.c", "src/runtime.c", "src/subgraph.c", - "src/subgraph/abs.c", "src/subgraph/argmax-pooling-2d.c", "src/subgraph/average-pooling-2d.c", - "src/subgraph/bankers-rounding.c", "src/subgraph/batch-matrix-multiply.c", "src/subgraph/binary.c", - "src/subgraph/ceiling.c", - "src/subgraph/clamp.c", "src/subgraph/concatenate.c", - "src/subgraph/convert.c", "src/subgraph/convolution-2d.c", "src/subgraph/copy.c", "src/subgraph/deconvolution-2d.c", "src/subgraph/deprecated.c", "src/subgraph/depth-to-space-2d.c", "src/subgraph/depthwise-convolution-2d.c", - "src/subgraph/elu.c", "src/subgraph/even-split.c", - "src/subgraph/exp.c", - "src/subgraph/floor.c", "src/subgraph/fully-connected-sparse.c", 
"src/subgraph/fully-connected.c", - "src/subgraph/gelu.c", - "src/subgraph/hardswish.c", - "src/subgraph/leaky-relu.c", - "src/subgraph/log.c", "src/subgraph/max-pooling-2d.c", - "src/subgraph/negate.c", "src/subgraph/pack-lh.c", - "src/subgraph/reciprocal-square-root.c", "src/subgraph/reshape-helpers.c", "src/subgraph/rope.c", "src/subgraph/scaled-dot-product-attention.c", - "src/subgraph/sigmoid.c", "src/subgraph/softmax.c", "src/subgraph/space-to-depth-2d.c", - "src/subgraph/square-root.c", - "src/subgraph/square.c", "src/subgraph/static-constant-pad.c", "src/subgraph/static-reduce.c", "src/subgraph/static-resize-bilinear-2d.c", "src/subgraph/static-slice.c", "src/subgraph/static-transpose.c", - "src/subgraph/tanh.c", + "src/subgraph/unary.c", "src/subgraph/unpooling-2d.c", "src/subgraph/validation.c", "src/tensor.c", diff --git a/include/xnnpack.h b/include/xnnpack.h index 7d32db7ce00e..0a3ae6a47084 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -31,6 +31,9 @@ extern "C" { /// Maximum number of dimensions in tensor shape. #define XNN_MAX_TENSOR_DIMS 6 +/// A value ID that cannot be valid. +#define XNN_INVALID_VALUE_ID UINT32_MAX + /// Allow sparse inference in a Runtime. /// /// Note: this flag is a hint to XNNPACK that it should consider sparse inference, but does not guarantee it. @@ -471,6 +474,61 @@ enum xnn_status xnn_define_dynamically_quantized_tensor_value( uint32_t flags, uint32_t* id_out); +/// Type of unary operation +enum xnn_unary_operator { + xnn_unary_invalid = -1, + xnn_unary_convert, + xnn_unary_clamp, + xnn_unary_abs, + xnn_unary_bankers_rounding, + xnn_unary_ceiling, + xnn_unary_elu, + xnn_unary_exp, + xnn_unary_floor, + xnn_unary_gelu, + xnn_unary_hardswish, + xnn_unary_leaky_relu, + xnn_unary_log, + xnn_unary_negate, + xnn_unary_sigmoid, + xnn_unary_square, + xnn_unary_square_root, + xnn_unary_reciprocal_square_root, + xnn_unary_tanh, +}; + +/// Parameters for use by xnn_unary_operator_clamp +union xnn_unary_params { + struct { + // TODO: Should be double, to exactly represent int32? + float min; + float max; + } clamp; + struct { + float alpha; + } elu; + struct { + float negative_slope; + } leaky_relu; +}; + +/// Define a unary operator Node and add it to a Subgraph. +/// +/// @param subgraph - a Subgraph object that will own the created Node. +/// @param operator - type of unary operator to define. +/// @param input_id - Value ID for the input tensor. The input tensor must be defined in the @a subgraph. +/// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its +/// shape must match the shape of the input tensor. +/// @param params - parameters to be interpreted by the specific operator type. +/// @param flags - binary features of the Node. No supported flags are currently defined. +enum xnn_status xnn_define_unary( + xnn_subgraph_t subgraph, + enum xnn_unary_operator type, + const union xnn_unary_params* params, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); + /// Define a Convert Node and add it to a Subgraph. /// /// @param subgraph - a Subgraph object that will own the created Node. @@ -478,7 +536,7 @@ enum xnn_status xnn_define_dynamically_quantized_tensor_value( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Convert Node. No supported flags are currently defined. 
/// Define a Convert Node and add it to a Subgraph. /// /// @param subgraph - a Subgraph object that will own the created Node. /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Convert Node. No supported flags are currently defined. -enum xnn_status xnn_define_convert( +XNN_DEPRECATED enum xnn_status xnn_define_convert( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1686,7 +1744,7 @@ enum xnn_status xnn_define_rope( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Abs Node. No supported flags are currently defined. -enum xnn_status xnn_define_abs( +XNN_DEPRECATED enum xnn_status xnn_define_abs( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1699,7 +1757,7 @@ enum xnn_status xnn_define_abs( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Bankers' Rounding Node. No supported flags are currently defined. -enum xnn_status xnn_define_bankers_rounding( +XNN_DEPRECATED enum xnn_status xnn_define_bankers_rounding( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1739,7 +1797,7 @@ enum xnn_status xnn_define_batch_matrix_multiply( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Ceiling Node. No supported flags are currently defined. -enum xnn_status xnn_define_ceiling( +XNN_DEPRECATED enum xnn_status xnn_define_ceiling( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1754,7 +1812,7 @@ enum xnn_status xnn_define_ceiling( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Clamp Node. No supported flags are currently defined. -enum xnn_status xnn_define_clamp( +XNN_DEPRECATED enum xnn_status xnn_define_clamp( xnn_subgraph_t subgraph, float output_min, float output_max, @@ -1770,7 +1828,7 @@ enum xnn_status xnn_define_clamp( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the ELU Node. No supported flags are currently defined. -enum xnn_status xnn_define_elu( +XNN_DEPRECATED enum xnn_status xnn_define_elu( xnn_subgraph_t subgraph, float alpha, uint32_t input_id, @@ -1784,7 +1842,7 @@ enum xnn_status xnn_define_elu( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Exp Node. No supported flags are currently defined. -enum xnn_status xnn_define_exp( +XNN_DEPRECATED enum xnn_status xnn_define_exp( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1797,7 +1855,7 @@ enum xnn_status xnn_define_exp( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Floor Node. No supported flags are currently defined.
-enum xnn_status xnn_define_floor( +XNN_DEPRECATED enum xnn_status xnn_define_floor( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1810,7 +1868,7 @@ enum xnn_status xnn_define_floor( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the GELU Node. No supported flags are currently defined. -enum xnn_status xnn_define_gelu( +XNN_DEPRECATED enum xnn_status xnn_define_gelu( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1823,7 +1881,7 @@ enum xnn_status xnn_define_gelu( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the HardSwish Node. No supported flags are currently defined. -enum xnn_status xnn_define_hardswish( +XNN_DEPRECATED enum xnn_status xnn_define_hardswish( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1837,7 +1895,7 @@ enum xnn_status xnn_define_hardswish( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Leaky ReLU Node. No supported flags are currently defined. -enum xnn_status xnn_define_leaky_relu( +XNN_DEPRECATED enum xnn_status xnn_define_leaky_relu( xnn_subgraph_t subgraph, float negative_slope, uint32_t input_id, @@ -1851,7 +1909,7 @@ enum xnn_status xnn_define_leaky_relu( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Log Node. No supported flags are currently defined. -enum xnn_status xnn_define_log( +XNN_DEPRECATED enum xnn_status xnn_define_log( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1864,7 +1922,7 @@ enum xnn_status xnn_define_log( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Negate Node. No supported flags are currently defined. -enum xnn_status xnn_define_negate( +XNN_DEPRECATED enum xnn_status xnn_define_negate( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1877,7 +1935,7 @@ enum xnn_status xnn_define_negate( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Sigmoid Node. No supported flags are currently defined. -enum xnn_status xnn_define_sigmoid( +XNN_DEPRECATED enum xnn_status xnn_define_sigmoid( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1925,7 +1983,7 @@ enum xnn_status xnn_define_space_to_depth_2d( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Square Node. No supported flags are currently defined. 
-enum xnn_status xnn_define_square( +XNN_DEPRECATED enum xnn_status xnn_define_square( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1938,7 +1996,7 @@ enum xnn_status xnn_define_square( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Square Root Node. No supported flags are currently defined. -enum xnn_status xnn_define_square_root( +XNN_DEPRECATED enum xnn_status xnn_define_square_root( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -1954,10 +2012,11 @@ enum xnn_status xnn_define_square_root( /// shape must match the shape of the input tensor. /// @param flags - binary features of the Square Root Node. No supported flags /// are currently defined. -enum xnn_status xnn_define_reciprocal_square_root(xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags); +XNN_DEPRECATED enum xnn_status xnn_define_reciprocal_square_root( + xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags); /// Define a Static Slice Node add it to a Subgraph. /// @@ -2006,7 +2065,7 @@ enum xnn_status xnn_define_static_transpose( /// @param output_id - Value ID for the output tensor. The output tensor must be defined in the @a subgraph, and its /// shape must match the shape of the input tensor. /// @param flags - binary features of the Tanh Node. No supported flags are currently defined. -enum xnn_status xnn_define_tanh( +XNN_DEPRECATED enum xnn_status xnn_define_tanh( xnn_subgraph_t subgraph, uint32_t input_id, uint32_t output_id, @@ -2330,49 +2389,47 @@ enum xnn_status xnn_run_binary_elementwise_nd( void* output, pthreadpool_t threadpool); -enum xnn_status xnn_create_abs_nc_f16( +enum xnn_status xnn_create_unary_elementwise_nc( + enum xnn_unary_operator op_type, + enum xnn_datatype input_datatype, + enum xnn_datatype output_datatype, + const union xnn_unary_params* params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization, uint32_t flags, - xnn_operator_t* abs_op_out); + xnn_operator_t* op_out); -enum xnn_status xnn_reshape_abs_nc_f16( - xnn_operator_t abs_op, +enum xnn_status xnn_reshape_unary_elementwise_nc( + xnn_operator_t op, size_t batch_size, size_t channels, size_t input_stride, size_t output_stride, pthreadpool_t threadpool); -enum xnn_status xnn_setup_abs_nc_f16( - xnn_operator_t abs_op, +enum xnn_status xnn_setup_unary_elementwise_nc( + xnn_operator_t op, const void* input, void* output); -enum xnn_status xnn_create_abs_nc_f32( +enum xnn_status xnn_run_unary_elementwise_nc( + // create parameters + enum xnn_unary_operator op_type, + enum xnn_datatype input_datatype, + enum xnn_datatype output_datatype, + const union xnn_unary_params* params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization, uint32_t flags, - xnn_operator_t* abs_op_out); - -enum xnn_status xnn_reshape_abs_nc_f32( - xnn_operator_t abs_op, + // reshape parameters size_t batch_size, size_t channels, size_t input_stride, size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_abs_nc_f32( - xnn_operator_t abs_op, - const float* input, - float* output); - -enum xnn_status xnn_run_abs_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - 
uint32_t flags, - pthreadpool_t threadpool); + pthreadpool_t threadpool, + // setup parameters + const void* input, + void* output); enum xnn_status xnn_create_argmax_pooling2d_nhwc_f32( uint32_t input_padding_top, @@ -2511,50 +2568,6 @@ enum xnn_status xnn_setup_average_pooling2d_nhwc_qu8( const uint8_t* input, uint8_t* output); -enum xnn_status xnn_create_bankers_rounding_nc_f16( - uint32_t flags, - xnn_operator_t* rounding_op_out); - -enum xnn_status xnn_reshape_bankers_rounding_nc_f16( - xnn_operator_t rounding_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_bankers_rounding_nc_f16( - xnn_operator_t rounding_op, - const void* input, - void* output); - -enum xnn_status xnn_create_bankers_rounding_nc_f32( - uint32_t flags, - xnn_operator_t* rounding_op_out); - -enum xnn_status xnn_reshape_bankers_rounding_nc_f32( - xnn_operator_t rounding_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_bankers_rounding_nc_f32( - xnn_operator_t rounding_op, - const float* input, - float* output); - -enum xnn_status xnn_run_bankers_rounding_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - enum xnn_status xnn_create_batch_matrix_multiply_nc_f16( uint32_t flags, xnn_operator_t* batch_matrix_multiply_op); @@ -2601,50 +2614,6 @@ enum xnn_status xnn_setup_batch_matrix_multiply_nc_qd8_f32_qc8w( const struct xnn_quantization_params* quantization_params, float* output); -enum xnn_status xnn_create_ceiling_nc_f16( - uint32_t flags, - xnn_operator_t* ceiling_op_out); - -enum xnn_status xnn_reshape_ceiling_nc_f16( - xnn_operator_t ceiling_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_ceiling_nc_f16( - xnn_operator_t ceiling_op, - const void* input, - void* output); - -enum xnn_status xnn_create_ceiling_nc_f32( - uint32_t flags, - xnn_operator_t* ceiling_op_out); - -enum xnn_status xnn_run_ceiling_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_reshape_ceiling_nc_f32( - xnn_operator_t ceiling_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_ceiling_nc_f32( - xnn_operator_t ceiling_op, - const float* input, - float* output); - enum xnn_status xnn_create_channel_shuffle_nc_x8( size_t groups, size_t group_channels, @@ -2681,94 +2650,6 @@ enum xnn_status xnn_setup_channel_shuffle_nc_x32( const void* input, void* output); -enum xnn_status xnn_create_clamp_nc_f16( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* clamp_op_out); - -enum xnn_status xnn_reshape_clamp_nc_f16( - xnn_operator_t clamp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_clamp_nc_f16( - xnn_operator_t clamp_op, - const void* input, - void* output); - -enum xnn_status xnn_create_clamp_nc_f32( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* clamp_op_out); - -enum xnn_status xnn_reshape_clamp_nc_f32( - xnn_operator_t 
clamp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_clamp_nc_f32( - xnn_operator_t clamp_op, - const float* input, - float* output); - -enum xnn_status xnn_run_clamp_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - float output_min, - float output_max, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_clamp_nc_s8( - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* clamp_op_out); - -enum xnn_status xnn_reshape_clamp_nc_s8( - xnn_operator_t clamp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_clamp_nc_s8( - xnn_operator_t clamp_op, - const int8_t* input, - int8_t* output); - -enum xnn_status xnn_create_clamp_nc_u8( - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - xnn_operator_t* clamp_op_out); - -enum xnn_status xnn_reshape_clamp_nc_u8( - xnn_operator_t clamp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_clamp_nc_u8( - xnn_operator_t clamp_op, - const uint8_t* input, - uint8_t* output); - enum xnn_status xnn_create_constant_pad_nd_x8( const void* padding_value, uint32_t flags, @@ -2856,33 +2737,6 @@ enum xnn_status xnn_run_constant_pad_nd_x32( const void* padding_value, pthreadpool_t threadpool); -enum xnn_status xnn_create_convert_nc_f16_f32( - uint32_t flags, - xnn_operator_t* convert_op_out); - -enum xnn_status xnn_reshape_convert_nc_f16_f32( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convert_nc_f16_f32( - xnn_operator_t convert_op, - const void* input, - float* output); - -enum xnn_status xnn_run_convert_nc_f16_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const void* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - enum xnn_status xnn_create_convert_nc_f16_qd8( uint32_t flags, xnn_operator_t* convert_op_out); @@ -2921,310 +2775,66 @@ enum xnn_status xnn_setup_convert_nc_f32_qd8( int8_t* output, struct xnn_quantization_params* quantization_params); -enum xnn_status xnn_create_convert_nc_f32_f16( + +enum xnn_status xnn_create_convolution2d_nchw_f16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, uint32_t flags, - xnn_operator_t* convert_op_out); + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); -enum xnn_status xnn_reshape_convert_nc_f32_f16( - xnn_operator_t convert_op, +enum xnn_status xnn_reshape_convolution2d_nchw_f16( + xnn_operator_t convolution_op, size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, + size_t input_height, + size_t input_width, + size_t* output_height_out, + 
size_t* output_width_out, pthreadpool_t threadpool); -enum xnn_status xnn_setup_convert_nc_f32_f16( - xnn_operator_t convert_op, - const float* input, +enum xnn_status xnn_setup_convolution2d_nchw_f16( + xnn_operator_t convolution_op, + const void* input, void* output); -enum xnn_status xnn_run_convert_nc_f32_f16( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - void* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_convert_nc_f32_qs8( - float output_scale, - int8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out); - -enum xnn_status xnn_reshape_convert_nc_f32_qs8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convert_nc_f32_qs8( - xnn_operator_t convert_op, - const float* input, - int8_t* output); - -enum xnn_status xnn_run_convert_nc_f32_qs8( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - int8_t* output, - float output_scale, - int8_t output_zero_point, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_convert_nc_f32_qu8( - float output_scale, - uint8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out); - -enum xnn_status xnn_reshape_convert_nc_f32_qu8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convert_nc_f32_qu8( - xnn_operator_t convert_op, - const float* input, - uint8_t* output); - -enum xnn_status xnn_run_convert_nc_f32_qu8( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - uint8_t* output, - float output_scale, - uint8_t output_zero_point, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_convert_nc_qs8( - float input_scale, - int8_t input_zero_point, - float output_scale, - int8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out); - -enum xnn_status xnn_reshape_convert_nc_qs8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convert_nc_qs8( - xnn_operator_t convert_op, - const int8_t* input, - int8_t* output); - -enum xnn_status xnn_create_convert_nc_qs8_f16( - float input_scale, - int8_t input_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out); - -enum xnn_status xnn_reshape_convert_nc_qs8_f16( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convert_nc_qs8_f16( - xnn_operator_t convert_op, - const int8_t* input, - void* output); - -enum xnn_status xnn_create_convert_nc_qs8_f32( - float input_scale, - int8_t input_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out); - -enum xnn_status xnn_reshape_convert_nc_qs8_f32( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convert_nc_qs8_f32( - xnn_operator_t convert_op, - const int8_t* input, - float* output); - -enum xnn_status xnn_run_convert_nc_qs8_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const 
int8_t* input, - float* output, - float input_scale, - int8_t input_zero_point, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_convert_nc_qs16_qs8( - float input_scale, - float output_scale, - int8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out); - -enum xnn_status xnn_reshape_convert_nc_qs16_qs8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convert_nc_qs16_qs8( - xnn_operator_t convert_op, - const int16_t* input, - int8_t* output); - -enum xnn_status xnn_run_convert_nc_qs16_qs8( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const int16_t* input, - int8_t* output, - float input_scale, - float output_scale, - int8_t output_zero_point, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_convert_nc_qu8( - float input_scale, - uint8_t input_zero_point, - float output_scale, - uint8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out); - -enum xnn_status xnn_reshape_convert_nc_qu8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convert_nc_qu8( - xnn_operator_t convert_op, - const uint8_t* input, - uint8_t* output); - -enum xnn_status xnn_create_convert_nc_qu8_f32( - float input_scale, - uint8_t input_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out); - -enum xnn_status xnn_reshape_convert_nc_qu8_f32( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convert_nc_qu8_f32( - xnn_operator_t convert_op, - const uint8_t* input, - float* output); - -enum xnn_status xnn_run_convert_nc_qu8_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const uint8_t* input, - float* output, - float input_scale, - uint8_t input_zero_point, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_convolution2d_nchw_f16( - uint32_t input_padding_top, - uint32_t input_padding_right, - uint32_t input_padding_bottom, - uint32_t input_padding_left, - uint32_t kernel_height, - uint32_t kernel_width, - uint32_t subsampling_height, - uint32_t subsampling_width, - uint32_t dilation_height, - uint32_t dilation_width, - uint32_t groups, - size_t group_input_channels, - size_t group_output_channels, - size_t input_channel_stride, - size_t output_channel_stride, - const void* kernel, - const void* bias, - float output_min, - float output_max, - uint32_t flags, - xnn_code_cache_t code_cache, - xnn_weights_cache_t weights_cache, - xnn_operator_t* convolution_op_out); - -enum xnn_status xnn_reshape_convolution2d_nchw_f16( - xnn_operator_t convolution_op, - size_t batch_size, - size_t input_height, - size_t input_width, - size_t* output_height_out, - size_t* output_width_out, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_convolution2d_nchw_f16( - xnn_operator_t convolution_op, - const void* input, - void* output); - -enum xnn_status xnn_create_convolution2d_nchw_f32( - uint32_t input_padding_top, - uint32_t input_padding_right, - uint32_t input_padding_bottom, - uint32_t input_padding_left, - uint32_t kernel_height, - uint32_t kernel_width, - uint32_t subsampling_height, - uint32_t subsampling_width, - uint32_t dilation_height, - 
uint32_t dilation_width, - uint32_t groups, - size_t group_input_channels, - size_t group_output_channels, - size_t input_channel_stride, - size_t output_channel_stride, - const float* kernel, - const float* bias, - float output_min, - float output_max, +enum xnn_status xnn_create_convolution2d_nchw_f32( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const float* kernel, + const float* bias, + float output_min, + float output_max, uint32_t flags, xnn_code_cache_t code_cache, xnn_weights_cache_t weights_cache, @@ -4025,141 +3635,9 @@ enum xnn_status xnn_setup_dynamic_fully_connected_nc_f32( const float* bias, float* output); -enum xnn_status xnn_create_elu_nc_f16( - float alpha, - uint32_t flags, - xnn_operator_t* elu_op_out); - -enum xnn_status xnn_reshape_elu_nc_f16( - xnn_operator_t elu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_elu_nc_f16( - xnn_operator_t elu_op, - const void* input, - void* output); - -enum xnn_status xnn_create_elu_nc_f32( - float alpha, - uint32_t flags, - xnn_operator_t* elu_op_out); - -enum xnn_status xnn_reshape_elu_nc_f32( - xnn_operator_t elu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_elu_nc_f32( - xnn_operator_t elu_op, - const float* input, - float* output); - -enum xnn_status xnn_run_elu_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - float alpha, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_elu_nc_qs8( - float alpha, - int8_t input_zero_point, - float input_scale, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* elu_op_out); - -enum xnn_status xnn_reshape_elu_nc_qs8( - xnn_operator_t elu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_elu_nc_qs8( - xnn_operator_t elu_op, - const int8_t* input, - int8_t* output); - -enum xnn_status xnn_create_exp_nc_f32( - uint32_t flags, - xnn_operator_t* exp_op_out); - -enum xnn_status xnn_reshape_exp_nc_f32( - xnn_operator_t exp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_exp_nc_f32( - xnn_operator_t exp_op, - const float* input, - float* output); - -enum xnn_status xnn_create_floor_nc_f16( - uint32_t flags, - xnn_operator_t* floor_op_out); - -enum xnn_status xnn_reshape_floor_nc_f16( - xnn_operator_t floor_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_floor_nc_f16( - xnn_operator_t floor_op, - const void* input, - void* output); - -enum xnn_status xnn_create_floor_nc_f32( - uint32_t flags, - xnn_operator_t* floor_op_out); - -enum xnn_status xnn_reshape_floor_nc_f32( - xnn_operator_t floor_op, - size_t batch_size, - size_t 
channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_floor_nc_f32( - xnn_operator_t floor_op, - const float* input, - float* output); - -enum xnn_status xnn_run_floor_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_fully_connected_nc_f16( - size_t input_channels, - size_t output_channels, +enum xnn_status xnn_create_fully_connected_nc_f16( + size_t input_channels, + size_t output_channels, size_t input_stride, size_t output_stride, const void* kernel, @@ -4530,184 +4008,6 @@ enum xnn_status xnn_setup_fully_connected_nc_qu8( const uint8_t* input, uint8_t* output); -enum xnn_status xnn_create_gelu_nc_f32( - uint32_t flags, - xnn_operator_t* gelu_op_out); - -enum xnn_status xnn_reshape_gelu_nc_f32( - xnn_operator_t gelu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_gelu_nc_f32( - xnn_operator_t gelu_op, - const float* input, - float* output); - -enum xnn_status xnn_run_gelu_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_hardswish_nc_f16( - uint32_t flags, - xnn_operator_t* hardswish_op_out); - -enum xnn_status xnn_reshape_hardswish_nc_f16( - xnn_operator_t hardswish_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_hardswish_nc_f16( - xnn_operator_t hardswish_op, - const void* input, - void* output); - -enum xnn_status xnn_create_hardswish_nc_f32( - uint32_t flags, - xnn_operator_t* hardswish_op_out); - -enum xnn_status xnn_reshape_hardswish_nc_f32( - xnn_operator_t hardswish_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_hardswish_nc_f32( - xnn_operator_t hardswish_op, - const float* input, - float* output); - -enum xnn_status xnn_run_hardswish_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_leaky_relu_nc_f16( - float negative_slope, - uint32_t flags, - xnn_operator_t* leaky_relu_op_out); - -enum xnn_status xnn_reshape_leaky_relu_nc_f16( - xnn_operator_t leaky_relu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_leaky_relu_nc_f16( - xnn_operator_t leaky_relu_op, - const void* input, - void* output); - -enum xnn_status xnn_create_leaky_relu_nc_f32( - float negative_slope, - uint32_t flags, - xnn_operator_t* leaky_relu_op_out); - -enum xnn_status xnn_reshape_leaky_relu_nc_f32( - xnn_operator_t leaky_relu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_leaky_relu_nc_f32( - xnn_operator_t leaky_relu_op, - const float* input, - float* output); - -enum xnn_status xnn_run_leaky_relu_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - float negative_slope, - uint32_t flags, - 
pthreadpool_t threadpool); - -enum xnn_status xnn_create_leaky_relu_nc_qs8( - float negative_slope, - int8_t input_zero_point, - float input_scale, - int8_t output_zero_point, - float output_scale, - uint32_t flags, - xnn_operator_t* leaky_relu_op_out); - -enum xnn_status xnn_reshape_leaky_relu_nc_qs8( - xnn_operator_t leaky_relu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_leaky_relu_nc_qs8( - xnn_operator_t leaky_relu_op, - const int8_t* input, - int8_t* output); - -enum xnn_status xnn_create_leaky_relu_nc_qu8( - float negative_slope, - uint8_t input_zero_point, - float input_scale, - uint8_t output_zero_point, - float output_scale, - uint32_t flags, - xnn_operator_t* leaky_relu_op_out); - -enum xnn_status xnn_create_log_nc_f32( - uint32_t flags, - xnn_operator_t* log_op_out); - -enum xnn_status xnn_reshape_log_nc_f32( - xnn_operator_t log_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_log_nc_f32( - xnn_operator_t log_op, - const float* input, - float* output); - -enum xnn_status xnn_reshape_leaky_relu_nc_qu8( - xnn_operator_t leaky_relu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_leaky_relu_nc_qu8( - xnn_operator_t leaky_relu_op, - const uint8_t* input, - uint8_t* output); enum xnn_status xnn_create_max_pooling2d_nhwc_f16( uint32_t input_padding_top, @@ -4867,50 +4167,6 @@ enum xnn_status xnn_setup_reduce_nd( const void* input, void* output); -enum xnn_status xnn_create_negate_nc_f16( - uint32_t flags, - xnn_operator_t* negate_op_out); - -enum xnn_status xnn_reshape_negate_nc_f16( - xnn_operator_t negate_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_negate_nc_f16( - xnn_operator_t negate_op, - const void* input, - void* output); - -enum xnn_status xnn_create_negate_nc_f32( - uint32_t flags, - xnn_operator_t* negate_op_out); - -enum xnn_status xnn_reshape_negate_nc_f32( - xnn_operator_t negate_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_negate_nc_f32( - xnn_operator_t negate_op, - const float* input, - float* output); - -enum xnn_status xnn_run_negate_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - enum xnn_status xnn_create_resize_bilinear2d_nchw_f32( size_t output_height, size_t output_width, @@ -5164,95 +4420,6 @@ enum xnn_status xnn_setup_scaled_dot_product_attention_nhtc_f32( const float* mask, float* output); -enum xnn_status xnn_create_sigmoid_nc_f16( - uint32_t flags, - xnn_operator_t* sigmoid_op_out); - -enum xnn_status xnn_reshape_sigmoid_nc_f16( - xnn_operator_t sigmoid_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_sigmoid_nc_f16( - xnn_operator_t sigmoid_op, - const void* input, - void* output); - -enum xnn_status xnn_create_sigmoid_nc_f32( - uint32_t flags, - xnn_operator_t* sigmoid_op_out); - -enum xnn_status xnn_reshape_sigmoid_nc_f32( - xnn_operator_t sigmoid_op, - size_t batch_size, - size_t channels, - size_t input_stride, - 
size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_sigmoid_nc_f32( - xnn_operator_t sigmoid_op, - const float* input, - float* output); - -enum xnn_status xnn_run_sigmoid_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_sigmoid_nc_qs8( - int8_t input_zero_point, - float input_scale, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* sigmoid_op_out); - -enum xnn_status xnn_reshape_sigmoid_nc_qs8( - xnn_operator_t sigmoid_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_sigmoid_nc_qs8( - xnn_operator_t sigmoid_op, - const int8_t* input, - int8_t* output); - -enum xnn_status xnn_create_sigmoid_nc_qu8( - uint8_t input_zero_point, - float input_scale, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - xnn_operator_t* sigmoid_op_out); - -enum xnn_status xnn_reshape_sigmoid_nc_qu8( - xnn_operator_t sigmoid_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_sigmoid_nc_qu8( - xnn_operator_t sigmoid_op, - const uint8_t* input, - uint8_t* output); enum xnn_status xnn_create_slice_nd_x16( uint32_t flags, @@ -5394,211 +4561,6 @@ enum xnn_status xnn_setup_space_to_depth_nhwc_x32( const void* input, void* output); -enum xnn_status xnn_create_square_nc_f16( - uint32_t flags, - xnn_operator_t* square_op_out); - -enum xnn_status xnn_reshape_square_nc_f16( - xnn_operator_t square_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_square_nc_f16( - xnn_operator_t square_op, - const void* input, - void* output); - -enum xnn_status xnn_create_square_nc_f32( - uint32_t flags, - xnn_operator_t* square_op_out); - -enum xnn_status xnn_reshape_square_nc_f32( - xnn_operator_t square_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_square_nc_f32( - xnn_operator_t square_op, - const float* input, - float* output); - -enum xnn_status xnn_run_square_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_square_root_nc_f16( - uint32_t flags, - xnn_operator_t* sqrt_op_out); - -enum xnn_status xnn_reshape_square_root_nc_f16( - xnn_operator_t sqrt_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_square_root_nc_f16( - xnn_operator_t sqrt_op, - const void* input, - void* output); - -enum xnn_status xnn_create_square_root_nc_f32( - uint32_t flags, - xnn_operator_t* sqrt_op_out); - -enum xnn_status xnn_reshape_square_root_nc_f32( - xnn_operator_t sqrt_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_square_root_nc_f32( - xnn_operator_t sqrt_op, - const float* input, - float* output); - -enum xnn_status xnn_run_square_root_nc_f32( - size_t channels, - size_t 
input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_reciprocal_square_root_nc_f16( - uint32_t flags, xnn_operator_t* sqrt_op_out); - -enum xnn_status xnn_reshape_reciprocal_square_root_nc_f16( - xnn_operator_t sqrt_op, size_t batch_size, size_t channels, - size_t input_stride, size_t output_stride, pthreadpool_t threadpool); - -enum xnn_status xnn_setup_reciprocal_square_root_nc_f16(xnn_operator_t sqrt_op, - const void* input, - void* output); - -enum xnn_status xnn_create_reciprocal_square_root_nc_f32( - uint32_t flags, xnn_operator_t* sqrt_op_out); - -enum xnn_status xnn_reshape_reciprocal_square_root_nc_f32( - xnn_operator_t sqrt_op, size_t batch_size, size_t channels, - size_t input_stride, size_t output_stride, pthreadpool_t threadpool); - -enum xnn_status xnn_setup_reciprocal_square_root_nc_f32(xnn_operator_t sqrt_op, - const float* input, - float* output); - -enum xnn_status xnn_run_reciprocal_square_root_nc_f32( - size_t channels, size_t input_stride, size_t output_stride, - size_t batch_size, const float* input, float* output, uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_tanh_nc_f16( - uint32_t flags, - xnn_operator_t* tanh_op_out); - -enum xnn_status xnn_reshape_tanh_nc_f16( - xnn_operator_t tanh_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_tanh_nc_f16( - xnn_operator_t tanh_op, - const void* input, - void* output); - -enum xnn_status xnn_create_tanh_nc_f32( - uint32_t flags, - xnn_operator_t* tanh_op_out); - -enum xnn_status xnn_reshape_tanh_nc_f32( - xnn_operator_t tanh_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_tanh_nc_f32( - xnn_operator_t tanh_op, - const float* input, - float* output); - -enum xnn_status xnn_run_tanh_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - -enum xnn_status xnn_create_tanh_nc_qs8( - int8_t input_zero_point, - float input_scale, - int8_t output_zero_point, - float output_scale, - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* tanh_op_out); - -enum xnn_status xnn_reshape_tanh_nc_qs8( - xnn_operator_t tanh_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_tanh_nc_qs8( - xnn_operator_t tanh_op, - const int8_t* input, - int8_t* output); - -enum xnn_status xnn_create_tanh_nc_qu8( - uint8_t input_zero_point, - float input_scale, - uint8_t output_zero_point, - float output_scale, - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - xnn_operator_t* tanh_op_out); - -enum xnn_status xnn_reshape_tanh_nc_qu8( - xnn_operator_t tanh_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_tanh_nc_qu8( - xnn_operator_t tanh_op, - const uint8_t* input, - uint8_t* output); - enum xnn_status xnn_create_transpose_nd_x8( uint32_t flags, xnn_operator_t* transpose_op_out); @@ -5699,50 +4661,6 @@ enum xnn_status xnn_run_transpose_nd_x64( uint32_t flags, pthreadpool_t threadpool); -enum xnn_status xnn_create_truncation_nc_f16( - uint32_t flags, - 
xnn_operator_t* truncation_op_out); - -enum xnn_status xnn_reshape_truncation_nc_f16( - xnn_operator_t truncation_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_truncation_nc_f16( - xnn_operator_t truncation_op, - const void* input, - void* output); - -enum xnn_status xnn_create_truncation_nc_f32( - uint32_t flags, - xnn_operator_t* truncation_op_out); - -enum xnn_status xnn_reshape_truncation_nc_f32( - xnn_operator_t truncation_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_truncation_nc_f32( - xnn_operator_t truncation_op, - const float* input, - float* output); - -enum xnn_status xnn_run_truncation_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool); - enum xnn_status xnn_create_unpooling2d_nhwc_x32( uint32_t input_padding_top, uint32_t input_padding_right, diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index de5302fdba67..1f1c2474ceae 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -132,48 +132,48 @@ tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --ukernel s32- tools/generate-vbinary-test.py --tester VBinaryMicrokernelTester --broadcast_b --ukernel s32-vmulc --output test/s32-vmulc.cc & ### Tests for VUnary micro-kernels -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel bf16-vabs --output test/bf16-vabs.cc & - -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vclamp --output test/f16-vclamp.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-velu --output test/f16-velu.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vabs --output test/f16-vabs.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vneg --output test/f16-vneg.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vsqr --output test/f16-vsqr.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vrndne --output test/f16-vrndne.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vrndz --output test/f16-vrndz.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vrndu --output test/f16-vrndu.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vrndd --output test/f16-vrndd.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vrsqrt --output test/f16-vrsqrt.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vsigmoid --output test/f16-vsigmoid.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vsqrt --output test/f16-vsqrt.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vtanh --output test/f16-vtanh.cc & - -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vabs --output test/f32-vabs.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vclamp --output test/f32-vclamp.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-velu --output test/f32-velu.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vgelu 
--output test/f32-vgelu.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vexp --output test/f32-vexp.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vlog --output test/f32-vlog.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vneg --output test/f32-vneg.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vrelu --output test/f32-vrelu.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vrndd --output test/f32-vrndd.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vrndne --output test/f32-vrndne.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vrndu --output test/f32-vrndu.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vrndz --output test/f32-vrndz.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vrsqrt --output test/f32-vrsqrt.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vsigmoid --output test/f32-vsigmoid.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vsqr --output test/f32-vsqr.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vsqrt --output test/f32-vsqrt.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vtanh --output test/f32-vtanh.cc & - -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel s8-vclamp --output test/s8-vclamp.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel u8-vclamp --output test/u8-vclamp.cc & +tools/generate-vunary-test.py --ukernel bf16-vabs --output test/bf16-vabs.cc & + +tools/generate-vunary-test.py --ukernel f16-vclamp --output test/f16-vclamp.cc & +tools/generate-vunary-test.py --ukernel f16-velu --output test/f16-velu.cc & +tools/generate-vunary-test.py --ukernel f16-vabs --output test/f16-vabs.cc & +tools/generate-vunary-test.py --ukernel f16-vneg --output test/f16-vneg.cc & +tools/generate-vunary-test.py --ukernel f16-vsqr --output test/f16-vsqr.cc & +tools/generate-vunary-test.py --ukernel f16-vrndne --output test/f16-vrndne.cc & +tools/generate-vunary-test.py --ukernel f16-vrndz --output test/f16-vrndz.cc & +tools/generate-vunary-test.py --ukernel f16-vrndu --output test/f16-vrndu.cc & +tools/generate-vunary-test.py --ukernel f16-vrndd --output test/f16-vrndd.cc & +tools/generate-vunary-test.py --ukernel f16-vrsqrt --output test/f16-vrsqrt.cc & +tools/generate-vunary-test.py --ukernel f16-vsigmoid --output test/f16-vsigmoid.cc & +tools/generate-vunary-test.py --ukernel f16-vsqrt --output test/f16-vsqrt.cc & +tools/generate-vunary-test.py --ukernel f16-vtanh --output test/f16-vtanh.cc & + +tools/generate-vunary-test.py --ukernel f32-vabs --output test/f32-vabs.cc & +tools/generate-vunary-test.py --ukernel f32-vclamp --output test/f32-vclamp.cc & +tools/generate-vunary-test.py --ukernel f32-velu --output test/f32-velu.cc & +tools/generate-vunary-test.py --ukernel f32-vgelu --output test/f32-vgelu.cc & +tools/generate-vunary-test.py --ukernel f32-vexp --output test/f32-vexp.cc & +tools/generate-vunary-test.py --ukernel f32-vlog --output test/f32-vlog.cc & +tools/generate-vunary-test.py --ukernel f32-vneg --output test/f32-vneg.cc & +tools/generate-vunary-test.py --ukernel f32-vrelu --output test/f32-vrelu.cc & +tools/generate-vunary-test.py --ukernel f32-vrndd --output 
test/f32-vrndd.cc & +tools/generate-vunary-test.py --ukernel f32-vrndne --output test/f32-vrndne.cc & +tools/generate-vunary-test.py --ukernel f32-vrndu --output test/f32-vrndu.cc & +tools/generate-vunary-test.py --ukernel f32-vrndz --output test/f32-vrndz.cc & +tools/generate-vunary-test.py --ukernel f32-vrsqrt --output test/f32-vrsqrt.cc & +tools/generate-vunary-test.py --ukernel f32-vsigmoid --output test/f32-vsigmoid.cc & +tools/generate-vunary-test.py --ukernel f32-vsqr --output test/f32-vsqr.cc & +tools/generate-vunary-test.py --ukernel f32-vsqrt --output test/f32-vsqrt.cc & +tools/generate-vunary-test.py --ukernel f32-vtanh --output test/f32-vtanh.cc & + +tools/generate-vunary-test.py --ukernel s8-vclamp --output test/s8-vclamp.cc & +tools/generate-vunary-test.py --ukernel u8-vclamp --output test/u8-vclamp.cc & ### Tests for VLRelu micro-kernels -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vlrelu --output test/f16-vlrelu.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vlrelu --output test/f32-vlrelu.cc & -tools/generate-vunary-test.py --tester VLReLUMicrokernelTester --ukernel qs8-vlrelu --output test/qs8-vlrelu.cc & -tools/generate-vunary-test.py --tester VLReLUMicrokernelTester --ukernel qu8-vlrelu --output test/qu8-vlrelu.cc & +tools/generate-vunary-test.py --ukernel f16-vlrelu --output test/f16-vlrelu.cc & +tools/generate-vunary-test.py --ukernel f32-vlrelu --output test/f32-vlrelu.cc & +tools/generate-vunary-test.py --ukernel qs8-vlrelu --output test/qs8-vlrelu.cc & +tools/generate-vunary-test.py --ukernel qu8-vlrelu --output test/qu8-vlrelu.cc & ### Tests for Reduce micro-kernels tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f16-rmax.yaml --output test/f16-rmax.cc & @@ -235,10 +235,10 @@ tools/generate-dwconv2d-chw-test.py --spec test/f16-dwconv2d-chw.yaml --output t tools/generate-dwconv2d-chw-test.py --spec test/f32-dwconv2d-chw.yaml --output test/f32-dwconv2d-chw.cc & ### Tests for VHSwish micro-kernels -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f16-vhswish --output test/f16-vhswish.cc & -tools/generate-vunary-test.py --tester VUnaryMicrokernelTester --ukernel f32-vhswish --output test/f32-vhswish.cc & -tools/generate-vunary-test.py --tester VHSwishMicrokernelTester --ukernel qs8-vhswish --output test/qs8-vhswish.cc & -tools/generate-vunary-test.py --tester VHSwishMicrokernelTester --ukernel qu8-vhswish --output test/qu8-vhswish.cc & +tools/generate-vunary-test.py --ukernel f16-vhswish --output test/f16-vhswish.cc & +tools/generate-vunary-test.py --ukernel f32-vhswish --output test/f32-vhswish.cc & +tools/generate-vunary-test.py --ukernel qs8-vhswish --output test/qs8-vhswish.cc & +tools/generate-vunary-test.py --ukernel qu8-vhswish --output test/qu8-vhswish.cc & ### Tests for IBilinear micro-kernels tools/generate-ibilinear-test.py --spec test/f16-ibilinear.yaml --output test/f16-ibilinear.cc & diff --git a/src/bf16-vabs/bf16-vabs.h b/src/bf16-vabs/bf16-vabs.h index 5194fae6d76f..af379e98f3dd 100644 --- a/src/bf16-vabs/bf16-vabs.h +++ b/src/bf16-vabs/bf16-vabs.h @@ -17,9 +17,9 @@ #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_bf16, xnn_bf16_vabs_ukernel__neonbf16_u8, 8, false, uint16_t, struct xnn_bf16_default_params, ((xnn_init_bf16_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_bf16, xnn_bf16_vabs_ukernel__neonbf16_u16, 16, false, uint16_t, struct 
xnn_bf16_default_params, ((xnn_init_bf16_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_bf16, xnn_bf16_vabs_ukernel__neonbf16_u24, 24, false, uint16_t, struct xnn_bf16_default_params, ((xnn_init_bf16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_bf16, xnn_bf16_vabs_ukernel__neonbf16_u8, 8, false, uint16_t, struct xnn_bf16_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_bf16, xnn_bf16_vabs_ukernel__neonbf16_u16, 16, false, uint16_t, struct xnn_bf16_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_bf16, xnn_bf16_vabs_ukernel__neonbf16_u24, 24, false, uint16_t, struct xnn_bf16_default_params, NULL) #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index ff36ce67e757..cdf6e100c991 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -150,21 +150,21 @@ static void init_f16_clamp_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { f16_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vclamp_ukernel__neonfp16arith_u16; - f16_clamp_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_clamp_scalar_params; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { f16_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vclamp_ukernel__neonfp16arith_u16; - f16_clamp_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_clamp_scalar_params; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_f16c) { f16_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vclamp_ukernel__f16c_u16; - f16_clamp_config.init.f16_minmax = xnn_init_f16_minmax_scalar_params; + f16_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_clamp_scalar_params; } #endif } @@ -175,21 +175,21 @@ static void init_f16_elu_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { f16_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_u16; - f16_elu_config.init.f16_elu = xnn_init_f16_elu_scalar_params; + f16_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_elu_scalar_params; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { f16_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_u16; - f16_elu_config.init.f16_elu = xnn_init_f16_elu_scalar_params; + f16_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_elu_scalar_params; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx2) { f16_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_velu_ukernel__avx2_rr1_p3_u16; - f16_elu_config.init.f16_elu = xnn_init_f16_elu_scalar_params; + f16_elu_config.init = 
(xnn_init_unary_uparams_fn) xnn_init_f16_elu_scalar_params; } #endif } @@ -222,21 +222,21 @@ static void init_f16_lrelu_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { f16_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vlrelu_ukernel__neonfp16arith_u16; - f16_lrelu_config.init.f16_lrelu = xnn_init_f16_lrelu_scalar_params; + f16_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_lrelu_scalar_params; } #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { f16_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vlrelu_ukernel__neonfp16arith_u16; - f16_lrelu_config.init.f16_lrelu = xnn_init_f16_lrelu_scalar_params; + f16_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_lrelu_scalar_params; } #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_f16c) { f16_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vlrelu_ukernel__f16c_u16; - f16_lrelu_config.init.f16_lrelu = xnn_init_f16_lrelu_scalar_params; + f16_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_lrelu_scalar_params; } #endif } @@ -453,10 +453,8 @@ static void init_f16_tanh_config(void) { assert(hardware_config != NULL); if (hardware_config->use_x86_fma3) { f16_tanh_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vtanh_ukernel__fma3_polynomial_p19h9t2_u32; - f16_tanh_config.init.f16_tanh = NULL; } else if (hardware_config->use_x86_f16c) { f16_tanh_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vtanh_ukernel__f16c_expm1minus_rr1_p3h2ts_rcp_u72; - f16_tanh_config.init.f16_tanh = NULL; } #endif } @@ -514,11 +512,11 @@ static void init_f16_to_qs8_cvt_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { f16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_qs8_vcvt_ukernel__neonfp16arith_u32; - f16_to_qs8_cvt_config.init.f16_qs8_cvt = xnn_init_f16_qs8_cvt_scalar_params; + f16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_qs8_cvt_scalar_params; } #else f16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_qs8_vcvt_ukernel__scalar_imagic_u4; - f16_to_qs8_cvt_config.init.f16_qs8_cvt = xnn_init_f16_qs8_cvt_scalar_params; + f16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f16_qs8_cvt_scalar_params; #endif } @@ -563,49 +561,49 @@ static void init_f32_clamp_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__neon_u16; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__scalar_u4; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; } #elif XNN_ARCH_ARM64 f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__neon_u16; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; #elif XNN_ARCH_X86 || 
XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512F if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx512f_u16; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; } else #endif if (hardware_config->use_x86_avx) { f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx_u16; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; } else { f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__sse_u8; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__wasmsimd_x86_u8; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; } else { f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__wasmsimd_arm_u8; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; } #elif XNN_ARCH_WASM f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__wasm_u4; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; #elif XNN_ARCH_RISCV f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__scalar_u4; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; #else f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__scalar_u4; - f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params; + f32_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_clamp_scalar_params; #endif } @@ -616,50 +614,50 @@ static void init_f32_elu_config(void) { if (hardware_config->use_arm_neon) { if (hardware_config->use_arm_neon_fma) { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__neonfma_rr1_p6_u8; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } else { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_u8; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } } else if (!XNN_PLATFORM_MOBILE) { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } #elif XNN_ARCH_ARM64 f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_u16; - 
f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512F if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx512f_rr1_p6_u128; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } else #endif if (hardware_config->use_x86_avx2) { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_u56; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } else if (hardware_config->use_x86_avx) { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_u32; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } else { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_u12; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__wasmrelaxedsimd_fma_rr2_p6_u24; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; #else const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_u20; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } else { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_u20; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } #endif #elif XNN_ARCH_WASM @@ -667,17 +665,17 @@ static void init_f32_elu_config(void) { assert(hardware_config != NULL); if (hardware_config->is_x86) { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u2; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } else { f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__wasm_rr2_p6_u6; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; } #elif XNN_ARCH_RISCV f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; #else f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4; - f32_elu_config.init.f32_elu = xnn_init_f32_elu_scalar_params; + f32_elu_config.init = 
(xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; #endif } @@ -808,32 +806,32 @@ static void init_f32_lrelu_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__neon_u8; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__scalar_u4; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } #elif XNN_ARCH_ARM64 f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__neon_u8; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512F if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx512f_u16; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } else #endif if (hardware_config->use_x86_avx) { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx_u16; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } else if (hardware_config->use_x86_sse4_1) { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__sse41_u8; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } else { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__sse_u8; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -841,30 +839,31 @@ static void init_f32_lrelu_config(void) { #if XNN_ARCH_WASMRELAXEDSIMD if (hardware_config->is_x86) { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__wasmrelaxedsimd_iminmax_u4; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } else { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__wasmrelaxedsimd_laneselect_u4; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } #else if (hardware_config->is_x86) { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__wasmsimd_iminmax_u8; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } else { f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__wasmsimd_laneselect_u8; - f32_lrelu_config.init.f32_lrelu = 
xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; } #endif #elif XNN_ARCH_WASM f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__scalar_u4; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__rvv_u4v; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.element_tile = hardware_config->vlenb / sizeof(float) * 4; // (VLENB/sizeof)*LMUL #else f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__scalar_u4; - f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params; + f32_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_lrelu_scalar_params; #endif } @@ -1348,60 +1347,60 @@ static void init_f32_to_qs8_cvt_config(void) { if (hardware_config->use_arm_neon) { if (hardware_config->use_arm_neon_v8) { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__neonv8_u32; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; } else { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__neon_u32; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; } } else if (!XNN_PLATFORM_MOBILE) { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u4; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; } #elif XNN_ARCH_ARM64 f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__neonv8_u32; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512SKX if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx512skx_u128; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; } else #endif if (hardware_config->use_x86_avx2) { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx2_u64; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; } else if (hardware_config->use_x86_avx) { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx_u32; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) 
xnn_init_f32_qs8_cvt_scalar_params; } else if (hardware_config->use_x86_sse4_1) { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__sse41_u32; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; } else { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__sse2_u32; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_u32; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; #elif XNN_ARCH_WASM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_u1; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; } else { f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_u4; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; } #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__rvv_u2v; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; #else f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_u4; - f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_params; + f32_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qs8_cvt_scalar_params; #endif } @@ -1412,57 +1411,57 @@ static void init_f32_to_qu8_cvt_config(void) { if (hardware_config->use_arm_neon) { if (hardware_config->use_arm_neon_v8) { f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__neonv8_u32; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; } else { f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__neon_u32; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; } } else if (!XNN_PLATFORM_MOBILE) { f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u4; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; } #elif XNN_ARCH_ARM64 f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__neonv8_u32; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; 
+ f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512SKX if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx512skx_u128; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; } else #endif if (hardware_config->use_x86_avx2) { f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx2_u64; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; } else if (hardware_config->use_x86_avx) { f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx_u32; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; } else { f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__sse2_u32; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_u32; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; #elif XNN_ARCH_WASM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_u1; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; } else { f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_u4; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; } #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__rvv_u2v; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; #else f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_u4; - f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_params; + f32_to_qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_qu8_cvt_scalar_params; #endif } @@ -1472,10 +1471,10 @@ static void init_s32_to_f32_cvt_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { s32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s32_f32_vcvt_ukernel__neon_u16; - s32_to_f32_cvt_config.init.s32_f32_cvt = xnn_init_s32_f32_cvt_scalar_params; 
+ s32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_s32_f32_cvt_scalar_params; } else { s32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s32_f32_vcvt_ukernel__scalar_u4; - s32_to_f32_cvt_config.init.s32_f32_cvt = xnn_init_s32_f32_cvt_scalar_params; + s32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_s32_f32_cvt_scalar_params; } #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1483,22 +1482,22 @@ static void init_s32_to_f32_cvt_config(void) { #if XNN_ENABLE_AVX512F if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { s32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s32_f32_vcvt_ukernel__avx512f_u64; - s32_to_f32_cvt_config.init.s32_f32_cvt = xnn_init_s32_f32_cvt_scalar_params; + s32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_s32_f32_cvt_scalar_params; } else #endif if (hardware_config->use_x86_avx2) { s32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s32_f32_vcvt_ukernel__avx2_u32; - s32_to_f32_cvt_config.init.s32_f32_cvt = xnn_init_s32_f32_cvt_scalar_params; + s32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_s32_f32_cvt_scalar_params; } else { s32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s32_f32_vcvt_ukernel__scalar_u4; - s32_to_f32_cvt_config.init.s32_f32_cvt = xnn_init_s32_f32_cvt_scalar_params; + s32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_s32_f32_cvt_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD s32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s32_f32_vcvt_ukernel__wasmsimd_u16; - s32_to_f32_cvt_config.init.s32_f32_cvt = xnn_init_s32_f32_cvt_scalar_params; + s32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_s32_f32_cvt_scalar_params; #else s32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s32_f32_vcvt_ukernel__scalar_u4; - s32_to_f32_cvt_config.init.s32_f32_cvt = xnn_init_s32_f32_cvt_scalar_params; + s32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_s32_f32_cvt_scalar_params; #endif } @@ -1508,10 +1507,10 @@ static void init_u32_to_f32_cvt_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__neon_u16; - u32_to_f32_cvt_config.init.u32_f32_cvt = xnn_init_u32_f32_cvt_scalar_params; + u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params; } else { u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__scalar_u4; - u32_to_f32_cvt_config.init.u32_f32_cvt = xnn_init_u32_f32_cvt_scalar_params; + u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params; } #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1519,22 +1518,22 @@ static void init_u32_to_f32_cvt_config(void) { #if XNN_ENABLE_AVX512F if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__avx512f_u64; - u32_to_f32_cvt_config.init.u32_f32_cvt = xnn_init_u32_f32_cvt_scalar_params; + u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params; } else #endif if (hardware_config->use_x86_avx2) { u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__avx2_u32; - u32_to_f32_cvt_config.init.u32_f32_cvt = 
xnn_init_u32_f32_cvt_scalar_params; + u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params; } else { u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__scalar_u4; - u32_to_f32_cvt_config.init.u32_f32_cvt = xnn_init_u32_f32_cvt_scalar_params; + u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__wasmsimd_u16; - u32_to_f32_cvt_config.init.u32_f32_cvt = xnn_init_u32_f32_cvt_scalar_params; + u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params; #else u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__scalar_u4; - u32_to_f32_cvt_config.init.u32_f32_cvt = xnn_init_u32_f32_cvt_scalar_params; + u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params; #endif } @@ -1544,57 +1543,57 @@ static void init_qs8_cvt_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon_v8) { qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__neon_u32; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; } else { qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__armsimd32_u8; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; } #elif XNN_ARCH_ARM64 qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__neon_u32; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx2) { qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__avx2_u32; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; } else if (hardware_config->use_x86_avx) { qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__avx_u32; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; } else if (hardware_config->use_x86_sse4_1) { qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__sse41_u32; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; } else if (hardware_config->use_x86_ssse3) { qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__ssse3_u32; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; } else { qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__sse2_u32; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_u32; - qs8_cvt_config.init.qs8_cvt 
= xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; #else qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__wasmsimd_u16; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; #endif #elif XNN_ARCH_WASM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__scalar_u1; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; } else { qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__scalar_u4; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; } #elif XNN_ARCH_RISCV qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__scalar_u4; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; #else qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vcvt_ukernel__scalar_u4; - qs8_cvt_config.init.qs8_cvt = xnn_init_qs8_cvt_scalar_params; + qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_cvt_scalar_params; #endif } @@ -1604,36 +1603,36 @@ static void init_qs16_to_qs8_cvt_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__asm_aarch32_neon_u16; - qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params; + qs16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs16_qs8_cvt_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__scalar_u4; - qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params; + qs16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs16_qs8_cvt_scalar_params; } #elif XNN_ARCH_ARM64 qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__neon_u32; - qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params; + qs16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs16_qs8_cvt_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx) { qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__avx_u16; - qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params; + qs16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs16_qs8_cvt_scalar_params; } else if (hardware_config->use_x86_sse4_1) { qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__sse41_u16; - qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params; + qs16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs16_qs8_cvt_scalar_params; } else if (hardware_config->use_x86_ssse3) { qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__ssse3_u16; - qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params; + 
qs16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs16_qs8_cvt_scalar_params; } else { qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__sse2_u16; - qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params; + qs16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs16_qs8_cvt_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__wasmsimd_u16; - qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params; + qs16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs16_qs8_cvt_scalar_params; #else qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__scalar_u4; - qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params; + qs16_to_qs8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs16_qs8_cvt_scalar_params; #endif } @@ -1643,32 +1642,32 @@ static void init_qs8_lrelu_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__neon_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__armsimd32_u4; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } #elif XNN_ARCH_ARM64 qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__neon_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx2) { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__avx2_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } else if (hardware_config->use_x86_avx) { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__avx_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } else if (hardware_config->use_x86_sse4_1) { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__sse41_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } else if (hardware_config->use_x86_ssse3) { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__ssse3_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } else { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__sse2_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct
xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1676,18 +1675,18 @@ static void init_qs8_lrelu_config(void) { #if XNN_ARCH_WASMRELAXEDSIMD if (hardware_config->is_x86) { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } else { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } #else if (hardware_config->is_x86) { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__wasmsimd_x86_u16; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } else { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__wasmsimd_arm_u32; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } #endif #elif XNN_ARCH_WASM @@ -1695,18 +1694,18 @@ static void init_qs8_lrelu_config(void) { assert(hardware_config != NULL); if (hardware_config->is_x86) { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__scalar_select_u4; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } else { qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__scalar_andxor_u4; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; } #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__rvv_u2v; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; #else qs8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_vlrelu_ukernel__scalar_andxor_u4; - qs8_lrelu_config.init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_params; + qs8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_lrelu_scalar_params; #endif } @@ -1716,14 +1715,14 @@ static void init_qs8_to_f16_cvt_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon_fp16_arith) { qs8_to_f16_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f16_vcvt_ukernel__neonfp16arith_u32; - qs8_to_f16_cvt_config.init.qs8_f16_cvt = xnn_init_qs8_f16_cvt_scalar_params; + qs8_to_f16_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f16_cvt_scalar_params; } #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx2) { qs8_to_f16_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f16_vcvt_ukernel__avx2_u16; - qs8_to_f16_cvt_config.init.qs8_f16_cvt = xnn_init_qs8_f16_cvt_scalar_params; + qs8_to_f16_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f16_cvt_scalar_params; } #endif } @@ -1734,49 +1733,49 @@ static void init_qs8_to_f32_cvt_config(void) { 
assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__neon_u32; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__scalar_u4; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; } #elif XNN_ARCH_ARM64 qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__neon_u32; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512SKX if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx512skx_u32; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; } else #endif if (hardware_config->use_x86_avx2) { qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx2_u16; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; } else if (hardware_config->use_x86_avx) { qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx_u32; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; } else if (hardware_config->use_x86_sse4_1) { qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__sse41_u16; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; } else { qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__sse2_u32; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__wasmsimd_u32; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; #elif XNN_ARCH_WASM qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__scalar_u1; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__rvv_u2v; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = 
xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; #else qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__scalar_u4; - qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params; + qs8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_f32_cvt_scalar_params; #endif } @@ -1786,57 +1785,57 @@ static void init_qu8_cvt_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__neon_u32; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__armsimd32_u8; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; } #elif XNN_ARCH_ARM64 qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__neon_u32; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx2) { qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__avx2_u32; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; } else if (hardware_config->use_x86_avx) { qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__avx_u32; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; } else if (hardware_config->use_x86_sse4_1) { qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__sse41_u32; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; } else if (hardware_config->use_x86_ssse3) { qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__ssse3_u32; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; } else { qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__sse2_u32; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_u32; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; #else qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__wasmsimd_u16; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; #endif #elif XNN_ARCH_WASM const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->is_x86) { qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) 
xnn_qu8_vcvt_ukernel__scalar_u1; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; } else { qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__scalar_u4; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; } #elif XNN_ARCH_RISCV qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__scalar_u4; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; #else qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vcvt_ukernel__scalar_u4; - qu8_cvt_config.init.qu8_cvt = xnn_init_qu8_cvt_scalar_params; + qu8_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_cvt_scalar_params; #endif } @@ -1846,32 +1845,32 @@ static void init_qu8_lrelu_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__neon_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__armsimd32_u4; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } #elif XNN_ARCH_ARM64 qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__neon_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); if (hardware_config->use_x86_avx2) { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__avx2_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } else if (hardware_config->use_x86_avx) { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__avx_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } else if (hardware_config->use_x86_sse4_1) { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__sse41_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } else if (hardware_config->use_x86_ssse3) { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__ssse3_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } else { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__sse2_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); @@ -1879,18 +1878,18 @@
static void init_qu8_lrelu_config(void) { #if XNN_ARCH_WASMRELAXEDSIMD if (hardware_config->is_x86) { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } else { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } #else if (hardware_config->is_x86) { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__wasmsimd_x86_u16; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } else { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__wasmsimd_arm_u32; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } #endif #elif XNN_ARCH_WASM @@ -1898,18 +1897,18 @@ static void init_qu8_lrelu_config(void) { assert(hardware_config != NULL); if (hardware_config->is_x86) { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__scalar_select_u4; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } else { qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__scalar_andxor_u4; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; } #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__rvv_u2v; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; #else qu8_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_vlrelu_ukernel__scalar_andxor_u4; - qu8_lrelu_config.init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_params; + qu8_lrelu_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_lrelu_scalar_params; #endif } @@ -1919,49 +1918,49 @@ static void init_qu8_to_f32_cvt_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__neon_u32; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__scalar_u4; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; } #elif XNN_ARCH_ARM64 qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__neon_u32; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* 
hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512SKX if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx512skx_u32; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; } else #endif if (hardware_config->use_x86_avx2) { qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx2_u16; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; } else if (hardware_config->use_x86_avx) { qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx_u32; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; } else if (hardware_config->use_x86_sse4_1) { qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__sse41_u16; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; } else { qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__sse2_u32; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__wasmsimd_u32; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; #elif XNN_ARCH_WASM qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__scalar_u1; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__rvv_u2v; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; #else qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__scalar_u4; - qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params; + qu8_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_f32_cvt_scalar_params; #endif } @@ -1971,43 +1970,43 @@ static void init_s8_clamp_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__neon_u64; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_clamp_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_u4; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) 
xnn_init_qs8_clamp_scalar_params; } #elif XNN_ARCH_ARM64 s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__neon_u64; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_clamp_scalar_params; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__rvv_u4v; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_clamp_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512SKX if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__avx512skx_u256; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_clamp_scalar_params; } else #endif if (hardware_config->use_x86_avx2) { s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__avx2_u128; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_clamp_scalar_params; } else if (hardware_config->use_x86_sse4_1) { s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__sse41_u64; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_clamp_scalar_params; } else { s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__sse2_u64; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_clamp_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__wasmsimd_u64; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_clamp_scalar_params; #else s8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_s8_vclamp_ukernel__scalar_u4; - s8_clamp_config.init.s8_minmax = xnn_init_s8_minmax_scalar_params; + s8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qs8_clamp_scalar_params; #endif } @@ -2017,45 +2016,45 @@ static void init_u8_clamp_config(void) { assert(hardware_config != NULL); if (hardware_config->use_arm_neon) { u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__neon_u64; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; } else if (!XNN_PLATFORM_MOBILE) { u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__scalar_u4; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; } #elif XNN_ARCH_ARM64 u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__neon_u64; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__rvv_u4v; - u8_clamp_config.init.u8_minmax = 
xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); assert(hardware_config != NULL); #if XNN_ENABLE_AVX512SKX if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__avx512skx_u256; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; } else #endif if (hardware_config->use_x86_avx2) { u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__avx2_u128; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; } else { u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__sse2_u64; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__wasmsimd_u64; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; #elif XNN_ARCH_WASM u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__scalar_u4; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; #elif XNN_ARCH_RISCV u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__scalar_u4; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; #else u8_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u8_vclamp_ukernel__scalar_u4; - u8_clamp_config.init.u8_minmax = xnn_init_u8_minmax_scalar_params; + u8_clamp_config.init = (xnn_init_unary_uparams_fn) xnn_init_qu8_clamp_scalar_params; #endif } diff --git a/src/f16-vabs/f16-vabs.h b/src/f16-vabs/f16-vabs.h index 04c7f2c70d47..b91104b3ca63 100644 --- a/src/f16-vabs/f16-vabs.h +++ b/src/f16-vabs/f16-vabs.h @@ -17,13 +17,13 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vabs_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vabs_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vabs_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vabs_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f16_vabs_ukernel__sse2_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f16_vabs_ukernel__sse2_u16, 16, false, xnn_float16, struct xnn_f16_default_params, 
((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f16_vabs_ukernel__sse2_u8, 8, false, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f16_vabs_ukernel__sse2_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vclamp/f16-vclamp.h b/src/f16-vclamp/f16-vclamp.h index 2568f6dafc34..d39f61ff21a4 100644 --- a/src/f16-vclamp/f16-vclamp.h +++ b/src/f16-vclamp/f16-vclamp.h @@ -17,20 +17,20 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vclamp_ukernel__neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vclamp_ukernel__neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vclamp_ukernel__neonfp16arith_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vclamp_ukernel__neonfp16arith_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_clamp_scalar_params) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_RISCV_FP16_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vclamp_ukernel__rvvfp16arith_u1v, 1, true, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vclamp_ukernel__rvvfp16arith_u2v, 2, true, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vclamp_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vclamp_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vclamp_ukernel__rvvfp16arith_u1v, 1, true, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vclamp_ukernel__rvvfp16arith_u2v, 2, true, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vclamp_ukernel__rvvfp16arith_u4v, 4, true, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector_fp16_arith, xnn_f16_vclamp_ukernel__rvvfp16arith_u8v, 8, true, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_clamp_scalar_params) #endif // XNN_ENABLE_RISCV_FP16_VECTOR && (XNN_ARCH_RISCV) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vclamp_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vclamp_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vclamp_ukernel__f16c_u8, 8, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_clamp_scalar_params) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vclamp_ukernel__f16c_u16, 16, false, xnn_float16, union xnn_f16_minmax_params, xnn_init_f16_clamp_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vhswish/f16-vhswish.h b/src/f16-vhswish/f16-vhswish.h index b2b45962ccb2..6897df30b6b9 100644 --- a/src/f16-vhswish/f16-vhswish.h +++ b/src/f16-vhswish/f16-vhswish.h @@ -17,13 +17,13 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vhswish_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_hswish_params, ((xnn_init_f16_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vhswish_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_hswish_params, ((xnn_init_f16_hswish_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vhswish_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vhswish_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_hswish_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vhswish_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_hswish_params, ((xnn_init_f16_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vhswish_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_hswish_params, ((xnn_init_f16_hswish_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vhswish_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vhswish_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_hswish_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vneg/f16-vneg.h b/src/f16-vneg/f16-vneg.h index 5b0ec181a7fe..f9c400747b28 100644 --- a/src/f16-vneg/f16-vneg.h +++ b/src/f16-vneg/f16-vneg.h @@ -17,13 +17,13 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vneg_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vneg_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vneg_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vneg_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f16_vneg_ukernel__sse2_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f16_vneg_ukernel__sse2_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f16_vneg_ukernel__sse2_u8, 8, false, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f16_vneg_ukernel__sse2_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) 
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vrnd/f16-vrndd.h b/src/f16-vrnd/f16-vrndd.h index 23a41e1efab7..b421d1783e6a 100644 --- a/src/f16-vrnd/f16-vrndd.h +++ b/src/f16-vrnd/f16-vrndd.h @@ -17,13 +17,13 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndd_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndd_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndd_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndd_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndd_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndd_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndd_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndd_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vrnd/f16-vrndne.h b/src/f16-vrnd/f16-vrndne.h index 1e3a587d7c81..9fcad342d817 100644 --- a/src/f16-vrnd/f16-vrndne.h +++ b/src/f16-vrnd/f16-vrndne.h @@ -17,13 +17,13 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndne_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndne_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndne_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndne_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndne_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndne_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndne_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndne_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vrnd/f16-vrndu.h b/src/f16-vrnd/f16-vrndu.h index 70636acc0b4e..2cd3e92019b1 100644 --- a/src/f16-vrnd/f16-vrndu.h +++ 
b/src/f16-vrnd/f16-vrndu.h @@ -17,13 +17,13 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndu_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndu_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndu_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndu_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndu_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndu_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndu_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndu_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vrnd/f16-vrndz.h b/src/f16-vrnd/f16-vrndz.h index 955c2eca5ce7..02f9a8d35783 100644 --- a/src/f16-vrnd/f16-vrndz.h +++ b/src/f16-vrnd/f16-vrndz.h @@ -17,13 +17,13 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndz_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndz_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndz_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrndz_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndz_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndz_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, ((xnn_init_f16_rnd_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndz_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_rnd_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrndz_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_rnd_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vrsqrt/f16-vrsqrt.h b/src/f16-vrsqrt/f16-vrsqrt.h index 773b98d5339a..aeddb29aa28e 100644 --- a/src/f16-vrsqrt/f16-vrsqrt.h +++ b/src/f16-vrsqrt/f16-vrsqrt.h @@ -17,15 +17,15 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, 
xnn_f16_vrsqrt_ukernel__neonfp16arith_rsqrt_u8, 8, false, xnn_float16, struct xnn_f16_rsqrt_params, ((xnn_init_f16_rsqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrsqrt_ukernel__neonfp16arith_rsqrt_u16, 16, false, xnn_float16, struct xnn_f16_rsqrt_params, ((xnn_init_f16_rsqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrsqrt_ukernel__neonfp16arith_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_rsqrt_params, ((xnn_init_f16_rsqrt_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrsqrt_ukernel__neonfp16arith_rsqrt_u8, 8, false, xnn_float16, struct xnn_f16_rsqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrsqrt_ukernel__neonfp16arith_rsqrt_u16, 16, false, xnn_float16, struct xnn_f16_rsqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vrsqrt_ukernel__neonfp16arith_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_rsqrt_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u8, 8, false, xnn_float16, struct xnn_f16_rsqrt_params, ((xnn_init_f16_rsqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u16, 16, false, xnn_float16, struct xnn_f16_rsqrt_params, ((xnn_init_f16_rsqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_rsqrt_params, ((xnn_init_f16_rsqrt_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u8, 8, false, xnn_float16, struct xnn_f16_rsqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u16, 16, false, xnn_float16, struct xnn_f16_rsqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vrsqrt_ukernel__f16c_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_rsqrt_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vsigmoid/f16-vsigmoid.h b/src/f16-vsigmoid/f16-vsigmoid.h index 9e8e012a1dc3..061fac57f22f 100644 --- a/src/f16-vsigmoid/f16-vsigmoid.h +++ b/src/f16-vsigmoid/f16-vsigmoid.h @@ -17,52 +17,52 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, 
xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u56, 56, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u56, 56, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__aarch64_neonfp16arith_rr2_p2_div_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u56, 56, false, xnn_float16, struct 
xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u56, 56, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u56, 56, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u56, 56, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1recps_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u56, 56, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, 
((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u56, 56, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, ((xnn_init_f16_sigmoid_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u56, 56, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_div_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u8, 8, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u16, 16, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u24, 24, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u32, 32, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u40, 40, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u48, 48, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u56, 56, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_u64, 64, false, xnn_float16, struct xnn_f16_sigmoid_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vsqr/f16-vsqr.h b/src/f16-vsqr/f16-vsqr.h index 6c9c5842a777..aa8769a46c99 100644 --- a/src/f16-vsqr/f16-vsqr.h +++ b/src/f16-vsqr/f16-vsqr.h @@ -17,13 +17,13 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqr_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqr_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqr_ukernel__neonfp16arith_u8, 8, false, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqr_ukernel__neonfp16arith_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqr_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqr_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, ((xnn_init_f16_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqr_ukernel__f16c_u8, 8, false, xnn_float16, struct xnn_f16_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqr_ukernel__f16c_u16, 16, false, xnn_float16, struct xnn_f16_default_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/src/f16-vsqrt/f16-vsqrt.h b/src/f16-vsqrt/f16-vsqrt.h index 7bb5773a6932..fc8f029f7a56 100644 --- a/src/f16-vsqrt/f16-vsqrt.h +++ b/src/f16-vsqrt/f16-vsqrt.h @@ -17,42 +17,42 @@ #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__aarch64_neonfp16arith_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fp16_arith, xnn_f16_vsqrt_ukernel__neonfp16arith_nr1fma1adj_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsqrt_ukernel__fp16arith_sqrt_u1, 1, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsqrt_ukernel__fp16arith_sqrt_u2, 2, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsqrt_ukernel__fp16arith_sqrt_u4, 4, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsqrt_ukernel__fp16arith_sqrt_u1, 1, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsqrt_ukernel__fp16arith_sqrt_u2, 2, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_fp16_arith, xnn_f16_vsqrt_ukernel__fp16arith_sqrt_u4, 4, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) #endif // XNN_ENABLE_ARM_FP16_SCALAR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) #if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsqrt_ukernel__avx512fp16_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsqrt_ukernel__avx512fp16_sqrt_u64, 64, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsqrt_ukernel__avx512fp16_sqrt_u128, 128, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsqrt_ukernel__avx512fp16_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsqrt_ukernel__avx512fp16_sqrt_u64, 64, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512fp16, xnn_f16_vsqrt_ukernel__avx512fp16_sqrt_u128, 128, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) #endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, 
((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_rsqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u8, 8, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_f16c, xnn_f16_vsqrt_ukernel__f16c_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u64, 64, false, xnn_float16, struct xnn_f16_sqrt_params, ((xnn_init_f16_sqrt_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u16, 16, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u32, 32, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_f16_vsqrt_ukernel__avx512skx_sqrt_u64, 64, false, xnn_float16, struct xnn_f16_sqrt_params, NULL) #endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) diff --git a/src/f32-vabs/f32-vabs.h b/src/f32-vabs/f32-vabs.h index 4bc0b787b98a..e48a91aa8f38 100644 --- a/src/f32-vabs/f32-vabs.h +++ b/src/f32-vabs/f32-vabs.h @@ -17,48 +17,48 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vabs_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vabs_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vabs_ukernel__neon_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vabs_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vabs_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vabs_ukernel__neon_u12, 12, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if 
XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vabs_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vabs_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vabs_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vabs_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vabs_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vabs_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vabs_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vabs_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, NULL) #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__sse2_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__sse2_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__sse2_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__sse2_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__sse2_u8, 8, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__sse2_u12, 12, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vabs_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u48, 48, false, float, struct 
xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vabs_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vabs_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vabs_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vabs_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vabs_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vabs_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vabs_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__wasmsimd_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__wasmsimd_u12, 12, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vabs_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, NULL) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vclamp/f32-vclamp.h b/src/f32-vclamp/f32-vclamp.h index a801e8efe4aa..6d7e545bb666 100644 --- a/src/f32-vclamp/f32-vclamp.h +++ b/src/f32-vclamp/f32-vclamp.h @@ -17,46 +17,46 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vclamp_ukernel__neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vclamp_ukernel__neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vclamp_ukernel__neon_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vclamp_ukernel__neon_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vclamp_ukernel__neon_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vclamp_ukernel__neon_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vclamp_ukernel__rvv_u1v, 1, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vclamp_ukernel__rvv_u2v, 2, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vclamp_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vclamp_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vclamp_ukernel__rvv_u1v, 1, true, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vclamp_ukernel__rvv_u2v, 2, true, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vclamp_ukernel__rvv_u4v, 4, true, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vclamp_ukernel__rvv_u8v, 8, true, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vclamp_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vclamp_ukernel__avx_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__sse_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__sse_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vclamp_ukernel__avx_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vclamp_ukernel__avx_u16, 16, false, float, union 
xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vclamp_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vclamp_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vclamp_ukernel__avx512f_u16, 16, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vclamp_ukernel__avx512f_u32, 32, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasmsimd_arm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasmsimd_arm_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasmsimd_x86_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasmsimd_x86_u8, 8, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasm_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasm_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__wasm_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, 
xnn_init_f32_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__scalar_u1, 1, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__scalar_u2, 2, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vclamp_ukernel__scalar_u4, 4, false, float, union xnn_f32_minmax_params, xnn_init_f32_clamp_scalar_params) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vexp/f32-vexp.h b/src/f32-vexp/f32-vexp.h index 253db103dd7a..dd8bbe04329c 100644 --- a/src/f32-vexp/f32-vexp.h +++ b/src/f32-vexp/f32-vexp.h @@ -15,9 +15,9 @@ #define XNN_DEFINED_UKERNEL #endif -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vexp_ukernel__scalar_exp_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vexp_ukernel__scalar_exp_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vexp_ukernel__scalar_exp_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vexp_ukernel__scalar_exp_u1, 1, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vexp_ukernel__scalar_exp_u2, 2, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vexp_ukernel__scalar_exp_u4, 4, false, float, struct xnn_f32_default_params, NULL) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vgelu/f32-vgelu.h b/src/f32-vgelu/f32-vgelu.h index 8c60243f2bd5..c54d7b8b4c3e 100644 --- a/src/f32-vgelu/f32-vgelu.h +++ b/src/f32-vgelu/f32-vgelu.h @@ -15,58 +15,58 @@ #define XNN_DEFINED_UKERNEL #endif -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u1, 1, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, 
xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u2, 2, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, NULL) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__sse2_rational_12_10_div_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__sse2_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__sse2_rational_12_10_div_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__sse2_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vgelu_ukernel__avx_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vgelu_ukernel__avx_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vgelu_ukernel__avx_rational_12_10_div_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vgelu_ukernel__avx_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__sse2_rational_12_10_div_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__sse2_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__sse2_rational_12_10_div_u12, 12, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__sse2_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vgelu_ukernel__avx_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vgelu_ukernel__avx_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vgelu_ukernel__avx_rational_12_10_div_u24, 24, false, float, struct xnn_f32_default_params, NULL) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vgelu_ukernel__avx_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u24, 24, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vgelu_ukernel__fma3_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_nr_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_nr_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_nr_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_nr_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u48, 48, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_div_u64, 64, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_nr_u16, 16, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_nr_u32, 32, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vgelu_ukernel__avx512f_rational_12_10_nr_u48, 48, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, 
xnn_f32_vgelu_ukernel__avx512f_rational_12_10_nr_u64, 64, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vgelu_ukernel__neon_rational_12_10_div_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vgelu_ukernel__neon_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vgelu_ukernel__neon_rational_12_10_div_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vgelu_ukernel__neon_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vgelu_ukernel__neon_rational_12_10_div_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vgelu_ukernel__neon_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vgelu_ukernel__neon_rational_12_10_div_u12, 12, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vgelu_ukernel__neon_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_div_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_div_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_div_u32, 32, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_div_u64, 64, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_div_u128, 128, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON) #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__wasmsimd_rational_12_10_div_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__wasmsimd_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__wasmsimd_rational_12_10_div_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__wasmsimd_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__wasmsimd_rational_12_10_div_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, 
xnn_f32_vgelu_ukernel__wasmsimd_rational_12_10_div_u8, 8, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__wasmsimd_rational_12_10_div_u12, 12, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vgelu_ukernel__wasmsimd_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD diff --git a/src/f32-vhswish/f32-vhswish.h b/src/f32-vhswish/f32-vhswish.h index fa31e9d5b92c..1921f81d3dc0 100644 --- a/src/f32-vhswish/f32-vhswish.h +++ b/src/f32-vhswish/f32-vhswish.h @@ -17,47 +17,47 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vhswish_ukernel__neon_u4, 4, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vhswish_ukernel__neon_u8, 8, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vhswish_ukernel__neon_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vhswish_ukernel__neon_u4, 4, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vhswish_ukernel__neon_u8, 8, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vhswish_ukernel__neon_u16, 16, false, float, struct xnn_f32_hswish_params, NULL) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vhswish_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vhswish_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vhswish_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vhswish_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vhswish_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vhswish_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vhswish_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vhswish_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_hswish_params, NULL) #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__sse_u4, 4, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__sse_u8, 8, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vhswish_ukernel__avx_u8, 8, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vhswish_ukernel__avx_u16, 16, false, float, struct xnn_f32_hswish_params, 
((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vhswish_ukernel__fma3_u8, 8, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vhswish_ukernel__fma3_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__sse_u4, 4, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__sse_u8, 8, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vhswish_ukernel__avx_u8, 8, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vhswish_ukernel__avx_u16, 16, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vhswish_ukernel__fma3_u8, 8, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vhswish_ukernel__fma3_u16, 16, false, float, struct xnn_f32_hswish_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vhswish_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vhswish_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vhswish_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vhswish_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_hswish_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_hswish_params, NULL) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasm_u1, 1, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasm_u2, 2, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasm_u4, 4, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasm_u1, 1, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasm_u2, 2, false, float, struct 
xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__wasm_u4, 4, false, float, struct xnn_f32_hswish_params, NULL) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__scalar_u1, 1, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__scalar_u2, 2, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__scalar_u4, 4, false, float, struct xnn_f32_hswish_params, ((xnn_init_f32_hswish_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__scalar_u1, 1, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__scalar_u2, 2, false, float, struct xnn_f32_hswish_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vhswish_ukernel__scalar_u4, 4, false, float, struct xnn_f32_hswish_params, NULL) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/f32-vlog/f32-vlog.h b/src/f32-vlog/f32-vlog.h index c3b5c18673bb..2fd347906276 100644 --- a/src/f32-vlog/f32-vlog.h +++ b/src/f32-vlog/f32-vlog.h @@ -15,56 +15,56 @@ #define XNN_DEFINED_UKERNEL #endif -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_log_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_log_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_log_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_rational_3_3_div_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_rational_3_3_div_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_rational_3_3_div_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_log_u1, 1, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_log_u2, 2, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_log_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_rational_3_3_div_u1, 1, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_rational_3_3_div_u2, 2, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_rational_3_3_div_u4, 4, false, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__scalar_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, NULL) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__sse2_rational_3_3_div_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL)) 
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__sse2_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__sse2_rational_3_3_div_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__sse2_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vlog_ukernel__avx2_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vlog_ukernel__avx2_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vlog_ukernel__avx2_rational_3_3_div_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vlog_ukernel__avx2_rational_3_3_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_div_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__sse2_rational_3_3_div_u4, 4, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__sse2_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__sse2_rational_3_3_div_u12, 12, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__sse2_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vlog_ukernel__avx2_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vlog_ukernel__avx2_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vlog_ukernel__avx2_rational_3_3_div_u24, 24, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vlog_ukernel__avx2_rational_3_3_div_u32, 32, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_div_u24, 24, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_div_u32, 32, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u24, 24, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vlog_ukernel__fma3_rational_3_3_nr_u32, 32, false, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_nr_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_nr_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_nr_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_nr_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u32, 32, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u48, 48, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_div_u64, 64, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_nr_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_nr_u32, 32, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_nr_u48, 48, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vlog_ukernel__avx512f_rational_3_3_nr_u64, 64, false, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vlog_ukernel__neon_rational_3_3_div_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vlog_ukernel__neon_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vlog_ukernel__neon_rational_3_3_div_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vlog_ukernel__neon_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vlog_ukernel__neon_rational_3_3_div_u4, 4, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vlog_ukernel__neon_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vlog_ukernel__neon_rational_3_3_div_u12, 12, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vlog_ukernel__neon_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__wasmsimd_rational_3_3_div_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__wasmsimd_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__wasmsimd_rational_3_3_div_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__wasmsimd_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__wasmsimd_rational_3_3_div_u4, 4, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__wasmsimd_rational_3_3_div_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__wasmsimd_rational_3_3_div_u12, 12, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vlog_ukernel__wasmsimd_rational_3_3_div_u16, 16, false, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
diff --git a/src/f32-vneg/f32-vneg.h b/src/f32-vneg/f32-vneg.h
index 491c2491e771..fe02e0d833dd 100644
--- a/src/f32-vneg/f32-vneg.h
+++ b/src/f32-vneg/f32-vneg.h
@@ -17,48 +17,48 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vneg_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vneg_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vneg_ukernel__neon_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vneg_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vneg_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vneg_ukernel__neon_u12, 12, false, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vneg_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vneg_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vneg_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vneg_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vneg_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vneg_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vneg_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vneg_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__sse2_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__sse2_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__sse2_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__sse2_u4, 4, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__sse2_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__sse2_u12, 12, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vneg_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vneg_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vneg_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vneg_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vneg_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vneg_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vneg_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vneg_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON)
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__wasmsimd_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__wasmsimd_u12, 12, false, float, struct xnn_f32_default_params, NULL)
 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vneg_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, NULL)
 #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS
 #undef XNN_DEFINED_UKERNEL_WITH_PARAMS
diff --git a/src/f32-vrelu/f32-vrelu.h b/src/f32-vrelu/f32-vrelu.h
index d6642db19ce9..2681ad220fd3 100644
--- a/src/f32-vrelu/f32-vrelu.h
+++ b/src/f32-vrelu/f32-vrelu.h
@@ -17,51 +17,51 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrelu_ukernel__neon_u4, 4, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrelu_ukernel__neon_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrelu_ukernel__neon_u4, 4, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrelu_ukernel__neon_u8, 8, false, float, struct xnn_f32_relu_params, NULL)
 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrelu_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrelu_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrelu_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrelu_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrelu_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrelu_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrelu_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrelu_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_relu_params, NULL)
 #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__sse_u4, 4, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__sse_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrelu_ukernel__avx_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrelu_ukernel__avx_u16, 16, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__sse_u4, 4, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__sse_u8, 8, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrelu_ukernel__avx_u8, 8, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrelu_ukernel__avx_u16, 16, false, float, struct xnn_f32_relu_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrelu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrelu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrelu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrelu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_relu_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__scalar_u1, 1, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__scalar_u2, 2, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__scalar_u4, 4, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__scalar_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__scalar_u1, 1, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__scalar_u2, 2, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__scalar_u4, 4, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__scalar_u8, 8, false, float, struct xnn_f32_relu_params, NULL)
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm_u1, 1, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm_u2, 2, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm_u4, 4, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm_u1, 1, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm_u2, 2, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm_u4, 4, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm_u8, 8, false, float, struct xnn_f32_relu_params, NULL)
 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasmsimd_u16, 16, false, float, struct xnn_f32_relu_params, NULL)
 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm32_shr_u1, 1, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm32_shr_u2, 2, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm32_shr_u4, 4, false, float, struct xnn_f32_relu_params, ((xnn_init_f32_relu_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm32_shr_u1, 1, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm32_shr_u2, 2, false, float, struct xnn_f32_relu_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrelu_ukernel__wasm32_shr_u4, 4, false, float, struct xnn_f32_relu_params, NULL)
 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
diff --git a/src/f32-vrnd/f32-vrndd.h b/src/f32-vrnd/f32-vrndd.h
index 87da6ddad847..5fe3c0340edc 100644
--- a/src/f32-vrnd/f32-vrndd.h
+++ b/src/f32-vrnd/f32-vrndd.h
@@ -17,41 +17,41 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndd_ukernel__neon_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndd_ukernel__neon_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndd_ukernel__neonv8_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndd_ukernel__neonv8_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndd_ukernel__neon_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndd_ukernel__neon_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndd_ukernel__neonv8_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndd_ukernel__neonv8_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndd_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndd_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndd_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndd_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndd_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndd_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndd_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndd_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__sse2_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__sse2_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndd_ukernel__sse41_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndd_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndd_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndd_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__sse2_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__sse2_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndd_ukernel__sse41_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndd_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndd_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndd_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndd_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndd_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndd_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndd_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__scalar_libm_u1, 1, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__scalar_libm_u2, 2, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__scalar_libm_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__scalar_libm_u1, 1, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__scalar_libm_u2, 2, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndd_ukernel__scalar_libm_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
 #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS
 #undef XNN_DEFINED_UKERNEL_WITH_PARAMS
diff --git a/src/f32-vrnd/f32-vrndne.h b/src/f32-vrnd/f32-vrndne.h
index 4cf05f41dcc1..75e62a55d087 100644
--- a/src/f32-vrnd/f32-vrndne.h
+++ b/src/f32-vrnd/f32-vrndne.h
@@ -17,41 +17,41 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndne_ukernel__neon_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndne_ukernel__neon_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndne_ukernel__neonv8_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndne_ukernel__neonv8_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndne_ukernel__neon_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndne_ukernel__neon_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndne_ukernel__neonv8_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndne_ukernel__neonv8_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndne_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndne_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndne_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndne_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndne_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndne_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndne_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndne_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__sse2_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__sse2_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndne_ukernel__sse41_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndne_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndne_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndne_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__sse2_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__sse2_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndne_ukernel__sse41_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndne_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndne_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndne_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndne_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndne_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndne_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndne_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__scalar_libm_u1, 1, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__scalar_libm_u2, 2, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__scalar_libm_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__scalar_libm_u1, 1, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__scalar_libm_u2, 2, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndne_ukernel__scalar_libm_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
 #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS
 #undef XNN_DEFINED_UKERNEL_WITH_PARAMS
diff --git a/src/f32-vrnd/f32-vrndu.h b/src/f32-vrnd/f32-vrndu.h
index efaeb9977bb4..795a21911750 100644
--- a/src/f32-vrnd/f32-vrndu.h
+++ b/src/f32-vrnd/f32-vrndu.h
@@ -17,41 +17,41 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndu_ukernel__neon_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndu_ukernel__neon_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndu_ukernel__neonv8_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndu_ukernel__neonv8_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndu_ukernel__neon_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndu_ukernel__neon_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndu_ukernel__neonv8_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndu_ukernel__neonv8_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndu_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndu_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndu_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndu_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndu_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndu_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndu_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndu_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__sse2_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__sse2_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndu_ukernel__sse41_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndu_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndu_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndu_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__sse2_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__sse2_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndu_ukernel__sse41_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndu_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndu_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndu_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndu_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndu_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__scalar_libm_u1, 1, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__scalar_libm_u2, 2, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__scalar_libm_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__scalar_libm_u1, 1, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__scalar_libm_u2, 2, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndu_ukernel__scalar_libm_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
 #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS
 #undef XNN_DEFINED_UKERNEL_WITH_PARAMS
diff --git a/src/f32-vrnd/f32-vrndz.h b/src/f32-vrnd/f32-vrndz.h
index 30fa23a9f6bf..0c5e4c5dc04d 100644
--- a/src/f32-vrnd/f32-vrndz.h
+++ b/src/f32-vrnd/f32-vrndz.h
@@ -17,41 +17,41 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndz_ukernel__neon_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndz_ukernel__neon_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndz_ukernel__neonv8_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndz_ukernel__neonv8_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndz_ukernel__neon_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrndz_ukernel__neon_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndz_ukernel__neonv8_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_v8, xnn_f32_vrndz_ukernel__neonv8_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndz_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndz_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndz_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndz_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndz_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndz_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndz_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrndz_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__sse2_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__sse2_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndz_ukernel__sse41_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndz_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndz_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndz_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__sse2_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__sse2_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndz_ukernel__sse41_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vrndz_ukernel__sse41_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndz_ukernel__avx_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrndz_ukernel__avx_u16, 16, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndz_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndz_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndz_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrndz_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_rnd_params, NULL)
 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__scalar_libm_u1, 1, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__scalar_libm_u2, 2, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__scalar_libm_u4, 4, false, float, struct xnn_f32_rnd_params, ((xnn_init_f32_rnd_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__scalar_libm_u1, 1, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__scalar_libm_u2, 2, false, float, struct xnn_f32_rnd_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrndz_ukernel__scalar_libm_u4, 4, false, float, struct xnn_f32_rnd_params, NULL)
 #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS
 #undef XNN_DEFINED_UKERNEL_WITH_PARAMS
diff --git a/src/f32-vrsqrt/f32-vrsqrt.h b/src/f32-vrsqrt/f32-vrsqrt.h
index 2e4e841e6aad..60cee282ea74 100644
--- a/src/f32-vrsqrt/f32-vrsqrt.h
+++ b/src/f32-vrsqrt/f32-vrsqrt.h
@@ -15,38 +15,38 @@
 #define XNN_DEFINED_UKERNEL
 #endif
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__scalar_rsqrt_u1, 1, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__scalar_rsqrt_u2, 2, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__scalar_rsqrt_u4, 4, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__scalar_rsqrt_u1, 1, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__scalar_rsqrt_u2, 2, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__scalar_rsqrt_u4, 4, false, float, struct xnn_f32_rsqrt_params, NULL)
 #if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsqrt_ukernel__rvv_rsqrt_u1v, 1, true, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsqrt_ukernel__rvv_rsqrt_u2v, 2, true, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsqrt_ukernel__rvv_rsqrt_u4v, 4, true, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsqrt_ukernel__rvv_rsqrt_u1v, 1, true, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsqrt_ukernel__rvv_rsqrt_u2v, 2, true, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vrsqrt_ukernel__rvv_rsqrt_u4v, 4, true, float, struct xnn_f32_rsqrt_params, NULL)
 #endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsqrt_ukernel__neon_rsqrt_u4, 4, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsqrt_ukernel__neon_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsqrt_ukernel__neon_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsqrt_ukernel__neon_rsqrt_u4, 4, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsqrt_ukernel__neon_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vrsqrt_ukernel__neon_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, NULL)
 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4, 4, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, 32, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, 32, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4, 4, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, 32, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, 8, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, 32, false, float, struct xnn_f32_rsqrt_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, 32, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, 64, false, float, struct xnn_f32_rsqrt_params, ((xnn_init_f32_rsqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, 16, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, 32, false, float, struct xnn_f32_rsqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, 64, false, float, struct xnn_f32_rsqrt_params, NULL)
 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/src/f32-vsigmoid/f32-vsigmoid.h b/src/f32-vsigmoid/f32-vsigmoid.h
index 106a5dc25866..a380d06005df 100644
--- a/src/f32-vsigmoid/f32-vsigmoid.h
+++ b/src/f32-vsigmoid/f32-vsigmoid.h
@@ -17,292 +17,292 @@
 #if XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_lut2048_p1_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__aarch64_neonfma_rr1_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
 #endif // XNN_ARCH_ARM64
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma,
xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, 
xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_lut2048_p1_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsigmoid_ukernel__neon_rr2_p5_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr1recps1fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, 
xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr1recps1fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u4, 4, false, float, struct 
xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut2048_p1_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr1recps1fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) 
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon_fma, xnn_f32_vsigmoid_ukernel__neonfma_rr1_p5_nr2recps_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) 
NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u40, 40, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u56, 56, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u72, 72, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u40, 40, false, float, struct 
xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u56, 56, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u72, 72, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u40, 40, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u56, 56, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u72, 72, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, 
xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u40, 40, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u56, 56, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u72, 72, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u40, 40, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u56, 56, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u72, 72, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u20, 20, false, float, struct 
xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__sse2_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_f32_vsigmoid_ukernel__sse41_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u40, 40, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, 
xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u56, 56, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u72, 72, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_div_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u40, 40, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u56, 56, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u72, 72, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u40, 40, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u56, 56, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u72, 72, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u80, 80, false, float, struct 
xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u40, 40, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u56, 56, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u72, 72, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr1fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u40, 40, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u56, 56, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u72, 72, false, float, struct xnn_f32_sigmoid_params, NULL) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_nr2fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) 
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u96, 96, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u112, 112, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u128, 128, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u96, 96, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u112, 112, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u128, 128, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL)) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, 
xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u96, 96, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u112, 112, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u128, 128, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u96, 96, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u112, 112, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u128, 128, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u96, 96, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u112, 112, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u128, 128, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u96, 96, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u112, 112, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u128, 128, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u96, 96, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u112, 112, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_div_u128, 128, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u96, 96, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u112, 112, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_lut16_p3_perm_scalef_nr1fma_u128, 128, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u96, 96, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u112, 112, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_div_u128, 128, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u96, 96, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u112, 112, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr1_p5_scalef_nr1fma_u128, 128, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u96, 96, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u112, 112, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u128, 128, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u32, 32, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u48, 48, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u64, 64, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u80, 80, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u96, 96, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u112, 112, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_nr1fma_u128, 128, false, float, struct xnn_f32_sigmoid_params, NULL)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
#if XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_fma_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_wasm_blendvps, xnn_f32_vsigmoid_ukernel__wasmblendvps_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_fma_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_lut64_p2_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u8, 8, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u12, 12, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u16, 16, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u20, 20, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__wasmrelaxedsimd_rr2_p5_div_u24, 24, false, float, struct xnn_f32_sigmoid_params, NULL)
#endif // XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_u1, 1, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_u2, 2, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut2048_p1_div_u1, 1, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut2048_p1_div_u2, 2, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut2048_p1_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_p5_div_u1, 1, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_p5_div_u2, 2, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, ((xnn_init_f32_sigmoid_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_u1, 1, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_u2, 2, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut2048_p1_div_u1, 1, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut2048_p1_div_u2, 2, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_lut2048_p1_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_p5_div_u1, 1, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_p5_div_u2, 2, false, float, struct xnn_f32_sigmoid_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsigmoid_ukernel__scalar_rr2_p5_div_u4, 4, false, float, struct xnn_f32_sigmoid_params, NULL)
#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS
#undef XNN_DEFINED_UKERNEL_WITH_PARAMS
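Each header touched above is an X-macro table: every XNN_UKERNEL_WITH_PARAMS row lists the required arch flags, the ukernel symbol, its batch tile, whether the tile counts vector-length multiples, the element type, the params struct, and the params-init function (now an untyped NULL). A minimal sketch of how such a table is typically consumed; the entry struct, stub kernel, and selection scaffolding here are illustrative assumptions, not XNNPACK's actual config code:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef void (*unary_ukernel_fn)(size_t batch, const void* input, void* output, const void* params);

struct unary_ukernel_entry {
  uint64_t arch_flags;   // ISA features required to run this kernel (0 = none)
  unary_ukernel_fn fn;   // the micro-kernel entry point
  size_t batch_tile;     // elements consumed per unrolled loop iteration
  bool vector_tile;      // true if batch_tile counts vector registers (the RVV u1v..u8v rows)
};

// Stub kernel so the sketch compiles on its own; real rows reference
// symbols like xnn_f32_vsigmoid_ukernel__scalar_rr2_p5_div_u1.
static void scalar_sigmoid_stub(size_t batch, const void* input, void* output, const void* params) {
  (void) batch; (void) input; (void) output; (void) params;
}

// Defining the macro and re-including the table header materializes one
// entry per row; the type, params, and init-fn columns are dropped here
// because this particular table only needs dispatch data.
#define XNN_UKERNEL_WITH_PARAMS(arch, name, tile, vector, type, params_t, init_fn) \
  {(uint64_t) (arch), (unary_ukernel_fn) (name), (tile), (vector)},
static const struct unary_ukernel_entry f32_sigmoid_kernels[] = {
  // #include "f32-vsigmoid/f32-vsigmoid.h"  // would expand the rows above
  XNN_UKERNEL_WITH_PARAMS(0, scalar_sigmoid_stub, 1, false, float, struct unused_params, NULL)
};
#undef XNN_UKERNEL_WITH_PARAMS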
diff --git a/src/f32-vsqr/f32-vsqr.h b/src/f32-vsqr/f32-vsqr.h
index 2a8766856641..6710168cd575 100644
--- a/src/f32-vsqr/f32-vsqr.h
+++ b/src/f32-vsqr/f32-vsqr.h
@@ -17,48 +17,48 @@
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqr_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqr_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqr_ukernel__neon_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqr_ukernel__neon_u4, 4, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqr_ukernel__neon_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqr_ukernel__neon_u12, 12, false, float, struct xnn_f32_default_params, NULL)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqr_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqr_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqr_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqr_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqr_ukernel__rvv_u1v, 1, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqr_ukernel__rvv_u2v, 2, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqr_ukernel__rvv_u4v, 4, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqr_ukernel__rvv_u8v, 8, true, float, struct xnn_f32_default_params, NULL)
#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__sse2_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__sse2_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__sse2_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__sse2_u4, 4, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__sse2_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__sse2_u12, 12, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqr_ukernel__avx_u24, 24, false, float, struct xnn_f32_default_params, NULL)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u16, 16, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u32, 32, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqr_ukernel__avx512f_u48, 48, false, float, struct xnn_f32_default_params, NULL)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsqr_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsqr_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsqr_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsqr_ukernel__hvx_u32, 32, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsqr_ukernel__hvx_u64, 64, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_hvx, xnn_f32_vsqr_ukernel__hvx_u128, 128, false, float, struct xnn_f32_default_params, NULL)
#endif // XNN_ENABLE_HVX && (XNN_ARCH_HEXAGON)
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__wasmsimd_u12, 12, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__wasmsimd_u4, 4, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__wasmsimd_u8, 8, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__wasmsimd_u12, 12, false, float, struct xnn_f32_default_params, NULL)
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, ((xnn_init_f32_default_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__scalar_u1, 1, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__scalar_u2, 2, false, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqr_ukernel__scalar_u4, 4, false, float, struct xnn_f32_default_params, NULL)
#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS
#undef XNN_DEFINED_UKERNEL_WITH_PARAMS
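The mechanical change in every table is dropping the per-operator function-pointer cast on the last column: previously each operator declared its own init-fn type, so a row had to spell ((xnn_init_f32_default_params_fn) NULL); with the parameterized API all initializers share one signature and a plain NULL converts implicitly. A hedged sketch of that shared signature, built from the argument shapes visible later in this patch (the typedef name and the union bodies shown are assumptions):

#include <stddef.h>
#include <stdint.h>

// Shapes mirror the initializer signatures in this patch.
struct xnn_quantization_params {
  int32_t zero_point;
  float scale;
};
union xnn_unary_params {
  struct { float alpha; } elu;
  // ...one member per parameterized operator
};
union xnn_unary_uparams;  // per-kernel parameter layouts, opaque here

// One initializer type for every unary operator; operators whose kernels
// need no runtime parameters put an uncast NULL in the table column.
typedef size_t (*xnn_init_unary_uparams_fn)(
    union xnn_unary_uparams* uparams,
    const union xnn_unary_params* op_params,
    const struct xnn_quantization_params* input_quantization,
    const struct xnn_quantization_params* output_quantization);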
diff --git a/src/f32-vsqrt/f32-vsqrt.h b/src/f32-vsqrt/f32-vsqrt.h
index f036b68c58f6..c534c0c79857 100644
--- a/src/f32-vsqrt/f32-vsqrt.h
+++ b/src/f32-vsqrt/f32-vsqrt.h
@@ -17,51 +17,51 @@
#if XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqrt_ukernel__aarch64_neon_sqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqrt_ukernel__aarch64_neon_sqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqrt_ukernel__aarch64_neon_sqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqrt_ukernel__aarch64_neon_sqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqrt_ukernel__aarch64_neon_sqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vsqrt_ukernel__aarch64_neon_sqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, NULL)
#endif // XNN_ARCH_ARM64
#if XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqrt_ukernel__rvv_sqrt_u1v, 1, true, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqrt_ukernel__rvv_sqrt_u2v, 2, true, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqrt_ukernel__rvv_sqrt_u4v, 4, true, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqrt_ukernel__rvv_sqrt_u8v, 8, true, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqrt_ukernel__rvv_sqrt_u1v, 1, true, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqrt_ukernel__rvv_sqrt_u2v, 2, true, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqrt_ukernel__rvv_sqrt_u4v, 4, true, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_f32_vsqrt_ukernel__rvv_sqrt_u8v, 8, true, float, struct xnn_f32_sqrt_params, NULL)
#endif // XNN_ENABLE_RISCV_VECTOR && (XNN_ARCH_RISCV)
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_sqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_sqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_sqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_rsqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_rsqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_rsqrt_u12, 12, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_sqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_sqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_sqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_rsqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_sqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_sqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_sqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_rsqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_rsqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__sse_rsqrt_u12, 12, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_sqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_sqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_sqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_rsqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, NULL)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, 48, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, 32, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, 48, false, float, struct xnn_f32_sqrt_params, NULL)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_u8, 8, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_u16, 16, false, float, struct xnn_f32_sqrt_params, NULL)
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__scalar_sqrt_u1, 1, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__scalar_sqrt_u2, 2, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__scalar_sqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, ((xnn_init_f32_sqrt_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__scalar_sqrt_u1, 1, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__scalar_sqrt_u2, 2, false, float, struct xnn_f32_sqrt_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vsqrt_ukernel__scalar_sqrt_u4, 4, false, float, struct xnn_f32_sqrt_params, NULL)
#ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS
#undef XNN_DEFINED_UKERNEL_WITH_PARAMS
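The vsqrt table distinguishes *_sqrt_* kernels, which use the ISA's square-root instruction directly, from *_rsqrt_* kernels, which start from a cheap reciprocal-square-root estimate and refine it. A scalar sketch of that standard refinement, not XNNPACK's vectorized code (the real kernels differ in estimate source and step count):

#include <math.h>

// Take a fast reciprocal square-root estimate, apply one Newton-Raphson
// step, then recover sqrt(x) as x * rsqrt(x).
static float sqrt_via_rsqrt(float x) {
  float r = 1.0f / sqrtf(x);           // stand-in for a hardware estimate such as _mm_rsqrt_ps
  r = r * (1.5f - 0.5f * x * r * r);   // Newton-Raphson step for 1/sqrt(x)
  return x * r;                        // sqrt(x) = x * (1/sqrt(x))
}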
diff --git a/src/f32-vtanh/f32-vtanh.h b/src/f32-vtanh/f32-vtanh.h
index 7db0a58f588a..e2be3caa00db 100644
--- a/src/f32-vtanh/f32-vtanh.h
+++ b/src/f32-vtanh/f32-vtanh.h
@@ -15,65 +15,65 @@
#define XNN_DEFINED_UKERNEL
#endif
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__scalar_rational_9_8_div_u1, 1, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__scalar_rational_9_8_div_u2, 2, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__scalar_rational_9_8_div_u4, 4, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__scalar_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__scalar_rational_9_8_div_u1, 1, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__scalar_rational_9_8_div_u2, 2, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__scalar_rational_9_8_div_u4, 4, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__scalar_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_div_u4, 4, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_div_u12, 12, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_nr_u4, 4, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_nr_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_nr_u12, 12, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_div_u24, 24, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_div_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_nr_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_nr_u24, 24, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_nr_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_div_u24, 24, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_div_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u24, 24, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_div_u4, 4, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_div_u12, 12, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_nr_u4, 4, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_nr_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_nr_u12, 12, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__sse2_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_div_u24, 24, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_div_u32, 32, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_nr_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_nr_u24, 24, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx, xnn_f32_vtanh_ukernel__avx_rational_9_8_nr_u32, 32, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_div_u24, 24, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_div_u32, 32, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u24, 24, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_fma3, xnn_f32_vtanh_ukernel__fma3_rational_9_8_nr_u32, 32, false, float, union xnn_f32_tanh_params, NULL)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ENABLE_AVX512F && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u48, 48, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u64, 64, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_nr_u32, 32, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_nr_u48, 48, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_nr_u64, 64, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u32, 32, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u48, 48, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_div_u64, 64, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_nr_u32, 32, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_nr_u48, 48, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512f, xnn_f32_vtanh_ukernel__avx512f_rational_9_8_nr_u64, 64, false, float, union xnn_f32_tanh_params, NULL)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__wasmsimd_rational_9_8_div_u4, 4, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__wasmsimd_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__wasmsimd_rational_9_8_div_u12, 12, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__wasmsimd_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__wasmsimd_rational_9_8_div_u4, 4, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__wasmsimd_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__wasmsimd_rational_9_8_div_u12, 12, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_f32_vtanh_ukernel__wasmsimd_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_div_u4, 4, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_div_u12, 12, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_nr_u4, 4, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_nr_u8, 8, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_nr_u12, 12, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, ((xnn_init_f32_tanh_params_fn) NULL))
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_div_u4, 4, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_div_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_div_u12, 12, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_div_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_nr_u4, 4, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_nr_u8, 8, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_nr_u12, 12, false, float, union xnn_f32_tanh_params, NULL)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_f32_vtanh_ukernel__neon_rational_9_8_nr_u16, 16, false, float, union xnn_f32_tanh_params, NULL)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
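The vtanh kernel names encode the approximation: rational_9_8 means tanh(x) is computed as a ratio p(x)/q(x) with a degree-9 odd numerator and degree-8 even denominator, the _div variants using a true division and the _nr variants a reciprocal estimate refined by Newton-Raphson. A structural sketch with the real coefficients elided; the placeholder evaluators below are mine and exist only so the div/nr split compiles:

// Placeholders standing in for the actual polynomial evaluations.
static float eval_p9(float x) { return x; }               // degree-9 odd polynomial (coefficients omitted)
static float eval_q8(float x) { (void) x; return 1.0f; }  // degree-8 even polynomial (coefficients omitted)

static float tanh_rational_div(float x) {
  return eval_p9(x) / eval_q8(x);   // _div flavor: one true division
}

static float tanh_rational_nr(float x, float recip_q_estimate) {
  const float q = eval_q8(x);
  float r = recip_q_estimate;       // e.g. from _mm_rcp_ps or vrecpeq_f32
  r = r * (2.0f - q * r);           // Newton-Raphson step for 1/q
  return eval_p9(x) * r;            // _nr flavor: multiply by the refined reciprocal
}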
((input_scale_div << 9) >> 18 | UINT16_C(0x4000)); - const float scale_ratio = input_scale / output_scale; + const uint32_t input_scale_div = float_as_uint32(input_quantization->scale * divisor1); + params->qs8_hswish.scalar.input_scale_div_exp = (int32_t) (input_scale_div >> 23) - 126; + params->qs8_hswish.scalar.input_scale_div_mantissa = (int32_t) ((input_scale_div << 9) >> 18 | UINT16_C(0x4000)); + const float scale_ratio = input_quantization->scale / output_quantization->scale; assert(scale_ratio >= 0x1.0p-8f); assert(scale_ratio < 0x1.0p+7f); - params->scalar.scale_ratio = (int32_t) lrintf(scale_ratio * 256.0f); - return sizeof(params->scalar); + params->qs8_hswish.scalar.scale_ratio = (int32_t) lrintf(scale_ratio * 256.0f); + return sizeof(params->qs8_hswish.scalar); } #if XNN_ARCH_X86 || XNN_ARCH_X86_64 size_t xnn_init_qs8_hswish_sse2_params( - union xnn_qs8_hswish_params params[XNN_MIN_ELEMENTS(1)], - int16_t input_zero_point, - int16_t output_zero_point, - float input_scale, - float output_scale) -{ - const int16_t input_scale_div = (int16_t) -lrintf(256.0f * input_scale / 6.0f); - const float scale_ratio = input_scale / output_scale; + union xnn_unary_uparams* params, + const union xnn_unary_params* op_params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization) +{ + const int16_t input_scale_div = (int16_t) -lrintf(256.0f * input_quantization->scale / 6.0f); + const float scale_ratio = input_quantization->scale / output_quantization->scale; assert(scale_ratio >= 0x1.0p-8f); assert(scale_ratio < 0x1.0p+7f); const int16_t scale_ratio_param = (int16_t) -lrintf(scale_ratio * 256.0f); - params->sse2.input_zero_point = input_zero_point; - params->sse2.output_zero_point = output_zero_point; - params->sse2.input_scale_div = input_scale_div; - params->sse2.scale_ratio = scale_ratio_param; - return sizeof(params->sse2); + params->qs8_hswish.sse2.input_zero_point = input_quantization->zero_point; + params->qs8_hswish.sse2.output_zero_point = output_quantization->zero_point; + params->qs8_hswish.sse2.input_scale_div = input_scale_div; + params->qs8_hswish.sse2.scale_ratio = scale_ratio_param; + return sizeof(params->qs8_hswish.sse2); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 size_t xnn_init_qu8_hswish_scalar_params( - union xnn_qu8_hswish_params params[XNN_MIN_ELEMENTS(1)], - int16_t input_zero_point, - int16_t output_zero_point, - float input_scale, - float output_scale) -{ - params->scalar.input_zero_point = (uint32_t) input_zero_point; - params->scalar.output_zero_point= (int32_t) output_zero_point; + union xnn_unary_uparams* params, + const union xnn_unary_params* op_params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization) +{ + params->qu8_hswish.scalar.input_zero_point = (uint32_t) input_quantization->zero_point; + params->qu8_hswish.scalar.output_zero_point= (int32_t) output_quantization->zero_point; const float divisor1 = 0x1.555556p-10f; - const uint32_t input_scale_div = float_as_uint32(input_scale * divisor1); - params->scalar.input_scale_div_exp = (int32_t) (input_scale_div >> 23) - 126; - params->scalar.input_scale_div_mantissa = (int32_t) ((input_scale_div << 9) >> 18 | UINT16_C(0x4000)); - const float scale_ratio = input_scale / output_scale; + const uint32_t input_scale_div = float_as_uint32(input_quantization->scale * divisor1); + params->qu8_hswish.scalar.input_scale_div_exp = (int32_t) (input_scale_div >> 23) - 126; + 
params->qu8_hswish.scalar.input_scale_div_mantissa = (int32_t) ((input_scale_div << 9) >> 18 | UINT16_C(0x4000)); + const float scale_ratio = input_quantization->scale / output_quantization->scale; assert(scale_ratio >= 0x1.0p-8f); assert(scale_ratio < 0x1.0p+7f); - params->scalar.scale_ratio = (int32_t) lrintf(scale_ratio * 256.0f); - return sizeof(params->scalar); + params->qu8_hswish.scalar.scale_ratio = (int32_t) lrintf(scale_ratio * 256.0f); + return sizeof(params->qu8_hswish.scalar); } #if XNN_ARCH_X86 || XNN_ARCH_X86_64 size_t xnn_init_qu8_hswish_sse2_params( - union xnn_qu8_hswish_params params[XNN_MIN_ELEMENTS(1)], - int16_t input_zero_point, - int16_t output_zero_point, - float input_scale, - float output_scale) -{ - const int16_t input_scale_div = (int16_t) -lrintf(256.0f * input_scale / 6.0f); - const float scale_ratio = input_scale / output_scale; + union xnn_unary_uparams* params, + const union xnn_unary_params* op_params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization) +{ + const int16_t input_scale_div = (int16_t) -lrintf(256.0f * input_quantization->scale / 6.0f); + const float scale_ratio = input_quantization->scale / output_quantization->scale; assert(scale_ratio >= 0x1.0p-8f); assert(scale_ratio < 0x1.0p+7f); const int16_t scale_ratio_param = (int16_t) -lrintf(scale_ratio * 256.0f); - params->sse2.input_zero_point = input_zero_point; - params->sse2.output_zero_point = output_zero_point; - params->sse2.input_scale_div = input_scale_div; - params->sse2.scale_ratio = scale_ratio_param; - return sizeof(params->sse2); + params->qu8_hswish.sse2.input_zero_point = input_quantization->zero_point; + params->qu8_hswish.sse2.output_zero_point = output_quantization->zero_point; + params->qu8_hswish.sse2.input_scale_div = input_scale_div; + params->qu8_hswish.sse2.scale_ratio = scale_ratio_param; + return sizeof(params->qu8_hswish.sse2); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 size_t xnn_init_f16_elu_scalar_params( - struct xnn_f16_elu_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 prescale, - xnn_float16 alpha, - xnn_float16 beta) -{ - params->scalar.prescale = prescale; - params->scalar.alpha = alpha; - params->scalar.beta = beta; - return sizeof(params->scalar); + union xnn_unary_uparams* params, + const union xnn_unary_params* op_params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization) +{ + params->f16_elu.scalar.prescale = xnn_float16_from_float(1.0f); + params->f16_elu.scalar.alpha = xnn_float16_from_float(op_params->elu.alpha); + params->f16_elu.scalar.beta = xnn_float16_from_float(1.0f); + return sizeof(params->f16_elu); } size_t xnn_init_f32_elu_scalar_params( - struct xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)], - float prescale, - float alpha, - float beta) -{ - params->scalar.prescale = prescale; - params->scalar.alpha = alpha; - params->scalar.beta = beta; - return sizeof(params->scalar); + union xnn_unary_uparams* params, + const union xnn_unary_params* op_params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization) +{ + params->f32_elu.scalar.prescale = 1.0f; + params->f32_elu.scalar.alpha = op_params->elu.alpha; + params->f32_elu.scalar.beta = 1.0f; + return sizeof(params->f32_elu); } size_t xnn_init_f16_lrelu_scalar_params( - struct xnn_f16_lrelu_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 slope) + union xnn_unary_uparams* params, + const 
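Every initializer above now has the same shape: it reads operator parameters from xnn_unary_params and quantization from xnn_quantization_params, fills the shared xnn_unary_uparams union, and returns the number of bytes it wrote. A minimal caller-side sketch (the field values are illustrative; only the four-argument signature is fixed by this patch):

  union xnn_unary_uparams uparams;
  const union xnn_unary_params op_params = {.elu = {.alpha = 0.5f}};
  const struct xnn_quantization_params quant = {.zero_point = 0, .scale = 1.0f};
  // One function-pointer type now covers all unary microparam initializers:
  const size_t written = xnn_init_f32_elu_scalar_params(&uparams, &op_params, &quant, &quant);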
 size_t xnn_init_f16_lrelu_scalar_params(
-    struct xnn_f16_lrelu_params params[XNN_MIN_ELEMENTS(1)],
-    xnn_float16 slope)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
-  params->scalar.slope = slope;
-  return sizeof(params->scalar);
+  params->f16_lrelu.scalar.slope =
+      xnn_float16_from_float(op_params->leaky_relu.negative_slope);
+  return sizeof(params->f16_lrelu);
 }

 size_t xnn_init_f32_lrelu_scalar_params(
-    struct xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
-    float slope)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
-  params->scalar.slope = slope;
-  return sizeof(params->scalar);
+  params->f32_lrelu.scalar.slope = op_params->leaky_relu.negative_slope;
+  return sizeof(params->f32_lrelu);
 }

 size_t xnn_init_qs8_lrelu_scalar_params(
-    struct xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
-    float positive_scale,
-    float negative_scale,
-    int8_t input_zero_point,
-    int8_t output_zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
+  const float negative_slope = op_params->leaky_relu.negative_slope;
+  const float input_scale = input_quantization->scale;
+  const float output_scale = output_quantization->scale;
+  const float positive_scale = input_scale / output_scale;
+  const float negative_scale = positive_scale * negative_slope;
+
   assert(positive_scale >= 0x1.0p-8f);
   assert(positive_scale <= 0x1.0p+7f);
   assert(negative_scale <= 0x1.0p+7f);
@@ -1607,20 +1614,25 @@ size_t xnn_init_qs8_lrelu_scalar_params(
   assert(negative_multiplier <= 32768L);
   assert(negative_multiplier >= -32767L);
   assert(negative_multiplier != 0L);
-  params->scalar.input_zero_point = input_zero_point;
-  params->scalar.positive_multiplier = positive_multiplier;
-  params->scalar.negative_multiplier = negative_multiplier;
-  params->scalar.output_zero_point = output_zero_point;
-  return sizeof(params->scalar);
+  params->qs8_lrelu.scalar.input_zero_point = input_quantization->zero_point;
+  params->qs8_lrelu.scalar.positive_multiplier = positive_multiplier;
+  params->qs8_lrelu.scalar.negative_multiplier = negative_multiplier;
+  params->qs8_lrelu.scalar.output_zero_point = output_quantization->zero_point;
+  return sizeof(params->qs8_lrelu);
 }

 size_t xnn_init_qu8_lrelu_scalar_params(
-    struct xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
-    float positive_scale,
-    float negative_scale,
-    uint8_t input_zero_point,
-    uint8_t output_zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
+  const float negative_slope = op_params->leaky_relu.negative_slope;
+  const float input_scale = input_quantization->scale;
+  const float output_scale = output_quantization->scale;
+  const float positive_scale = input_scale / output_scale;
+  const float negative_scale = positive_scale * negative_slope;
+
   assert(positive_scale >= 0x1.0p-8f);
   assert(positive_scale <= 0x1.0p+7f);
   assert(negative_scale <= 0x1.0p+7f);
@@ -1634,11 +1646,59 @@ size_t xnn_init_qu8_lrelu_scalar_params(
   assert(negative_multiplier <= 32768L);
   assert(negative_multiplier >= -32767L);
   assert(negative_multiplier != 0L);
-  params->scalar.input_zero_point = input_zero_point;
-  params->scalar.positive_multiplier = positive_multiplier;
-  params->scalar.negative_multiplier = negative_multiplier;
-  params->scalar.output_zero_point = output_zero_point;
-  return sizeof(params->scalar);
+  params->qu8_lrelu.scalar.input_zero_point = input_quantization->zero_point;
+  params->qu8_lrelu.scalar.positive_multiplier = positive_multiplier;
+  params->qu8_lrelu.scalar.negative_multiplier = negative_multiplier;
+  params->qu8_lrelu.scalar.output_zero_point = output_quantization->zero_point;
+  return sizeof(params->qu8_lrelu);
+}
+
+size_t xnn_init_qu8_clamp_scalar_params(
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
+{
+  assert(input_quantization->scale == output_quantization->scale);
+  assert(input_quantization->zero_point == output_quantization->zero_point);
+  params->u8_minmax.scalar.min = xnn_qu8_quantize(op_params->clamp.min, output_quantization->scale, output_quantization->zero_point);
+  params->u8_minmax.scalar.max = xnn_qu8_quantize(op_params->clamp.max, output_quantization->scale, output_quantization->zero_point);
+  return sizeof(params->u8_minmax);
+}
+
+size_t xnn_init_qs8_clamp_scalar_params(
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
+{
+  assert(input_quantization->scale == output_quantization->scale);
+  assert(input_quantization->zero_point == output_quantization->zero_point);
+  params->s8_minmax.scalar.min = xnn_qs8_quantize(op_params->clamp.min, output_quantization->scale, output_quantization->zero_point);
+  params->s8_minmax.scalar.max = xnn_qs8_quantize(op_params->clamp.max, output_quantization->scale, output_quantization->zero_point);
+  return sizeof(params->s8_minmax);
+}
+
+size_t xnn_init_f16_clamp_scalar_params(
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
+{
+  params->f16_minmax.scalar.min = xnn_float16_from_float(op_params->clamp.min);
+  params->f16_minmax.scalar.max = xnn_float16_from_float(op_params->clamp.max);
+  return sizeof(params->f16_minmax);
+}
+
+size_t xnn_init_f32_clamp_scalar_params(
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
+{
+  params->f32_minmax.scalar.min = op_params->clamp.min;
+  params->f32_minmax.scalar.max = op_params->clamp.max;
+  return sizeof(params->f32_minmax);
 }
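The new quantized clamp initializers fold the real-valued clamp bounds into the integer min/max that the existing s8/u8 minmax kernels consume. A worked example, assuming xnn_qu8_quantize/xnn_qs8_quantize round x / scale + zero_point and saturate to the 8-bit range (the helper below is a hypothetical stand-in, not the library's implementation):

  // With scale = 0.5f and zero_point = 10:
  //   clamp.min = -1.0f  ->  lrintf(-1.0f / 0.5f) + 10 =  8
  //   clamp.max =  1.0f  ->  lrintf( 1.0f / 0.5f) + 10 = 12
  static inline int8_t example_qs8_quantize(float x, float scale, int32_t zero_point) {
    const long q = lrintf(x / scale) + zero_point;
    return (int8_t) (q < INT8_MIN ? INT8_MIN : q > INT8_MAX ? INT8_MAX : q);
  }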

 size_t xnn_init_s8_minmax_scalar_params(
@@ -1924,23 +1984,25 @@ size_t xnn_init_qs8_mul_minmax_rndnu_neon_params(
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

 size_t xnn_init_f16_qs8_cvt_scalar_params(
-    struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    xnn_float16 scale,
-    int8_t output_zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
-  params->scalar.scale = scale;
-  params->scalar.output_zero_point = output_zero_point;
-  return sizeof(params->scalar);
+  params->f16_qs8_cvt.scalar.scale = xnn_float16_from_float(1.0f / output_quantization->scale);
+  params->f16_qs8_cvt.scalar.output_zero_point = output_quantization->zero_point;
+  return sizeof(params->f16_qs8_cvt);
 }

 size_t xnn_init_f32_qs8_cvt_scalar_params(
-    struct xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    float scale,
-    int8_t output_zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
-  params->scalar.scale = scale;
-  params->scalar.output_zero_point = (int16_t) output_zero_point;
-  return sizeof(params->scalar);
+  params->f32_qs8_cvt.scalar.scale = 1.0f / output_quantization->scale;
+  params->f32_qs8_cvt.scalar.output_zero_point = (int16_t) output_quantization->zero_point;
+  return sizeof(params->f32_qs8_cvt);
 }

 size_t xnn_init_qs8_reduce_minmax_scalar_params(
@@ -1972,109 +2034,121 @@ size_t xnn_init_qu8_reduce_minmax_scalar_params(
 }

 size_t xnn_init_f32_qu8_cvt_scalar_params(
-    struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    float scale,
-    uint8_t output_zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
-  params->scalar.scale = scale;
-  params->scalar.output_zero_point = (int16_t) output_zero_point;
-  return sizeof(params->scalar);
+  params->f32_qu8_cvt.scalar.scale = 1.0f / output_quantization->scale;
+  params->f32_qu8_cvt.scalar.output_zero_point = (int16_t) output_quantization->zero_point;
+  return sizeof(params->f32_qu8_cvt);
 }

 size_t xnn_init_s32_f32_cvt_scalar_params(
-    struct xnn_s32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    int32_t zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
-  params->scalar.zero_point = zero_point;
-  return sizeof(params->scalar);
+  params->s32_f32_cvt.scalar.zero_point = input_quantization->zero_point;
+  return sizeof(params->s32_f32_cvt);
 }

 size_t xnn_init_u32_f32_cvt_scalar_params(
-    struct xnn_u32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    int32_t zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
-  params->scalar.zero_point = zero_point;
-  return sizeof(params->scalar);
+  params->u32_f32_cvt.scalar.zero_point = input_quantization->zero_point;
+  return sizeof(params->u32_f32_cvt);
 }

 size_t xnn_init_qs8_cvt_scalar_params(
-    struct xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    float input_output_scale,
-    int8_t input_zero_point,
-    int8_t output_zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
+  const float input_output_scale = input_quantization->scale / output_quantization->scale;
   assert(input_output_scale >= 0x1.0p-8);
   assert(input_output_scale <= 0x1.0p+7);
   const long multiplier = lrintf(256.0f * input_output_scale);
   assert(multiplier >= 1L);
   assert(multiplier <= 32768L);
-  params->scalar.input_zero_point = (int16_t) input_zero_point;
-  params->scalar.multiplier = (int32_t) multiplier;
-  params->scalar.output_zero_point = (int16_t) output_zero_point;
-  return sizeof(params->scalar);
+  params->qs8_cvt.scalar.input_zero_point = (int16_t) input_quantization->zero_point;
+  params->qs8_cvt.scalar.multiplier = (int32_t) multiplier;
+  params->qs8_cvt.scalar.output_zero_point = (int16_t) output_quantization->zero_point;
+  return sizeof(params->qs8_cvt);
 }

 size_t xnn_init_qs16_qs8_cvt_scalar_params(
-    struct xnn_qs16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    float input_output_scale,
-    int8_t output_zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
+  const float input_output_scale = input_quantization->scale / output_quantization->scale;
   assert(input_output_scale >= 0x1.0p-16);
   assert(input_output_scale <= 0x1.0p+8);
   const long multiplier = lrintf(65536.0f * input_output_scale);
   assert(multiplier >= 1L);
   assert(multiplier <= 0x01000000L);
-  params->scalar.multiplier = (int32_t) multiplier;
-  params->scalar.output_zero_point = (int32_t) output_zero_point;
-  return sizeof(params->scalar);
+  params->qs16_qs8_cvt.scalar.multiplier = (int32_t) multiplier;
+  params->qs16_qs8_cvt.scalar.output_zero_point = (int32_t) output_quantization->zero_point;
+  return sizeof(params->qs16_qs8_cvt);
 }

 size_t xnn_init_qs8_f32_cvt_scalar_params(
-    struct xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    float scale,
-    int8_t zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
-  params->scalar.zero_point = (int32_t) zero_point;
-  params->scalar.scale = scale;
-  return sizeof(params->scalar);
+  params->qs8_f32_cvt.scalar.zero_point = (int32_t) input_quantization->zero_point;
+  params->qs8_f32_cvt.scalar.scale = input_quantization->scale;
+  return sizeof(params->qs8_f32_cvt);
 }

 size_t xnn_init_qs8_f16_cvt_scalar_params(
-    struct xnn_qs8_f16_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    xnn_float16 scale,
-    int8_t zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
-  params->scalar.zero_point = (int16_t) zero_point;
-  params->scalar.scale = scale;
-  return sizeof(params->scalar);
+  params->qs8_f16_cvt.scalar.zero_point = (int16_t) input_quantization->zero_point;
+  params->qs8_f16_cvt.scalar.scale = xnn_float16_from_float(input_quantization->scale);
+  return sizeof(params->qs8_f16_cvt);
 }

 size_t xnn_init_qu8_cvt_scalar_params(
-    struct xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    float input_output_scale,
-    uint8_t input_zero_point,
-    uint8_t output_zero_point)
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
 {
+  const float input_output_scale = input_quantization->scale / output_quantization->scale;
   assert(input_output_scale >= 0x1.0p-8);
   assert(input_output_scale <= 0x1.0p+7);
   const long multiplier = lrintf(256.0f * input_output_scale);
   assert(multiplier >= 1L);
   assert(multiplier <= 32768L);
-  params->scalar.input_zero_point = (uint16_t) input_zero_point;
-  params->scalar.multiplier = (int32_t) multiplier;
-  params->scalar.output_zero_point = (int16_t) output_zero_point;
-  return sizeof(params->scalar);
+  params->qu8_cvt.scalar.input_zero_point = (uint16_t) input_quantization->zero_point;
+  params->qu8_cvt.scalar.multiplier = (int32_t) multiplier;
+  params->qu8_cvt.scalar.output_zero_point = (int16_t) output_quantization->zero_point;
+  return sizeof(params->qu8_cvt);
 }

 size_t xnn_init_qu8_f32_cvt_scalar_params(
-    struct xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
-    float scale,
-    uint8_t zero_point)
-{
-  params->scalar.zero_point = (int32_t) zero_point;
-  params->scalar.scale = scale;
-  return sizeof(params->scalar);
+    union xnn_unary_uparams* params,
+    const union xnn_unary_params* op_params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization)
+{
+  params->qu8_f32_cvt.scalar.zero_point = (int32_t) input_quantization->zero_point;
+  params->qu8_f32_cvt.scalar.scale = input_quantization->scale;
+  return sizeof(params->qu8_f32_cvt);
 }
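The requantizing converts now derive their Q8.8 fixed-point multiplier directly from the two quantization structs, which is why the asserts pin input_output_scale to [2^-8, 2^7]. A worked example (the requantization line is illustrative; the actual kernels add rounding before the shift):

  // input scale 0.02, output scale 0.04  ->  input_output_scale = 0.5
  // multiplier = lrintf(256.0f * 0.5f) = 128, i.e. 0.5 in Q8.8
  // output = output_zero_point + (((input - input_zero_point) * 128) >> 8)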
diff --git a/src/operator-delete.c b/src/operator-delete.c
index d43fa96666df..36ce053a2b8a 100644
--- a/src/operator-delete.c
+++ b/src/operator-delete.c
@@ -12,34 +12,16 @@
 #include "xnnpack/allocator.h"
 #include "xnnpack/log.h"
 #include "xnnpack/operator.h"
+#include "xnnpack/operator-utils.h"
 #include "xnnpack/params.h"

 enum xnn_status xnn_delete_operator(xnn_operator_t op)
 {
-  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
-    xnn_log_error("failed to delete operator: XNNPACK is not initialized");
-    return xnn_status_uninitialized;
+  enum xnn_status status = xnn_destroy_operator(op);
+  if (status != xnn_status_success) {
+    return status;
   }
-
-  if (op == NULL) {
-    return xnn_status_invalid_parameter;
-  }
-
-  xnn_release_memory(op->indirection_buffer);
-  if (op->weights_cache == NULL) {
-    xnn_release_simd_memory(op->packed_weights.pointer);
-  }
-  xnn_release_simd_memory(op->zero_buffer);
-  if (op->zero_buffers) {
-    for (size_t i = 1; i < op->batch_size; ++i) {
-      xnn_release_simd_memory(op->zero_buffers[i]);
-    }
-    xnn_release_memory(op->zero_buffers);
-  }
-  xnn_release_memory(op->pixelwise_buffer);
-  xnn_release_memory(op->subconvolution_buffer);
-  xnn_release_simd_memory(op->lookup_table);
   xnn_release_simd_memory(op);
-  return xnn_status_success;
+  return status;
 }
diff --git a/src/operator-run.c b/src/operator-run.c
index 7bd27ebddec2..c665c50a706a 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -2093,22 +2093,24 @@ void xnn_compute_contiguous_reduce(
   if (context->s32_f32_cvt_ukernel) {
     struct xnn_s32_f32_cvt_params s32_f32_cvt_params;
-    xnn_init_s32_f32_cvt_scalar_params(&s32_f32_cvt_params, context->params.qs8_mean.scalar.num_elements * (int32_t) context->params.qs8_mean.scalar.input_zero_point);
+    s32_f32_cvt_params.scalar.zero_point = context->params.qs8_mean.scalar.num_elements * (int32_t) context->params.qs8_mean.scalar.input_zero_point;
     context->s32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
-                                 workspace_ptr, /*params=*/&s32_f32_cvt_params);
+                                 workspace_ptr, (union xnn_unary_uparams*) &s32_f32_cvt_params);
     struct xnn_f32_qs8_cvt_params cvt_params;
-    xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point);
+    cvt_params.scalar.scale = context->params.qs8_mean.scalar.scale;
+    cvt_params.scalar.output_zero_point = context->params.qs8_mean.scalar.output_zero_point;
     context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
-                         output_ptr, /*params=*/&cvt_params);
+                         output_ptr, (union xnn_unary_uparams*) &cvt_params);
   } else if (context->u32_f32_cvt_ukernel) {
     struct xnn_u32_f32_cvt_params u32_f32_cvt_params;
-    xnn_init_u32_f32_cvt_scalar_params(&u32_f32_cvt_params, context->params.qu8_mean.scalar.num_elements * (int32_t) context->params.qu8_mean.scalar.input_zero_point);
+    u32_f32_cvt_params.scalar.zero_point = context->params.qu8_mean.scalar.num_elements * (int32_t) context->params.qu8_mean.scalar.input_zero_point;
     context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
-                                 workspace_ptr, /*params=*/&u32_f32_cvt_params);
+                                 workspace_ptr, (union xnn_unary_uparams*) &u32_f32_cvt_params);
     struct xnn_f32_qu8_cvt_params cvt_params;
-    xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point);
+    cvt_params.scalar.scale = context->params.qu8_mean.scalar.scale;
+    cvt_params.scalar.output_zero_point = context->params.qu8_mean.scalar.output_zero_point;
     context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
-                         output_ptr, /*params=*/&cvt_params);
+                         output_ptr, (union xnn_unary_uparams*) &cvt_params);
   } else {
     context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/NULL);
   }
@@ -2174,22 +2176,24 @@ void xnn_compute_discontiguous_reduce(
   if (context->s32_f32_cvt_ukernel) {
     struct xnn_s32_f32_cvt_params s32_f32_cvt_params;
-    xnn_init_s32_f32_cvt_scalar_params(&s32_f32_cvt_params, context->params.qs8_mean.scalar.num_elements * (int32_t) context->params.qs8_mean.scalar.input_zero_point);
+    s32_f32_cvt_params.scalar.zero_point = context->params.qs8_mean.scalar.num_elements * (int32_t) context->params.qs8_mean.scalar.input_zero_point;
     context->s32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
-                                 workspace_ptr, /*params=*/&s32_f32_cvt_params);
+                                 workspace_ptr, (union xnn_unary_uparams*) &s32_f32_cvt_params);
     struct xnn_f32_qs8_cvt_params cvt_params;
-    xnn_init_f32_qs8_cvt_scalar_params(&cvt_params, context->params.qs8_mean.scalar.scale, context->params.qs8_mean.scalar.output_zero_point);
+    cvt_params.scalar.scale = context->params.qs8_mean.scalar.scale;
+    cvt_params.scalar.output_zero_point = context->params.qs8_mean.scalar.output_zero_point;
     context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
-                         output_ptr, /*params=*/&cvt_params);
+                         output_ptr, (union xnn_unary_uparams*) &cvt_params);
   } else if (context->u32_f32_cvt_ukernel) {
     struct xnn_u32_f32_cvt_params u32_f32_cvt_params;
-    xnn_init_u32_f32_cvt_scalar_params(&u32_f32_cvt_params, context->params.qu8_mean.scalar.num_elements * (int32_t) context->params.qu8_mean.scalar.input_zero_point);
+    u32_f32_cvt_params.scalar.zero_point = context->params.qu8_mean.scalar.num_elements * (int32_t) context->params.qu8_mean.scalar.input_zero_point;
     context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
-                                 workspace_ptr, /*params=*/&u32_f32_cvt_params);
+                                 workspace_ptr, (union xnn_unary_uparams*) &u32_f32_cvt_params);
     struct xnn_f32_qu8_cvt_params cvt_params;
-    xnn_init_f32_qu8_cvt_scalar_params(&cvt_params, context->params.qu8_mean.scalar.scale, context->params.qu8_mean.scalar.output_zero_point);
+    cvt_params.scalar.scale = context->params.qu8_mean.scalar.scale;
+    cvt_params.scalar.output_zero_point = context->params.qu8_mean.scalar.output_zero_point;
     context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
-                         output_ptr, /*params=*/&cvt_params);
+                         output_ptr, (union xnn_unary_uparams*) &cvt_params);
   } else {
     context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr, output_ptr, /*params=*/NULL);
   }
@@ -2223,8 +2227,9 @@ void xnn_compute_f16_qd8_convert(
   context->quantization_params[batch_index] = xnn_f16_qd8_asymmetric_quantization_params(minmax[0], minmax[1], &f16_scale);

   struct xnn_f16_qs8_cvt_params params;
-  context->init_params(&params, f16_scale, context->quantization_params[batch_index].zero_point);
-  context->convert_ukernel(n, input, output, &params);
+  params.scalar.scale = f16_scale;
+  params.scalar.output_zero_point = context->quantization_params[batch_index].zero_point;
+  context->convert_ukernel(n, input, output, (union xnn_unary_uparams*) &params);
 }

 void xnn_compute_f32_qd8_convert(
@@ -2242,8 +2247,9 @@
   context->quantization_params[batch_index] = xnn_f32_qd8_asymmetric_quantization_params(minmax[0], minmax[1]);

   struct xnn_f32_qs8_cvt_params params;
-  context->init_params(&params, 1.0f / context->quantization_params[batch_index].inv_scale, context->quantization_params[batch_index].zero_point);
-  context->convert_ukernel(n, input, output, &params);
+  params.scalar.scale = 1.0f / context->quantization_params[batch_index].inv_scale;
+  params.scalar.output_zero_point = context->quantization_params[batch_index].zero_point;
+  context->convert_ukernel(n, input, output, (union xnn_unary_uparams*) &params);
 }

 void xnn_compute_x32_pack_lh(
diff --git a/src/operator-utils.c b/src/operator-utils.c
index 9c11701181b4..72b0a105ea3f 100644
--- a/src/operator-utils.c
+++ b/src/operator-utils.c
@@ -148,6 +148,81 @@ uint32_t xnn_get_heuristic_mr_igemm(
   return best_mr;
 }

+enum xnn_status xnn_destroy_operator(xnn_operator_t op)
+{
+  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
+    xnn_log_error("failed to delete operator: XNNPACK is not initialized");
+    return xnn_status_uninitialized;
+  }
+
+  if (op == NULL) {
+    return xnn_status_invalid_parameter;
+  }
+
+  xnn_release_memory(op->indirection_buffer);
+  if (op->weights_cache == NULL) {
+    xnn_release_simd_memory(op->packed_weights.pointer);
+  }
+  xnn_release_simd_memory(op->zero_buffer);
+  if (op->zero_buffers) {
+    for (size_t i = 1; i < op->batch_size; ++i) {
+      xnn_release_simd_memory(op->zero_buffers[i]);
+    }
+    xnn_release_memory(op->zero_buffers);
+  }
+  xnn_release_memory(op->pixelwise_buffer);
+  xnn_release_memory(op->subconvolution_buffer);
+  xnn_release_simd_memory(op->lookup_table);
+  return xnn_status_success;
+}
+
+enum xnn_operator_type xnn_unary_operator_to_operator_type(enum xnn_unary_operator op) {
+  switch (op) {
+    case xnn_unary_abs:
+      return xnn_operator_type_abs;
+    case xnn_unary_bankers_rounding:
+      return xnn_operator_type_bankers_rounding;
+    case xnn_unary_ceiling:
+      return xnn_operator_type_ceiling;
+    case xnn_unary_clamp:
+      return xnn_operator_type_clamp;
+    case xnn_unary_convert:
+      return xnn_operator_type_convert;
+    case xnn_unary_elu:
+      return xnn_operator_type_elu;
+    case xnn_unary_exp:
+      return xnn_operator_type_exp;
+    case xnn_unary_floor:
+      return xnn_operator_type_floor;
+    case xnn_unary_gelu:
+      return xnn_operator_type_gelu;
+    case xnn_unary_hardswish:
+      return xnn_operator_type_hardswish;
+    case xnn_unary_leaky_relu:
+      return xnn_operator_type_leaky_relu;
+    case xnn_unary_log:
+      return xnn_operator_type_log;
+    case xnn_unary_negate:
+      return xnn_operator_type_negate;
+    case xnn_unary_reciprocal_square_root:
+      return xnn_operator_type_reciprocal_square_root;
+    case xnn_unary_sigmoid:
+      return xnn_operator_type_sigmoid;
+    case xnn_unary_square:
+      return xnn_operator_type_square;
+    case xnn_unary_square_root:
+      return xnn_operator_type_square_root;
+    case xnn_unary_tanh:
+      return xnn_operator_type_tanh;
+    default:
+      return xnn_operator_type_invalid;
+  }
+}
+
+const char* xnn_unary_operator_to_string(enum xnn_unary_operator op) {
+  return xnn_operator_type_to_string(xnn_unary_operator_to_operator_type(op));
+}
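With the mapping centralized in operator-utils.c, any call site can turn a generic unary operator into a printable name instead of carrying its own switch. For example (a hypothetical call site, not one from this patch):

  xnn_log_error("failed to reshape %s operator",
                xnn_unary_operator_to_string(xnn_unary_tanh));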
+
 enum xnn_operator_type xnn_binary_operator_to_operator_type(enum xnn_binary_operator op) {
   switch (op) {
diff --git a/src/operators/lut-elementwise-nc.c b/src/operators/lut-elementwise-nc.c
deleted file mode 100644
index 9e9138454eba..000000000000
--- a/src/operators/lut-elementwise-nc.c
+++ /dev/null
@@ -1,559 +0,0 @@
-// Copyright 2021 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "xnnpack.h"
-#include "xnnpack/allocator.h"
-#include "xnnpack/common.h"
-#include "xnnpack/compute.h"
-#include "xnnpack/config-types.h"
-#include "xnnpack/config.h"
-#include "xnnpack/log.h"
-#include "xnnpack/operator-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/params.h"
-#include "pthreadpool.h"
-
-static bool is_continugous(xnn_operator_t lut_elementwise_op)
-{
-  const size_t channels = lut_elementwise_op->channels;
-  const size_t input_stride = lut_elementwise_op->input_pixel_stride;
-  const size_t output_stride = lut_elementwise_op->output_pixel_stride;
-  const size_t batch_size = lut_elementwise_op->batch_size;
-  return (((input_stride ^ channels) | (output_stride ^ channels)) == 0) || batch_size == 1;
-}
-
-typedef float (*xnn_lut_init_fn)(float, const void*);
-
-static enum xnn_status create_lut_elementwise_nc(
-    int32_t input_zero_point,
-    float input_scale,
-    int32_t input_min,
-    long output_zero_point,
-    float output_scale,
-    long output_min,
-    long output_max,
-    uint32_t flags,
-    xnn_lut_init_fn init_fn,
-    const void* init_params,
-    enum xnn_operator_type operator_type,
-    xnn_operator_t* lut_elementwise_op_out)
-{
-  xnn_operator_t lut_elementwise_op = NULL;
-  enum xnn_status status = xnn_status_uninitialized;
-
-  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
-    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(operator_type));
-    goto error;
-  }
-
-  status = xnn_status_invalid_parameter;
-
-  if (input_scale <= 0.0f || !isnormal(input_scale)) {
-    xnn_log_error(
-      "failed to create %s operator with %.7g input scale: scale must be finite, normalized, and positive",
-      xnn_operator_type_to_string(operator_type), input_scale);
-    goto error;
-  }
-
-  if (output_scale <= 0.0f || !isnormal(output_scale)) {
-    xnn_log_error(
-      "failed to create %s operator with %.7g output scale: scale must be finite, normalized, and positive",
-      xnn_operator_type_to_string(operator_type), output_scale);
-    goto error;
-  }
-
-  if (output_min > output_max) {
-    xnn_log_error(
-      "failed to create %s operator with [%ld, %ld] output range: range min must be less than or equal to range max",
-      xnn_operator_type_to_string(operator_type), output_min, output_max);
-    goto error;
-  }
-
-  const struct xnn_x8_lut_config* lut_config = xnn_init_x8_lut_config();
-  assert(lut_config != NULL);
-
-  status = xnn_status_out_of_memory;
-
-  lut_elementwise_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
-  if (lut_elementwise_op == NULL) {
-    xnn_log_error(
-      "failed to allocate %zu bytes for %s operator descriptor",
-      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
-    goto error;
-  }
-
-  lut_elementwise_op->lookup_table = xnn_allocate_simd_memory(256 * sizeof(uint8_t));
-  if (lut_elementwise_op->lookup_table == NULL) {
-    xnn_log_error(
-      "failed to allocate 256 bytes for %s operator lookup table",
-      xnn_operator_type_to_string(operator_type));
-    goto error;
-  }
-
-  uint8_t* lookup_table = lut_elementwise_op->lookup_table;
-  const float inv_output_scale = 1.0f / output_scale;
-  for (int32_t i = input_min; i < input_min + 256; i++) {
-    const float dequantized_input = (i - input_zero_point) * input_scale;
-    const float dequantized_output = init_fn(dequantized_input, init_params);
-    long quantized_output = lrintf(dequantized_output * inv_output_scale) + output_zero_point;
-    quantized_output = XNN_UNPREDICTABLE(quantized_output < output_min) ? output_min : quantized_output;
-    quantized_output = XNN_UNPREDICTABLE(quantized_output > output_max) ? output_max : quantized_output;
-    lookup_table[(uint8_t) i] = (uint8_t) quantized_output;
-  }
-
-  lut_elementwise_op->type = operator_type;
-  lut_elementwise_op->flags = flags;
-  lut_elementwise_op->lut_config = lut_config;
-
-  lut_elementwise_op->state = xnn_run_state_invalid;
-
-  *lut_elementwise_op_out = lut_elementwise_op;
-  return xnn_status_success;
-
-error:
-  xnn_delete_operator(lut_elementwise_op);
-  return status;
-}
-
-static float calculate_elu(float x, const float* alpha_ptr) {
-  const float alpha = *alpha_ptr;
-  return signbit(x) ? alpha * expm1f(x) : x;
-}
-
-enum xnn_status xnn_create_elu_nc_qs8(
-    float alpha,
-    int8_t input_zero_point,
-    float input_scale,
-    int8_t output_zero_point,
-    float output_scale,
-    int8_t output_min,
-    int8_t output_max,
-    uint32_t flags,
-    xnn_operator_t* elu_op_out)
-{
-  if (alpha <= 0.0f || !isnormal(alpha)) {
-    xnn_log_error(
-      "failed to create %s operator with %.7g alpha parameter: alpha must be finite, normalized, and positive",
-      xnn_operator_type_to_string(xnn_operator_type_elu_nc_qs8), alpha);
-    return xnn_status_invalid_parameter;
-  }
-
-  return create_lut_elementwise_nc(
-    (int32_t) input_zero_point, input_scale, INT8_MIN,
-    (long) output_zero_point, output_scale,
-    (long) output_min, (long) output_max,
-    flags,
-    (xnn_lut_init_fn) &calculate_elu, &alpha,
-    xnn_operator_type_elu_nc_qs8, elu_op_out);
-}
-
-static float calculate_sigmoid(float x, const void* params) {
-  return signbit(x) ? 1.0f / (1.0f + expf(-x)) : 1.0f - 1.0f / (1.0f + expf(x));
-}
-
-enum xnn_status xnn_create_sigmoid_nc_qs8(
-    int8_t input_zero_point,
-    float input_scale,
-    int8_t output_zero_point,
-    float output_scale,
-    int8_t output_min,
-    int8_t output_max,
-    uint32_t flags,
-    xnn_operator_t* sigmoid_op_out)
-{
-  if (output_scale != 0x1.0p-8f) {
-    xnn_log_error(
-      "failed to create %s operator with %.7g output scale: only output scale of 1/256 is supported",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_qs8), output_scale);
-    return xnn_status_unsupported_parameter;
-  }
-
-  if (output_zero_point != -128) {
-    xnn_log_error(
-      "failed to create %s operator with %" PRIu8 " output zero point: only output zero point of -128 is supported",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_qs8), output_zero_point);
-    return xnn_status_unsupported_parameter;
-  }
-
-  return create_lut_elementwise_nc(
-    (int32_t) input_zero_point, input_scale, INT8_MIN,
-    (long) output_zero_point, output_scale,
-    (long) output_min, (long) output_max,
-    flags,
-    (xnn_lut_init_fn) &calculate_sigmoid, NULL,
-    xnn_operator_type_sigmoid_nc_qs8, sigmoid_op_out);
-}
-
-enum xnn_status xnn_create_sigmoid_nc_qu8(
-    uint8_t input_zero_point,
-    float input_scale,
-    uint8_t output_zero_point,
-    float output_scale,
-    uint8_t output_min,
-    uint8_t output_max,
-    uint32_t flags,
-    xnn_operator_t* sigmoid_op_out)
-{
-  if (output_scale != 0x1.0p-8f) {
-    xnn_log_error(
-      "failed to create %s operator with %.7g output scale: only output scale of 1/256 is supported",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_qu8), output_scale);
-    return xnn_status_unsupported_parameter;
-  }
-
-  if (output_zero_point != 0) {
-    xnn_log_error(
-      "failed to create %s operator with %" PRIu8 " output zero point: only output zero point of 0 is supported",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_qu8), output_zero_point);
-    return xnn_status_unsupported_parameter;
-  }
-
-  return create_lut_elementwise_nc(
-    (int32_t) (uint32_t) input_zero_point, input_scale, 0 /* input min */,
-    (long) (unsigned long) output_zero_point, output_scale,
-    (long) (unsigned long) output_min, (long) (unsigned long) output_max,
-    flags,
-    (xnn_lut_init_fn) &calculate_sigmoid, NULL,
-    xnn_operator_type_sigmoid_nc_qu8, sigmoid_op_out);
-}
-
-static float calculate_tanh(float x, const void* params) {
-  return tanhf(x);
-}
-
-enum xnn_status xnn_create_tanh_nc_qs8(
-    int8_t input_zero_point,
-    float input_scale,
-    int8_t output_zero_point,
-    float output_scale,
-    int8_t output_min,
-    int8_t output_max,
-    uint32_t flags,
-    xnn_operator_t* tanh_op_out)
-{
-  if (output_scale != 0x1.0p-7f) {
-    xnn_log_error(
-      "failed to create %s operator with %.7g output scale: only output scale of 1/128 is supported",
-      xnn_operator_type_to_string(xnn_operator_type_tanh_nc_qs8), output_scale);
-    return xnn_status_unsupported_parameter;
-  }
-
-  if (output_zero_point != 0) {
-    xnn_log_error(
-      "failed to create %s operator with %" PRIu8 " output zero point: only output zero point of 0 is supported",
-      xnn_operator_type_to_string(xnn_operator_type_tanh_nc_qs8), output_zero_point);
-    return xnn_status_unsupported_parameter;
-  }
-
-  return create_lut_elementwise_nc(
-    (int32_t) input_zero_point, input_scale, INT8_MIN,
-    (long) output_zero_point, output_scale,
-    (long) output_min, (long) output_max,
-    flags,
-    (xnn_lut_init_fn) &calculate_tanh, NULL,
-    xnn_operator_type_tanh_nc_qs8, tanh_op_out);
-}
-
-enum xnn_status xnn_create_tanh_nc_qu8(
-    uint8_t input_zero_point,
-    float input_scale,
-    uint8_t output_zero_point,
-    float output_scale,
-    uint8_t output_min,
-    uint8_t output_max,
-    uint32_t flags,
-    xnn_operator_t* tanh_op_out)
-{
-  if (output_scale != 0x1.0p-7f) {
-    xnn_log_error(
-      "failed to create %s operator with %.7g output scale: only output scale of 1/128 is supported",
-      xnn_operator_type_to_string(xnn_operator_type_tanh_nc_qu8), output_scale);
-    return xnn_status_unsupported_parameter;
-  }
-
-  if (output_zero_point != 128) {
-    xnn_log_error(
-      "failed to create %s operator with %" PRIu8 " output zero point: only output zero point of 128 is supported",
-      xnn_operator_type_to_string(xnn_operator_type_tanh_nc_qu8), output_zero_point);
-    return xnn_status_unsupported_parameter;
-  }
-
-  return create_lut_elementwise_nc(
-    (int32_t) (uint32_t) input_zero_point, input_scale, 0 /* input min */,
-    (long) (unsigned long) output_zero_point, output_scale,
-    (long) (unsigned long) output_min, (long) (unsigned long) output_max,
-    flags,
-    (xnn_lut_init_fn) &calculate_tanh, NULL,
-    xnn_operator_type_tanh_nc_qu8, tanh_op_out);
-}
-
-static enum xnn_status reshape_lut_elementwise_nc(
-    xnn_operator_t lut_elementwise_op,
-    enum xnn_operator_type expected_operator_type,
-    size_t batch_size,
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    pthreadpool_t threadpool)
-{
-  if (lut_elementwise_op->type != expected_operator_type) {
-    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(expected_operator_type),
-      xnn_operator_type_to_string(lut_elementwise_op->type));
-    return xnn_status_invalid_parameter;
-  }
-  if (channels == 0) {
-    xnn_log_error(
-      "failed to create %s operator with %zu channels: number of channels must be non-zero",
-      xnn_operator_type_to_string(lut_elementwise_op->type), channels);
-    return xnn_status_invalid_parameter;
-  }
-
-  if (input_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with input element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(lut_elementwise_op->type), input_stride, channels);
-    return xnn_status_invalid_parameter;
-  }
-
-  if (output_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with output element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(lut_elementwise_op->type), output_stride, channels);
-    return xnn_status_invalid_parameter;
-  }
-
-  lut_elementwise_op->state = xnn_run_state_invalid;
-
-  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
-    xnn_log_error(
-      "failed to setup %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(expected_operator_type));
-    return xnn_status_uninitialized;
-  }
-
-  if (batch_size == 0) {
-    lut_elementwise_op->state = xnn_run_state_skip;
-    return xnn_status_success;
-  }
-
-  lut_elementwise_op->batch_size = batch_size;
-
-  const struct xnn_x8_lut_config* lut_config = lut_elementwise_op->lut_config;
-
-  lut_elementwise_op->channels = channels;
-  lut_elementwise_op->input_pixel_stride = input_stride;
-  lut_elementwise_op->output_pixel_stride = output_stride;
-
-  if (is_continugous(lut_elementwise_op)) {
-    lut_elementwise_op->context.lut_contiguous = (struct lut_contiguous_context) {
-      .x_stride = input_stride * sizeof(uint8_t),
-      .t = lut_elementwise_op->lookup_table,
-      .y_stride = output_stride * sizeof(uint8_t),
-      .ukernel = lut_config->microkernel,
-    };
-
-    const size_t range = batch_size * channels * sizeof(uint8_t);
-    size_t tile = range;
-    if (pthreadpool_get_threads_count(threadpool) > 1) {
-      const size_t block_size = 1024;
-      tile = block_size * sizeof(uint8_t);
-    }
-
-    lut_elementwise_op->compute[0].type = xnn_parallelization_type_1d_tile_1d;
-    lut_elementwise_op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_lut_contiguous;
-    lut_elementwise_op->compute[0].range[0] = range;
-    lut_elementwise_op->compute[0].tile[0] = tile;
-  } else {
-    lut_elementwise_op->context.lut_strided = (struct lut_strided_context) {
-      .n = channels * sizeof(uint8_t),
-      .x_stride = input_stride * sizeof(uint8_t),
-      .t = lut_elementwise_op->lookup_table,
-      .y_stride = output_stride * sizeof(uint8_t),
-      .ukernel = lut_config->microkernel,
-    };
-    lut_elementwise_op->compute[0].type = xnn_parallelization_type_1d;
-    lut_elementwise_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_lut_strided;
-    lut_elementwise_op->compute[0].range[0] = batch_size;
-  }
-  lut_elementwise_op->state = xnn_run_state_needs_setup;
-
-  return xnn_status_success;
-}
-
-enum xnn_status xnn_reshape_elu_nc_qs8(
-    xnn_operator_t elu_op,
-    size_t batch_size,
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    pthreadpool_t threadpool)
-{
-  return reshape_lut_elementwise_nc(
-    elu_op, xnn_operator_type_elu_nc_qs8,
-    batch_size,
-    channels, input_stride, output_stride,
-    threadpool);
-}
-
-enum xnn_status xnn_reshape_sigmoid_nc_qs8(
-    xnn_operator_t sigmoid_op,
-    size_t batch_size,
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    pthreadpool_t threadpool)
-{
-  return reshape_lut_elementwise_nc(
-    sigmoid_op, xnn_operator_type_sigmoid_nc_qs8,
-    batch_size,
-    channels, input_stride, output_stride,
-    threadpool);
-}
-
-enum xnn_status xnn_reshape_sigmoid_nc_qu8(
-    xnn_operator_t sigmoid_op,
-    size_t batch_size,
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    pthreadpool_t threadpool)
-{
-  return reshape_lut_elementwise_nc(
-    sigmoid_op, xnn_operator_type_sigmoid_nc_qu8,
-    batch_size,
-    channels, input_stride, output_stride,
-    threadpool);
-}
-
-enum xnn_status xnn_reshape_tanh_nc_qs8(
-    xnn_operator_t tanh_op,
-    size_t batch_size,
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    pthreadpool_t threadpool)
-{
-  return reshape_lut_elementwise_nc(
-    tanh_op, xnn_operator_type_tanh_nc_qs8,
-    batch_size,
-    channels, input_stride, output_stride,
-    threadpool);
-}
-
-enum xnn_status xnn_reshape_tanh_nc_qu8(
-    xnn_operator_t tanh_op,
-    size_t batch_size,
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    pthreadpool_t threadpool)
-{
-  return reshape_lut_elementwise_nc(
-    tanh_op, xnn_operator_type_tanh_nc_qu8,
-    batch_size,
-    channels, input_stride, output_stride,
-    threadpool);
-}
-
-static enum xnn_status setup_lut_elementwise_nc(
-    xnn_operator_t lut_elementwise_op,
-    enum xnn_operator_type expected_operator_type,
-    const void* input,
-    void* output)
-{
-  if (lut_elementwise_op->type != expected_operator_type) {
-    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(expected_operator_type),
-      xnn_operator_type_to_string(lut_elementwise_op->type));
-    return xnn_status_invalid_parameter;
-  }
-
-  switch (lut_elementwise_op->state) {
-    case xnn_run_state_skip:
-      return xnn_status_success;
-    case xnn_run_state_invalid:
-      xnn_log_error(
-        "failed to setup %s operator: operator has not been reshaped yet",
-        xnn_operator_type_to_string(lut_elementwise_op->type));
-      return xnn_status_invalid_state;
-    case xnn_run_state_needs_setup:
-      // Operator has been reshaped, but not setup, continue with setup.
-    case xnn_run_state_ready:
-      // Operator has been reshaped, and we are setting up with different pointers.
-      break;
-  }
-
-  if (is_continugous(lut_elementwise_op)) {
-    lut_elementwise_op->context.lut_contiguous.x = input;
-    lut_elementwise_op->context.lut_contiguous.y = output;
-  } else {
-    lut_elementwise_op->context.lut_strided.x = input;
-    lut_elementwise_op->context.lut_strided.y = output;
-  }
-  lut_elementwise_op->state = xnn_run_state_ready;
-
-  return xnn_status_success;
-}
-
-enum xnn_status xnn_setup_elu_nc_qs8(
-    xnn_operator_t elu_op,
-    const int8_t* input,
-    int8_t* output)
-{
-  return setup_lut_elementwise_nc(
-    elu_op, xnn_operator_type_elu_nc_qs8,
-    input, output);
-}
-
-enum xnn_status xnn_setup_sigmoid_nc_qs8(
-    xnn_operator_t sigmoid_op,
-    const int8_t* input,
-    int8_t* output)
-{
-  return setup_lut_elementwise_nc(
-    sigmoid_op, xnn_operator_type_sigmoid_nc_qs8,
-    input, output);
-}
-
-enum xnn_status xnn_setup_sigmoid_nc_qu8(
-    xnn_operator_t sigmoid_op,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  return setup_lut_elementwise_nc(
-    sigmoid_op, xnn_operator_type_sigmoid_nc_qu8,
-    input, output);
-}
-
-enum xnn_status xnn_setup_tanh_nc_qs8(
-    xnn_operator_t tanh_op,
-    const int8_t* input,
-    int8_t* output)
-{
-  return setup_lut_elementwise_nc(
-    tanh_op, xnn_operator_type_tanh_nc_qs8,
-    input, output);
-}
-
-enum xnn_status xnn_setup_tanh_nc_qu8(
-    xnn_operator_t tanh_op,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  return setup_lut_elementwise_nc(
-    tanh_op, xnn_operator_type_tanh_nc_qu8,
-    input, output);
-}
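The dedicated LUT operator file is gone, but the technique survives: init_lut_op in unary-elementwise-nc.c (further below) still bakes quantized elu/sigmoid/tanh into a 256-entry table, and the x8 LUT microkernels reduce the whole operator to one gather per element. A sketch of that inner loop (example_lut_u8 is a hypothetical name, not the library's microkernel):

  // The table bakes in dequantize -> f(x) -> requantize, so no arithmetic
  // remains per element at run time.
  static void example_lut_u8(size_t n, const uint8_t* x, uint8_t* y, const uint8_t lut[256]) {
    for (size_t i = 0; i < n; i++) {
      y[i] = lut[x[i]];
    }
  }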
diff --git a/src/operators/scaled-dot-product-attention-nhtc.c b/src/operators/scaled-dot-product-attention-nhtc.c
index a278c7c044f5..b1ae6ea6bb12 100644
--- a/src/operators/scaled-dot-product-attention-nhtc.c
+++ b/src/operators/scaled-dot-product-attention-nhtc.c
@@ -184,9 +184,9 @@ enum xnn_status xnn_create_scaled_dot_product_attention_nhtc_f16(
     goto error;
   }

-  union xnn_f16_tanh_params tanh_params;
-  if XNN_LIKELY(vtanh_config->init.f16_tanh != NULL) {
-    vtanh_config->init.f16_tanh(&tanh_params);
+  union xnn_unary_uparams tanh_params;
+  if XNN_LIKELY(vtanh_config->init != NULL) {
+    vtanh_config->init(&tanh_params, NULL, NULL, NULL);
   }

   status = xnn_status_invalid_parameter;
@@ -294,9 +294,9 @@ enum xnn_status xnn_create_scaled_dot_product_attention_nhtc_f32(
     goto error;
   }

-  union xnn_f32_tanh_params tanh_params;
-  if XNN_LIKELY(vtanh_config->init.f32_tanh != NULL) {
-    vtanh_config->init.f32_tanh(&tanh_params);
+  union xnn_unary_uparams tanh_params;
+  if XNN_LIKELY(vtanh_config->init != NULL) {
+    vtanh_config->init(&tanh_params, NULL, NULL, NULL);
   }

   status = xnn_status_invalid_parameter;
@@ -655,7 +655,7 @@ enum xnn_status xnn_reshape_scaled_dot_product_attention_nhtc_f16(
     &attention_op->params.f16_minmax, sizeof(attention_op->params.f16_minmax),
     &attention_op->params2.f16_expminus_params, sizeof(attention_op->params2.f16_expminus_params),
     &attention_op->params3.f16_rmax, sizeof(attention_op->params3.f16_rmax),
-    &attention_op->params4.f16_tanh, sizeof(attention_op->params4.f16_tanh),
+    &attention_op->params4.unary, sizeof(attention_op->params4.unary),
     threadpool);
 }

@@ -693,7 +693,7 @@ enum xnn_status xnn_reshape_scaled_dot_product_attention_nhtc_f32(
     &attention_op->params.f32_minmax, sizeof(attention_op->params.f32_minmax),
     &attention_op->params2.f32_expminus_params, sizeof(attention_op->params2.f32_expminus_params),
     &attention_op->params3.f32_rmax, sizeof(attention_op->params3.f32_rmax),
-    &attention_op->params4.f32_tanh, sizeof(attention_op->params4.f32_tanh),
+    &attention_op->params4.unary, sizeof(attention_op->params4.unary),
     threadpool);
 }
diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c
index a0d9764227b9..884eadb2669c 100644
--- a/src/operators/unary-elementwise-nc.c
+++ b/src/operators/unary-elementwise-nc.c
@@ -4,7 +4,6 @@
 // LICENSE file in the root directory of this source tree.

 #include
-#include
 #include
 #include
 #include
@@ -21,11 +20,36 @@
 #include "xnnpack/math.h"
 #include "xnnpack/microfnptr.h"
 #include "xnnpack/microparams.h"
+#include "xnnpack/node-type.h"
 #include "xnnpack/operator-type.h"
+#include "xnnpack/operator-utils.h"
 #include "xnnpack/operator.h"
 #include "xnnpack/params.h"
 #include "pthreadpool.h"

+static uint32_t xnn_datatype_get_log2_element_size(enum xnn_datatype datatype) {
+  switch (datatype) {
+    case xnn_datatype_qcint4:
+    case xnn_datatype_qbint4:
+    case xnn_datatype_qdint8:
+    case xnn_datatype_qint8:
+    case xnn_datatype_quint8:
+    case xnn_datatype_qcint8:
+    case xnn_datatype_qpint8:
+      return 0;
+    case xnn_datatype_fp16:
+      return 1;
+    case xnn_datatype_qint32:
+    case xnn_datatype_qcint32:
+    case xnn_datatype_int32:
+    case xnn_datatype_fp32:
+      return 2;
+    case xnn_datatype_invalid:
+    default:
+      XNN_UNREACHABLE;
+  }
+}
+
 static xnn_status_t check_op_type(xnn_operator_t op,
                                   enum xnn_operator_type expected_type) {
   if (op->type != expected_type) {
@@ -39,25 +63,545 @@
   return xnn_status_success;
 }

+typedef float (*xnn_lut_init_fn)(float, const union xnn_unary_params*);
+
+static float calculate_elu(float x, const union xnn_unary_params* params) {
+  return signbit(x) ? params->elu.alpha * expm1f(x) : x;
+}
+
+static float calculate_sigmoid(float x, const union xnn_unary_params* params) {
+  return signbit(x) ? 1.0f / (1.0f + expf(-x)) : 1.0f - 1.0f / (1.0f + expf(x));
+}
+
+static float calculate_tanh(float x, const union xnn_unary_params* params) {
+  return tanhf(x);
+}
+
+static enum xnn_status init_lut_op(
+    xnn_operator_t op,
+    xnn_lut_init_fn init_fn,
+    int32_t min,
+    int32_t max,
+    const union xnn_unary_params* params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization) {
+  op->lookup_table = xnn_allocate_simd_memory(256 * sizeof(uint8_t));
+  if (op->lookup_table == NULL) {
+    xnn_log_error(
+      "failed to allocate 256 bytes for %s operator lookup table",
+      xnn_operator_type_to_string(op->type));
+    return xnn_status_out_of_memory;
+  }
+
+  uint8_t* lookup_table = op->lookup_table;
+  const float inv_output_scale = 1.0f / output_quantization->scale;
+  for (int32_t i = min; i < min + 256; i++) {
+    const float dequantized_input = (i - input_quantization->zero_point) * input_quantization->scale;
+    const float dequantized_output = init_fn(dequantized_input, params);
+    long quantized_output = lrintf(dequantized_output * inv_output_scale) + output_quantization->zero_point;
+    lookup_table[(uint8_t) i] = (uint8_t) math_min_s32(max, math_max_s32(min, quantized_output));
+  }
+
+  const struct xnn_x8_lut_config* lut_config = xnn_init_x8_lut_config();
+  op->lut_config = lut_config;
+
+  op->state = xnn_run_state_invalid;
+
+  return xnn_status_success;
+}
+
+static enum xnn_status init_op_config(xnn_operator_t op, const struct xnn_unary_elementwise_config* config, const union xnn_unary_params* params,
+    const struct xnn_quantization_params* input_quantization,
+    const struct xnn_quantization_params* output_quantization) {
+  if (!config) {
+    xnn_log_error(
+      "failed to create config for operator %s", xnn_operator_type_to_string(op->type));
+    return xnn_status_unsupported_parameter;
+  }
+  op->unary_elementwise_config = config;
+  op->state = xnn_run_state_invalid;
+
+  if (config->init != NULL) {
+    config->init(&op->params.unary, params, input_quantization, output_quantization);
+  }
+  return xnn_status_success;
+}
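init_op (below) records element sizes as log2 values, so reshape and setup code can turn element counts into byte offsets with shifts. For an fp16 -> fp32 convert, for instance:

  // xnn_datatype_get_log2_element_size: fp16 -> 1 (2 bytes), fp32 -> 2 (4 bytes)
  // A batch of n elements then occupies:
  //   n << op->log2_elementwise_input_size   bytes of input  (n * 2)
  //   n << op->log2_elementwise_output_size  bytes of output (n * 4)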
input_quantization, output_quantization); + } else if (input_datatype == xnn_datatype_fp16 && output_datatype == xnn_datatype_qint8) { + return init_op_config(op, xnn_init_f16_to_qs8_cvt_config(), params, input_quantization, output_quantization); + } else if (input_datatype == xnn_datatype_qint8 && output_datatype == xnn_datatype_fp16) { + return init_op_config(op, xnn_init_qs8_to_f16_cvt_config(), params, input_quantization, output_quantization); + } else if (input_datatype == xnn_datatype_qint8 && output_datatype == xnn_datatype_fp32) { + return init_op_config(op, xnn_init_qs8_to_f32_cvt_config(), params, input_quantization, output_quantization); + //} else if (input_datatype == xnn_datatype_qint16 && output_datatype == xnn_datatype_qint8) { + // return init_op_config(op, xnn_init_qs16_to_qs8_cvt_config(), params, input_quantization, output_quantization); + } else if (input_datatype == xnn_datatype_quint8 && output_datatype == xnn_datatype_fp32) { + return init_op_config(op, xnn_init_qu8_to_f32_cvt_config(), params, input_quantization, output_quantization); + } + } + xnn_log_error( + "failed to create unsupported operator %s for datatypes %s -> %s", + xnn_unary_operator_to_string(op_type), xnn_datatype_to_string(input_datatype), xnn_datatype_to_string(output_datatype)); + return xnn_status_unsupported_parameter; + } + enum xnn_datatype datatype = output_datatype; + if (datatype == xnn_datatype_qint8) { + switch (op_type) { + //case xnn_unary_abs: + // return init_op_config(op, xnn_init_qs8_abs_config(), params, input_quantization, output_quantization); + //case xnn_unary_bankers_rounding: + // return init_op_config(op, xnn_init_qs8_rndne_config(), params, input_quantization, output_quantization); + //case xnn_unary_ceiling: + // return init_op_config(op, xnn_init_qs8_rndu_config(), params, input_quantization, output_quantization); + case xnn_unary_clamp: + if (input_quantization->scale != output_quantization->scale || + input_quantization->zero_point != output_quantization->zero_point) { + xnn_log_error("failed to create unsupported operator clamp for datatype QINT8: quantization parameters differ"); + return xnn_status_unsupported_parameter; + } + return init_op_config(op, xnn_init_s8_clamp_config(), params, input_quantization, output_quantization); + case xnn_unary_exp: + break; + //case xnn_unary_floor: + // return init_op_config(op, xnn_init_qs8_rndd_config(), params, input_quantization, output_quantization); + case xnn_unary_gelu: + break; + //case xnn_unary_hardswish: + // return init_op_config(op, xnn_init_qs8_hswish_config(), params, input_quantization, output_quantization); + case xnn_unary_leaky_relu: + return init_op_config(op, xnn_init_qs8_lrelu_config(), params, input_quantization, output_quantization); + case xnn_unary_log: + break; + //case xnn_unary_negate: + // return init_op_config(op, xnn_init_qs8_neg_config(), params, input_quantization, output_quantization); + //case xnn_unary_reciprocal_square_root: + // return init_op_config(op, xnn_init_qs8_rsqrt_config(), params, input_quantization, output_quantization); + //case xnn_unary_square_root: + // return init_op_config(op, xnn_init_qs8_sqrt_config(), params, input_quantization, output_quantization); + //case xnn_unary_square: + // return init_op_config(op, xnn_init_qs8_sqr_config(), params, input_quantization, output_quantization); + case xnn_unary_convert: + return init_op_config(op, xnn_init_qs8_cvt_config(), params, input_quantization, output_quantization); + case xnn_unary_elu: + return init_lut_op(op, 
calculate_elu, INT8_MIN, INT8_MAX,params, input_quantization, output_quantization); + case xnn_unary_sigmoid: + return init_lut_op(op, calculate_sigmoid, INT8_MIN, INT8_MAX,params, input_quantization, output_quantization); + case xnn_unary_tanh: + return init_lut_op(op, calculate_tanh, INT8_MIN, INT8_MAX, params, input_quantization, output_quantization); + default: break; + } + } else if (datatype == xnn_datatype_quint8) { + switch (op_type) { + //case xnn_unary_abs: + // return init_op_config(op, xnn_init_qu8_abs_config(), params, input_quantization, output_quantization); + //case xnn_unary_bankers_rounding: + // return init_op_config(op, xnn_init_qu8_rndne_config(), params, input_quantization, output_quantization); + //case xnn_unary_ceiling: + // return init_op_config(op, xnn_init_qu8_rndu_config(), params, input_quantization, output_quantization); + case xnn_unary_clamp: + if (input_quantization->scale != output_quantization->scale || + input_quantization->zero_point != output_quantization->zero_point) { + xnn_log_error("failed to create unsupported operator clamp for datatype QUINT8: quantization parameters differ"); + return xnn_status_unsupported_parameter; + } + return init_op_config(op, xnn_init_u8_clamp_config(), params, input_quantization, output_quantization); + case xnn_unary_exp: + break; + //case xnn_unary_floor: + // return init_op_config(op, xnn_init_qu8_rndd_config(), params, input_quantization, output_quantization); + case xnn_unary_gelu: + break; + //case xnn_unary_hardswish: + // return init_op_config(op, xnn_init_qu8_hswish_config(), params, input_quantization, output_quantization); + case xnn_unary_leaky_relu: + return init_op_config(op, xnn_init_qu8_lrelu_config(), params, input_quantization, output_quantization); + case xnn_unary_log: + break; + //case xnn_unary_negate: + // return init_op_config(op, xnn_init_qu8_neg_config(), params, input_quantization, output_quantization); + //case xnn_unary_reciprocal_square_root: + // return init_op_config(op, xnn_init_qu8_rsqrt_config(), params, input_quantization, output_quantization); + //case xnn_unary_square_root: + // return init_op_config(op, xnn_init_qu8_sqrt_config(), params, input_quantization, output_quantization); + //case xnn_unary_square: + // return init_op_config(op, xnn_init_qu8_sqr_config(), params, input_quantization, output_quantization); + case xnn_unary_convert: + return init_op_config(op, xnn_init_qu8_cvt_config(), params, input_quantization, output_quantization); + case xnn_unary_elu: + return init_lut_op(op, calculate_elu, 0, UINT8_MAX, params, input_quantization, output_quantization); + case xnn_unary_sigmoid: + return init_lut_op(op, calculate_sigmoid, 0, UINT8_MAX, params, input_quantization, output_quantization); + case xnn_unary_tanh: + return init_lut_op(op, calculate_tanh, 0, UINT8_MAX, params, input_quantization, output_quantization); + default: break; + } + } else if (datatype == xnn_datatype_fp16) { + switch (op_type) { + case xnn_unary_abs: + return init_op_config(op, xnn_init_f16_abs_config(), params, input_quantization, output_quantization); + case xnn_unary_bankers_rounding: + return init_op_config(op, xnn_init_f16_rndne_config(), params, input_quantization, output_quantization); + case xnn_unary_ceiling: + return init_op_config(op, xnn_init_f16_rndu_config(), params, input_quantization, output_quantization); + case xnn_unary_clamp: + return init_op_config(op, xnn_init_f16_clamp_config(), params, input_quantization, output_quantization); + case xnn_unary_elu: + return init_op_config(op, 
xnn_init_f16_elu_config(), params, input_quantization, output_quantization); + case xnn_unary_exp: + break; + case xnn_unary_floor: + return init_op_config(op, xnn_init_f16_rndd_config(), params, input_quantization, output_quantization); + case xnn_unary_gelu: + break; + case xnn_unary_hardswish: + return init_op_config(op, xnn_init_f16_hswish_config(), params, input_quantization, output_quantization); + case xnn_unary_leaky_relu: + return init_op_config(op, xnn_init_f16_lrelu_config(), params, input_quantization, output_quantization); + case xnn_unary_log: + break; + case xnn_unary_negate: + return init_op_config(op, xnn_init_f16_neg_config(), params, input_quantization, output_quantization); + case xnn_unary_reciprocal_square_root: + return init_op_config(op, xnn_init_f16_rsqrt_config(), params, input_quantization, output_quantization); + case xnn_unary_sigmoid: + return init_op_config(op, xnn_init_f16_sigmoid_config(), params, input_quantization, output_quantization); + case xnn_unary_square_root: + return init_op_config(op, xnn_init_f16_sqrt_config(), params, input_quantization, output_quantization); + case xnn_unary_square: + return init_op_config(op, xnn_init_f16_sqr_config(), params, input_quantization, output_quantization); + case xnn_unary_tanh: + return init_op_config(op, xnn_init_f16_tanh_config(), params, input_quantization, output_quantization); + default: break; + } + } else if (datatype == xnn_datatype_fp32) { + switch (op_type) { + case xnn_unary_abs: + return init_op_config(op, xnn_init_f32_abs_config(), params, input_quantization, output_quantization); + case xnn_unary_bankers_rounding: + return init_op_config(op, xnn_init_f32_rndne_config(), params, input_quantization, output_quantization); + case xnn_unary_ceiling: + return init_op_config(op, xnn_init_f32_rndu_config(), params, input_quantization, output_quantization); + case xnn_unary_clamp: + return init_op_config(op, xnn_init_f32_clamp_config(), params, input_quantization, output_quantization); + case xnn_unary_elu: + return init_op_config(op, xnn_init_f32_elu_config(), params, input_quantization, output_quantization); + case xnn_unary_exp: + return init_op_config(op, xnn_init_f32_exp_config(), params, input_quantization, output_quantization); + case xnn_unary_floor: + return init_op_config(op, xnn_init_f32_rndd_config(), params, input_quantization, output_quantization); + case xnn_unary_gelu: + return init_op_config(op, xnn_init_f32_gelu_config(), params, input_quantization, output_quantization); + case xnn_unary_hardswish: + return init_op_config(op, xnn_init_f32_hswish_config(), params, input_quantization, output_quantization); + case xnn_unary_leaky_relu: + return init_op_config(op, xnn_init_f32_lrelu_config(), params, input_quantization, output_quantization); + case xnn_unary_log: + return init_op_config(op, xnn_init_f32_log_config(), params, input_quantization, output_quantization); + case xnn_unary_negate: + return init_op_config(op, xnn_init_f32_neg_config(), params, input_quantization, output_quantization); + case xnn_unary_reciprocal_square_root: + return init_op_config(op, xnn_init_f32_rsqrt_config(), params, input_quantization, output_quantization); + case xnn_unary_sigmoid: + return init_op_config(op, xnn_init_f32_sigmoid_config(), params, input_quantization, output_quantization); + case xnn_unary_square_root: + return init_op_config(op, xnn_init_f32_sqrt_config(), params, input_quantization, output_quantization); + case xnn_unary_square: + return init_op_config(op, xnn_init_f32_sqr_config(), params, 
input_quantization, output_quantization); + case xnn_unary_tanh: + return init_op_config(op, xnn_init_f32_tanh_config(), params, input_quantization, output_quantization); + default: break; + } + } + xnn_log_error( + "failed to create unsupported operator %s for datatype %s", + xnn_unary_operator_to_string(op_type), xnn_datatype_to_string(datatype)); + return xnn_status_unsupported_parameter; +} + +enum xnn_status xnn_create_unary_elementwise_nc( + enum xnn_unary_operator op_type, + enum xnn_datatype input_datatype, + enum xnn_datatype output_datatype, + const union xnn_unary_params* params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization, + uint32_t flags, + xnn_operator_t* op_out) { + if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { + xnn_log_error("failed to create %s operator: XNNPACK is not initialized", + xnn_unary_operator_to_string(op_type)); + return xnn_status_uninitialized; + } + + xnn_operator_t op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator)); + if (op == NULL) { + xnn_log_error( + "failed to allocate %zu bytes for %s operator descriptor", + sizeof(struct xnn_operator), xnn_unary_operator_to_string(op_type)); + return xnn_status_out_of_memory; + } + + enum xnn_status status = init_op(op, op_type, input_datatype, output_datatype, params, input_quantization, output_quantization,flags); + if (status != xnn_status_success) { + xnn_delete_operator(op); + return status; + } + + *op_out = op; + return xnn_status_success; +} + +static bool is_contiguous(xnn_operator_t op) +{ + const size_t channels = op->channels; + const size_t input_stride = op->input_pixel_stride; + const size_t output_stride = op->output_pixel_stride; + const size_t batch_size = op->batch_size; + return (((input_stride ^ channels) | (output_stride ^ channels)) == 0) || batch_size == 1; +} + +enum xnn_status xnn_reshape_unary_elementwise_nc( + xnn_operator_t op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool) { + op->state = xnn_run_state_invalid; + + if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { + xnn_log_error( + "failed to setup %s operator: XNNPACK is not initialized", + xnn_operator_type_to_string(op->type)); + return xnn_status_uninitialized; + } + + if (batch_size == 0 || channels == 0) { + op->state = xnn_run_state_skip; + return xnn_status_success; + } + + if (input_stride < channels) { + xnn_log_error( + "failed to create %s operator with input element stride of %zu: " + "stride must be at least as large as the number of channels (%zu)", + xnn_operator_type_to_string(op->type), input_stride, channels); + return xnn_status_invalid_parameter; + } + + if (output_stride < channels) { + xnn_log_error( + "failed to create %s operator with output element stride of %zu: " + "stride must be at least as large as the number of channels (%zu)", + xnn_operator_type_to_string(op->type), output_stride, channels); + return xnn_status_invalid_parameter; + } + + op->batch_size = batch_size; + op->channels = channels; + op->input_pixel_stride = input_stride; + op->output_pixel_stride = output_stride; + + if (op->lookup_table) { + const struct xnn_x8_lut_config* lut_config = op->lut_config; + if (is_contiguous(op)) { + op->context.lut_contiguous = (struct lut_contiguous_context) { + .x_stride = input_stride * sizeof(uint8_t), + .t = op->lookup_table, + .y_stride = output_stride * sizeof(uint8_t), + .ukernel = lut_config->microkernel, + }; + + const 
size_t range = batch_size * channels * sizeof(uint8_t); + size_t tile = range; + if (pthreadpool_get_threads_count(threadpool) > 1) { + const size_t block_size = 1024; + tile = block_size * sizeof(uint8_t); + } + + op->compute[0].type = xnn_parallelization_type_1d_tile_1d; + op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_lut_contiguous; + op->compute[0].range[0] = range; + op->compute[0].tile[0] = tile; + } else { + op->context.lut_strided = (struct lut_strided_context) { + .n = channels * sizeof(uint8_t), + .x_stride = input_stride * sizeof(uint8_t), + .t = op->lookup_table, + .y_stride = output_stride * sizeof(uint8_t), + .ukernel = lut_config->microkernel, + }; + op->compute[0].type = xnn_parallelization_type_1d; + op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_lut_strided; + op->compute[0].range[0] = batch_size; + } + } else { + const xnn_vunary_ukernel_fn ukernel = op->unary_elementwise_config->ukernel; + const size_t num_threads = pthreadpool_get_threads_count(threadpool); + if (is_contiguous(op)) { + const size_t block_size = 4096; + + op->context.univector_contiguous = (struct univector_contiguous_context) { + .log2_xsize = op->log2_elementwise_input_size, + .log2_ysize = op->log2_elementwise_output_size, + .ukernel = ukernel, + }; + memcpy(&op->context.univector_contiguous.params, &op->params.unary, sizeof(op->params.unary)); + + const size_t range = (batch_size * channels) << op->log2_elementwise_input_size; + op->compute[0].type = xnn_parallelization_type_1d_tile_1d; + op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_contiguous; + op->compute[0].range[0] = range; + op->compute[0].tile[0] = (num_threads == 1) ? range : block_size; + } else { + op->context.univector_strided = (struct univector_strided_context) { + .n = channels << op->log2_elementwise_input_size, + .x_stride = input_stride << op->log2_elementwise_input_size, + .y_stride = output_stride << op->log2_elementwise_output_size, + .ukernel = ukernel, + }; + memcpy(&op->context.univector_strided.params, &op->params.unary, sizeof(op->params.unary)); + + op->compute[0].type = xnn_parallelization_type_1d_tile_1d; + op->compute[0].task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_strided; + op->compute[0].range[0] = batch_size; + op->compute[0].tile[0] = (num_threads == 1) ? batch_size : 1; + } + } + op->state = xnn_run_state_needs_setup; + return xnn_status_success; +} + +enum xnn_status xnn_setup_unary_elementwise_nc( + xnn_operator_t op, + const void* input, + void* output) { + switch (op->state) { + case xnn_run_state_skip: + return xnn_status_success; + case xnn_run_state_invalid: + xnn_log_error( + "failed to setup %s operator: operator has not been reshaped yet", + xnn_operator_type_to_string(op->type)); + return xnn_status_invalid_state; + case xnn_run_state_needs_setup: + // Operator has been reshaped, but not setup, continue with setup. + case xnn_run_state_ready: + // Operator has been reshaped, and we are setting up with different pointers. 
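// Example of driving the parameterized API end to end (illustrative sketch,
// not part of this change; assumes xnn_initialize() has already succeeded and
// that identity quantization parameters are acceptable for float32 operators):
//
//   union xnn_unary_params params = {0};  // abs takes no parameters
//   const struct xnn_quantization_params q = {.zero_point = 0, .scale = 1.0f};
//   xnn_operator_t op = NULL;
//   enum xnn_status status = xnn_create_unary_elementwise_nc(
//       xnn_unary_abs, xnn_datatype_fp32, xnn_datatype_fp32, &params,
//       &q, &q, /*flags=*/0, &op);
//   // Dense layout: strides equal to channels selects the contiguous path.
//   if (status == xnn_status_success) {
//     status = xnn_reshape_unary_elementwise_nc(
//         op, batch_size, channels, /*input_stride=*/channels,
//         /*output_stride=*/channels, threadpool);
//   }
//   if (status == xnn_status_success) {
//     status = xnn_setup_unary_elementwise_nc(op, input, output);
//   }
//   if (status == xnn_status_success) {
//     status = xnn_run_operator(op, threadpool);
//   }
//   xnn_delete_operator(op);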
+ break; + } + + if (op->lookup_table) { + if (is_contiguous(op)) { + op->context.lut_contiguous.x = input; + op->context.lut_contiguous.y = output; + } else { + op->context.lut_strided.x = input; + op->context.lut_strided.y = output; + } + } else { + if (is_contiguous(op)) { + op->context.univector_contiguous.x = input; + op->context.univector_contiguous.y = output; + } else { + op->context.univector_strided.x = input; + op->context.univector_strided.y = output; + } + } + op->state = xnn_run_state_ready; + + return xnn_status_success; +} + +enum xnn_status xnn_run_unary_elementwise_nc( + // create parameters + enum xnn_unary_operator op_type, + enum xnn_datatype input_datatype, + enum xnn_datatype output_datatype, + const union xnn_unary_params* params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization, + uint32_t flags, + // reshape parameters + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool, + // setup parameters + const void* input, + void* output) { + + if (batch_size == 0 || channels == 0) { + return xnn_status_success; + } + + struct xnn_operator op; + memset(&op, 0, sizeof(op)); + + enum xnn_status status = init_op(&op, op_type, input_datatype, output_datatype, params, input_quantization, output_quantization, flags); + if (status != xnn_status_success) { + xnn_destroy_operator(&op); + return status; + } + + status = xnn_reshape_unary_elementwise_nc(&op, batch_size, channels, input_stride, output_stride, threadpool); + if (status != xnn_status_success){ + xnn_destroy_operator(&op); + return status; + } + + status = xnn_setup_unary_elementwise_nc(&op, input, output); + if (status != xnn_status_success){ + xnn_destroy_operator(&op); + return status; + } + + status = xnn_run_operator(&op, threadpool); + xnn_destroy_operator(&op); + return status; +} + static void init_unary_elementwise_nc( uint32_t flags, const void* params, size_t params_size, enum xnn_operator_type operator_type, const struct xnn_unary_elementwise_config* unary_elementwise_config, - const struct xnn_reduce_config* rminmax_config, xnn_operator_t unary_elementwise_op) { assert(unary_elementwise_config != NULL); assert(unary_elementwise_config->ukernel != NULL); - assert(rminmax_config == NULL || rminmax_config->ukernel != NULL); if (params_size != 0) { memcpy(&unary_elementwise_op->params, params, params_size); } unary_elementwise_op->unary_elementwise_config = unary_elementwise_config; - unary_elementwise_op->rminmax_config = rminmax_config; unary_elementwise_op->type = operator_type; unary_elementwise_op->flags = flags; @@ -67,7 +611,6 @@ static void init_unary_elementwise_nc( static enum xnn_status create_unary_elementwise_nc( uint32_t flags, const struct xnn_unary_elementwise_config* unary_elementwise_config, - const struct xnn_reduce_config* rminmax_config, const void* params, size_t params_size, enum xnn_operator_type operator_type, @@ -98,7 +641,7 @@ static enum xnn_status create_unary_elementwise_nc( init_unary_elementwise_nc( flags, params, params_size, - operator_type, unary_elementwise_config, rminmax_config, unary_elementwise_op); + operator_type, unary_elementwise_config, unary_elementwise_op); *unary_elementwise_op_out = unary_elementwise_op; return xnn_status_success; @@ -251,3704 +794,550 @@ static enum xnn_status setup_unary_elementwise_nc( return xnn_status_success; } -enum xnn_status xnn_create_abs_nc_f16( - uint32_t flags, - xnn_operator_t* abs_op_out) +enum 
xnn_status xnn_create_convert_nc_f16_qd8( + uint32_t flags, + xnn_operator_t* convert_op_out) { - const struct xnn_unary_elementwise_config* f16_abs_config = xnn_init_f16_abs_config(); + const struct xnn_reduce_config* f16_rminmax_config = xnn_init_f16_rminmax_config(); + if (f16_rminmax_config == NULL) { + xnn_log_error( + "failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f16_qd8)); + return xnn_status_unsupported_hardware; + } struct xnn_f16_default_params params; - if XNN_LIKELY(f16_abs_config != NULL && f16_abs_config->init.f16_default != NULL) { - f16_abs_config->init.f16_default(¶ms); + if (f16_rminmax_config->init.f16_default != NULL) { + f16_rminmax_config->init.f16_default(¶ms); } - return create_unary_elementwise_nc( - flags, f16_abs_config, /*rminmax_config=*/NULL, + enum xnn_status status = create_unary_elementwise_nc( + flags, xnn_init_f16_to_qs8_cvt_config(), ¶ms, sizeof(params), - xnn_operator_type_abs_nc_f16, abs_op_out); + xnn_operator_type_convert_nc_f16_qd8, convert_op_out); + (*convert_op_out)->rminmax_config = f16_rminmax_config; + return status; } -enum xnn_status xnn_create_abs_nc_f32( - uint32_t flags, - xnn_operator_t* abs_op_out) +enum xnn_status xnn_create_convert_nc_f32_qd8( + uint32_t flags, + xnn_operator_t* convert_op_out) { - const struct xnn_unary_elementwise_config* f32_abs_config = xnn_init_f32_abs_config(); + const struct xnn_reduce_config* f32_rminmax_config = xnn_init_f32_rminmax_config(); + if (f32_rminmax_config == NULL) { + xnn_log_error( + "failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qd8)); + return xnn_status_unsupported_hardware; + } struct xnn_f32_default_params params; - if XNN_LIKELY(f32_abs_config != NULL && f32_abs_config->init.f32_default != NULL) { - f32_abs_config->init.f32_default(¶ms); + if (f32_rminmax_config->init.f32_default != NULL) { + f32_rminmax_config->init.f32_default(¶ms); } - return create_unary_elementwise_nc( - flags, f32_abs_config, /*rminmax_config=*/NULL, + enum xnn_status status = create_unary_elementwise_nc( + flags, xnn_init_f32_to_qs8_cvt_config(), ¶ms, sizeof(params), - xnn_operator_type_abs_nc_f32, abs_op_out); -} - -enum xnn_status xnn_create_bankers_rounding_nc_f16( - uint32_t flags, - xnn_operator_t* rounding_op_out) -{ - return create_unary_elementwise_nc( - flags, xnn_init_f16_rndne_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_bankers_rounding_nc_f16, rounding_op_out); + xnn_operator_type_convert_nc_f32_qd8, convert_op_out); + (*convert_op_out)->rminmax_config = f32_rminmax_config; + return status; } -enum xnn_status xnn_create_bankers_rounding_nc_f32( - uint32_t flags, - xnn_operator_t* rounding_op_out) -{ - const struct xnn_unary_elementwise_config* f32_rndne_config = xnn_init_f32_rndne_config(); +enum xnn_status xnn_create_convert_nc_f32_qp8(uint32_t flags, + xnn_operator_t* convert_op_out) { + const struct xnn_reduce_config* f32_rminmax_config = + xnn_init_f32_rminmax_config(); + if (f32_rminmax_config == NULL) { + xnn_log_error( + "failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qp8)); + return xnn_status_unsupported_hardware; + } - struct xnn_f32_rnd_params params; - if XNN_LIKELY(f32_rndne_config != NULL && f32_rndne_config->init.f32_rnd != NULL) { - f32_rndne_config->init.f32_rnd(¶ms); + struct 
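// Unlike the operators routed through init_op above, the qd8/qp8 converts
// keep dedicated creators: they pair the convert kernel with a separate
// rminmax reduce config (stored in op->rminmax_config) so input ranges can be
// computed per row at execution time.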
xnn_f32_default_params params; + if (f32_rminmax_config->init.f32_default != NULL) { + f32_rminmax_config->init.f32_default(¶ms); } - return create_unary_elementwise_nc( - flags, f32_rndne_config, /*rminmax_config=*/NULL, + enum xnn_status status = create_unary_elementwise_nc( + flags, xnn_init_f32_to_qp8_cvt_config(), ¶ms, sizeof(params), - xnn_operator_type_bankers_rounding_nc_f32, rounding_op_out); + xnn_operator_type_convert_nc_f32_qp8, convert_op_out); + (*convert_op_out)->rminmax_config = f32_rminmax_config; + return status; } -enum xnn_status xnn_create_ceiling_nc_f16( +enum xnn_status xnn_create_copy_nc_x8( uint32_t flags, - xnn_operator_t* ceiling_op_out) + xnn_operator_t* copy_op_out) { return create_unary_elementwise_nc( - flags, xnn_init_f16_rndu_config(), /*rminmax_config=*/NULL, + flags, xnn_init_xx_copy_config(), /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_ceiling_nc_f16, ceiling_op_out); + xnn_operator_type_copy_nc_x8, copy_op_out); } -enum xnn_status xnn_create_ceiling_nc_f32( +enum xnn_status xnn_create_copy_nc_x16( uint32_t flags, - xnn_operator_t* ceiling_op_out) + xnn_operator_t* copy_op_out) { - const struct xnn_unary_elementwise_config* f32_rndu_config = xnn_init_f32_rndu_config(); - - struct xnn_f32_rnd_params params; - if XNN_LIKELY(f32_rndu_config != NULL && f32_rndu_config->init.f32_rnd != NULL) { - f32_rndu_config->init.f32_rnd(¶ms); - } - return create_unary_elementwise_nc( - flags, f32_rndu_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_ceiling_nc_f32, ceiling_op_out); + flags, xnn_init_xx_copy_config(), + /*params=*/NULL, /*params_size=*/0, + xnn_operator_type_copy_nc_x16, copy_op_out); } -enum xnn_status xnn_create_clamp_nc_f16( - float output_min, - float output_max, +enum xnn_status xnn_create_copy_nc_x32( uint32_t flags, - xnn_operator_t* clamp_op_out) + xnn_operator_t* copy_op_out) { - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to create %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f16)); - return xnn_status_uninitialized; - } - - if (isnan(output_min)) { - xnn_log_error( - "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f16)); - return xnn_status_invalid_parameter; - } - - if (isnan(output_max)) { - xnn_log_error( - "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f16)); - return xnn_status_invalid_parameter; - } - - const xnn_float16 output_min_as_half = xnn_float16_from_float(output_min); - const xnn_float16 output_max_as_half = xnn_float16_from_float(output_max); - output_min = xnn_float16_to_float(output_min_as_half); - output_max = xnn_float16_to_float(output_max_as_half); - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f16), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* f16_clamp_config = xnn_init_f16_clamp_config(); - - union xnn_f16_minmax_params params; - if XNN_LIKELY(f16_clamp_config != NULL) { - assert(f16_clamp_config->init.f16_minmax != NULL); - f16_clamp_config->init.f16_minmax(¶ms, output_min_as_half, output_max_as_half); - } - return create_unary_elementwise_nc( - 
flags, f16_clamp_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_clamp_nc_f16, clamp_op_out); + flags, xnn_init_xx_copy_config(), + /*params=*/NULL, /*params_size=*/0, + xnn_operator_type_copy_nc_x32, copy_op_out); } -enum xnn_status xnn_create_clamp_nc_f32( - float output_min, - float output_max, - uint32_t flags, - xnn_operator_t* clamp_op_out) +enum xnn_status xnn_reshape_convert_nc_f16_qd8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool) { - if (isnan(output_min)) { - xnn_log_error( - "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32)); + if (convert_op->type != xnn_operator_type_convert_nc_f16_qd8) { + xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f16_qd8), + xnn_operator_type_to_string(convert_op->type)); return xnn_status_invalid_parameter; } + convert_op->state = xnn_run_state_invalid; - if (isnan(output_max)) { - xnn_log_error( - "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32)); - return xnn_status_invalid_parameter; + if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { + xnn_log_error("failed to setup %s operator: XNNPACK is not initialized", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f16_qd8)); + return xnn_status_uninitialized; } - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), output_min, output_max); - return xnn_status_invalid_parameter; + if (batch_size == 0) { + convert_op->state = xnn_run_state_skip; + return xnn_status_success; } - const struct xnn_unary_elementwise_config* f32_clamp_config = xnn_init_f32_clamp_config(); - const struct xnn_unary_elementwise_config* f32_relu_config = xnn_init_f32_relu_config(); + convert_op->batch_size = batch_size; - const bool relu_activation = (output_max == INFINITY) && (output_min == 0.0f); - const struct xnn_unary_elementwise_config* unary_elementwise_config = f32_clamp_config; - if (relu_activation && f32_relu_config != NULL && f32_relu_config->ukernel != NULL) { - unary_elementwise_config = f32_relu_config; - } + convert_op->context.f16_qd8_convert = (struct f16_qd8_convert_context) { + .n = channels * sizeof(uint16_t), + .x_stride = input_stride * sizeof(uint16_t), + .y_stride = output_stride, + .batch_size = batch_size, + .rminmax_ukernel = convert_op->rminmax_config->ukernel, + .convert_ukernel = convert_op->unary_elementwise_config->ukernel, + .init_params = convert_op->unary_elementwise_config->init, + }; + memcpy(&convert_op->context.f16_qd8_convert.params, &convert_op->params.f16_default, sizeof(convert_op->params.f16_default)); - union xnn_f32_minmax_params params; - if XNN_LIKELY(f32_clamp_config != NULL) { - assert(f32_clamp_config->init.f32_minmax != NULL); - f32_clamp_config->init.f32_minmax(¶ms, output_min, output_max); - } + convert_op->compute[0].type = xnn_parallelization_type_1d; + convert_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_f16_qd8_convert; + convert_op->compute[0].range[0] = batch_size; - return create_unary_elementwise_nc( - flags, unary_elementwise_config, 
/*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_clamp_nc_f32, clamp_op_out); + convert_op->compute[1].type = xnn_parallelization_type_1d; + convert_op->compute[1].task_1d = (pthreadpool_task_1d_t) xnn_compute_pad_qd8_params; + convert_op->compute[1].range[0] = 1; + + convert_op->state = xnn_run_state_needs_setup; + + return xnn_status_success; } -enum xnn_status xnn_create_clamp_nc_s8( - int8_t output_min, - int8_t output_max, - uint32_t flags, - xnn_operator_t* clamp_op_out) +enum xnn_status xnn_reshape_convert_nc_f32_qd8( + xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool) { - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%" PRId8 ", %" PRId8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_s8), output_min, output_max); + if (convert_op->type != xnn_operator_type_convert_nc_f32_qd8) { + xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qd8), + xnn_operator_type_to_string(convert_op->type)); return xnn_status_invalid_parameter; } + convert_op->state = xnn_run_state_invalid; - const struct xnn_unary_elementwise_config* s8_clamp_config = xnn_init_s8_clamp_config(); - assert(s8_clamp_config != NULL); + if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { + xnn_log_error("failed to setup %s operator: XNNPACK is not initialized", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qd8)); + return xnn_status_uninitialized; + } - struct xnn_s8_minmax_params params; - assert(s8_clamp_config->init.s8_minmax != NULL); - s8_clamp_config->init.s8_minmax(¶ms, output_min, output_max); + if (batch_size == 0) { + convert_op->state = xnn_run_state_skip; + return xnn_status_success; + } - return create_unary_elementwise_nc( - flags, s8_clamp_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_clamp_nc_s8, clamp_op_out); + convert_op->batch_size = batch_size; + + convert_op->context.f32_qd8_convert = (struct f32_qd8_convert_context) { + .n = channels * sizeof(float), + .x_stride = input_stride * sizeof(float), + .y_stride = output_stride, + .batch_size = batch_size, + .rminmax_ukernel = convert_op->rminmax_config->ukernel, + .convert_ukernel = convert_op->unary_elementwise_config->ukernel, + .init_params = convert_op->unary_elementwise_config->init, + }; + memcpy(&convert_op->context.f32_qd8_convert.params, &convert_op->params.f32_default, sizeof(convert_op->params.f32_default)); + + convert_op->compute[0].type = xnn_parallelization_type_1d; + convert_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_f32_qd8_convert; + convert_op->compute[0].range[0] = batch_size; + + convert_op->compute[1].type = xnn_parallelization_type_1d; + convert_op->compute[1].task_1d = (pthreadpool_task_1d_t) xnn_compute_pad_qd8_params; + convert_op->compute[1].range[0] = 1; + + convert_op->state = xnn_run_state_needs_setup; + + return xnn_status_success; } -enum xnn_status xnn_create_clamp_nc_u8( - uint8_t output_min, - uint8_t output_max, - uint32_t flags, - xnn_operator_t* clamp_op_out) -{ - if (output_min > output_max) { +enum xnn_status xnn_reshape_convert_nc_f32_qp8(xnn_operator_t convert_op, + size_t batch_size, + size_t channels, + size_t input_stride, + pthreadpool_t threadpool) { + if (convert_op->type != xnn_operator_type_convert_nc_f32_qp8) 
{ xnn_log_error( - "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8), output_min, output_max); + "failed to setup operator: operator type mismatch (expected %s, got " + "%s)", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qp8), + xnn_operator_type_to_string(convert_op->type)); return xnn_status_invalid_parameter; } + convert_op->state = xnn_run_state_invalid; - const struct xnn_unary_elementwise_config* u8_clamp_config = xnn_init_u8_clamp_config(); - assert(u8_clamp_config != NULL); + if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { + xnn_log_error( + "failed to setup %s operator: XNNPACK is not initialized", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qp8)); + return xnn_status_uninitialized; + } - struct xnn_u8_minmax_params params; - assert(u8_clamp_config->init.u8_minmax != NULL); - u8_clamp_config->init.u8_minmax(¶ms, output_min, output_max); + if (batch_size == 0) { + convert_op->state = xnn_run_state_skip; + return xnn_status_success; + } - return create_unary_elementwise_nc( - flags, u8_clamp_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_clamp_nc_u8, clamp_op_out); + convert_op->batch_size = batch_size; + + const struct xnn_gemm_config* gemm_config = + xnn_init_qp8_f32_qc4w_gemm_config(); + const uint32_t mr_packed = batch_size == 1 ? 1 : gemm_config->mr_packed; + const uint32_t kr = UINT32_C(1) << gemm_config->log2_kr; + const uint32_t sr = UINT32_C(1) << gemm_config->log2_sr; + + convert_op->context.f32_qp8_convert = (struct f32_qp8_convert_context){ + .m = batch_size, + .k = channels, + .mr = mr_packed, + .kr = kr, + .sr = sr, + .lhs_stride = input_stride * sizeof(float), + .packq_ukernel = (xnn_x8_packq_f32qp8_ukernel_fn) + convert_op->unary_elementwise_config->ukernel, + }; + + // TODO(b/340399245) - Ideally, this should parallelize along `batch` in + // groups of `mr`. 
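// Hypothetical shape of that TODO (illustrative only; the task function named
// here does not exist in XNNPACK):
//
//   convert_op->compute[0].type = xnn_parallelization_type_1d_tile_1d;
//   convert_op->compute[0].task_1d_tile_1d =
//       (pthreadpool_task_1d_tile_1d_t) xnn_compute_f32_qp8_convert_tiled;
//   convert_op->compute[0].range[0] = batch_size;  // all rows...
//   convert_op->compute[0].tile[0] = mr_packed;    // ...in groups of mr
//
// Each task would then pack rows [batch_start, batch_start + batch_tile).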
+ convert_op->compute[0].type = xnn_parallelization_type_1d; + convert_op->compute[0].task_1d = + (pthreadpool_task_1d_t)xnn_compute_f32_qp8_convert; + convert_op->compute[0].range[0] = batch_size; + + convert_op->state = xnn_run_state_needs_setup; + + return xnn_status_success; } -enum xnn_status xnn_create_convert_nc_f16_f32( - uint32_t flags, - xnn_operator_t* convert_op_out) +enum xnn_status xnn_reshape_copy_nc_x8( + xnn_operator_t copy_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool) { - const struct xnn_unary_elementwise_config* f16_to_f32_cvt_config = xnn_init_f16_to_f32_cvt_config(); - - return create_unary_elementwise_nc( - flags, f16_to_f32_cvt_config, /*rminmax_config=*/NULL, + return reshape_unary_elementwise_nc( + copy_op, xnn_operator_type_copy_nc_x8, + batch_size, + channels, input_stride, output_stride, + /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT8_T, + /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT8_T, /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_convert_nc_f16_f32, convert_op_out); + threadpool); } -enum xnn_status xnn_create_convert_nc_f32_f16( - uint32_t flags, - xnn_operator_t* convert_op_out) +enum xnn_status xnn_reshape_copy_nc_x16( + xnn_operator_t copy_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool) { - const struct xnn_unary_elementwise_config* f32_to_f16_cvt_config = xnn_init_f32_to_f16_cvt_config(); + return reshape_unary_elementwise_nc( + copy_op, xnn_operator_type_copy_nc_x16, + batch_size, + channels, input_stride, output_stride, + /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT16_T, + /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT16_T, + /*params=*/NULL, /*params_size=*/0, + threadpool); +} - return create_unary_elementwise_nc( - flags, f32_to_f16_cvt_config, /*rminmax_config=*/NULL, +enum xnn_status xnn_reshape_copy_nc_x32( + xnn_operator_t copy_op, + size_t batch_size, + size_t channels, + size_t input_stride, + size_t output_stride, + pthreadpool_t threadpool) +{ + return reshape_unary_elementwise_nc( + copy_op, xnn_operator_type_copy_nc_x32, + batch_size, + channels, input_stride, output_stride, + /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT32_T, + /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT32_T, /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_convert_nc_f32_f16, convert_op_out); + threadpool); } -enum xnn_status xnn_create_convert_nc_f32_qs8( - float output_scale, - int8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out) +enum xnn_status xnn_setup_convert_nc_f16_qd8( + xnn_operator_t convert_op, + const void* input, + int8_t* output, + struct xnn_quantization_params* quantization_params) { - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qs8), output_scale); + if (convert_op->type != xnn_operator_type_convert_nc_f16_qd8) { + xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f16_qd8), + xnn_operator_type_to_string(convert_op->type)); return xnn_status_invalid_parameter; } - const struct xnn_unary_elementwise_config* f32_to_qs8_cvt_config = xnn_init_f32_to_qs8_cvt_config(); - - struct xnn_f32_qs8_cvt_params params; - if XNN_LIKELY(f32_to_qs8_cvt_config != NULL) { - 
assert(f32_to_qs8_cvt_config->init.f32_qs8_cvt != NULL); - f32_to_qs8_cvt_config->init.f32_qs8_cvt(¶ms, 1.0f / output_scale, output_zero_point); + switch (convert_op->state) { + case xnn_run_state_skip: + return xnn_status_success; + case xnn_run_state_invalid: + xnn_log_error( + "failed to setup %s operator: operator has not been reshaped yet", + xnn_operator_type_to_string(convert_op->type)); + return xnn_status_invalid_state; + case xnn_run_state_needs_setup: + // Operator has been reshaped, but not setup, continue with setup. + case xnn_run_state_ready: + // Operator has been reshaped, and we are setting up with different pointers. + break; } - return create_unary_elementwise_nc( - flags, f32_to_qs8_cvt_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_f32_qs8, convert_op_out); + convert_op->context.f16_qd8_convert.x = input; + convert_op->context.f16_qd8_convert.y = output; + convert_op->context.f16_qd8_convert.quantization_params = (struct xnn_qd8_quantization_params*) quantization_params; + convert_op->state = xnn_run_state_ready; + + return xnn_status_success; } -enum xnn_status xnn_create_convert_nc_f16_qd8( - uint32_t flags, - xnn_operator_t* convert_op_out) +enum xnn_status xnn_setup_convert_nc_f32_qd8( + xnn_operator_t convert_op, + const float* input, + int8_t* output, + struct xnn_quantization_params* quantization_params) { - const struct xnn_reduce_config* f16_rminmax_config = xnn_init_f16_rminmax_config(); - if (f16_rminmax_config == NULL) { - xnn_log_error( - "failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f16_qd8)); - return xnn_status_unsupported_hardware; + if (convert_op->type != xnn_operator_type_convert_nc_f32_qd8) { + xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", + xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qd8), + xnn_operator_type_to_string(convert_op->type)); + return xnn_status_invalid_parameter; } - struct xnn_f16_default_params params; - if (f16_rminmax_config->init.f16_default != NULL) { - f16_rminmax_config->init.f16_default(¶ms); + switch (convert_op->state) { + case xnn_run_state_skip: + return xnn_status_success; + case xnn_run_state_invalid: + xnn_log_error( + "failed to setup %s operator: operator has not been reshaped yet", + xnn_operator_type_to_string(convert_op->type)); + return xnn_status_invalid_state; + case xnn_run_state_needs_setup: + // Operator has been reshaped, but not setup, continue with setup. + case xnn_run_state_ready: + // Operator has been reshaped, and we are setting up with different pointers. 
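// The setup functions here only re-point the context at new buffers; the
// actual per-row dynamic quantization runs inside the compute tasks wired up
// during reshape. A sketch of the general recipe (the exact arithmetic lives
// in the rminmax and convert microkernels and may differ):
//
//   float rmin = row[0], rmax = row[0];
//   for (size_t i = 1; i < n; i++) {   // rminmax ukernel equivalent
//     rmin = fminf(rmin, row[i]);
//     rmax = fmaxf(rmax, row[i]);
//   }
//   rmin = fminf(rmin, 0.0f);          // keep 0.0 exactly representable
//   rmax = fmaxf(rmax, 0.0f);
//   const float scale = (rmax - rmin) / 255.0f;
//   const float inv_scale = scale == 0.0f ? 0.0f : 1.0f / scale;
//   const int32_t zero_point = (int32_t) lrintf(-128.0f - rmin * inv_scale);
//   for (size_t i = 0; i < n; i++) {   // convert ukernel equivalent
//     const long v = lrintf(row[i] * inv_scale) + zero_point;
//     out[i] = (int8_t) math_min_s32(INT8_MAX, math_max_s32(INT8_MIN, v));
//   }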
+ break; } - return create_unary_elementwise_nc( - flags, xnn_init_f16_to_qs8_cvt_config(), f16_rminmax_config, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_f16_qd8, convert_op_out); + convert_op->context.f32_qd8_convert.x = input; + convert_op->context.f32_qd8_convert.y = output; + convert_op->context.f32_qd8_convert.quantization_params = (struct xnn_qd8_quantization_params*) quantization_params; + convert_op->state = xnn_run_state_ready; + + return xnn_status_success; } -enum xnn_status xnn_create_convert_nc_f32_qd8( - uint32_t flags, - xnn_operator_t* convert_op_out) -{ - const struct xnn_reduce_config* f32_rminmax_config = xnn_init_f32_rminmax_config(); - if (f32_rminmax_config == NULL) { - xnn_log_error( - "failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qd8)); - return xnn_status_unsupported_hardware; - } - - struct xnn_f32_default_params params; - if (f32_rminmax_config->init.f32_default != NULL) { - f32_rminmax_config->init.f32_default(¶ms); - } - - return create_unary_elementwise_nc( - flags, xnn_init_f32_to_qs8_cvt_config(), f32_rminmax_config, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_f32_qd8, convert_op_out); -} - -enum xnn_status xnn_create_convert_nc_f32_qp8(uint32_t flags, - xnn_operator_t* convert_op_out) { - const struct xnn_reduce_config* f32_rminmax_config = - xnn_init_f32_rminmax_config(); - if (f32_rminmax_config == NULL) { - xnn_log_error( - "failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qp8)); - return xnn_status_unsupported_hardware; - } - - struct xnn_f32_default_params params; - if (f32_rminmax_config->init.f32_default != NULL) { - f32_rminmax_config->init.f32_default(¶ms); - } - - return create_unary_elementwise_nc( - flags, xnn_init_f32_to_qp8_cvt_config(), f32_rminmax_config, ¶ms, - sizeof(params), xnn_operator_type_convert_nc_f32_qp8, convert_op_out); -} - -enum xnn_status xnn_create_convert_nc_f32_qu8( - float output_scale, - uint8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out) -{ - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* f32_to_qu8_cvt_config = xnn_init_f32_to_qu8_cvt_config(); - - struct xnn_f32_qu8_cvt_params params; - if XNN_LIKELY(f32_to_qu8_cvt_config != NULL) { - assert(f32_to_qu8_cvt_config->init.f32_qu8_cvt != NULL); - f32_to_qu8_cvt_config->init.f32_qu8_cvt(¶ms, 1.0f / output_scale, output_zero_point); - } - - return create_unary_elementwise_nc( - flags, f32_to_qu8_cvt_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_f32_qu8, convert_op_out); -} - -enum xnn_status xnn_create_convert_nc_qs8( - float input_scale, - int8_t input_zero_point, - float output_scale, - int8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out) -{ - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs8), input_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || 
!isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - const float input_output_scale = input_scale / output_scale; - if (input_output_scale < 0x1.0p-8f || input_output_scale > 0x1.0p+7f) { - xnn_log_error( - "failed to create %s operator with %.7g input-to-output scale ratio: scale ratio must be in [2**-8, 2**7] range", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs8), input_output_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* qs8_cvt_config = xnn_init_qs8_cvt_config(); - assert(qs8_cvt_config != NULL); - - struct xnn_qs8_cvt_params params; - assert(qs8_cvt_config->init.qs8_cvt != NULL); - qs8_cvt_config->init.qs8_cvt(¶ms, input_output_scale, input_zero_point, output_zero_point); - - return create_unary_elementwise_nc( - flags, qs8_cvt_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_qs8, convert_op_out); -} - -enum xnn_status xnn_create_convert_nc_qs8_f16( - float input_scale, - int8_t input_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out) -{ - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs8_f16), input_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* qs8_to_f16_cvt_config = xnn_init_qs8_to_f16_cvt_config(); - - const xnn_float16 fp16_input_scale = xnn_float16_from_float(input_scale); - - struct xnn_qs8_f16_cvt_params params; - if XNN_LIKELY(qs8_to_f16_cvt_config != NULL) { - assert(qs8_to_f16_cvt_config->init.qs8_f16_cvt != NULL); - qs8_to_f16_cvt_config->init.qs8_f16_cvt(¶ms, fp16_input_scale, input_zero_point); - } - - return create_unary_elementwise_nc( - flags, qs8_to_f16_cvt_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_qs8_f16, convert_op_out); -} - -enum xnn_status xnn_create_convert_nc_qs8_f32( - float input_scale, - int8_t input_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out) -{ - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs8_f32), input_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* qs8_to_f32_cvt_config = xnn_init_qs8_to_f32_cvt_config(); - - struct xnn_qs8_f32_cvt_params params; - if XNN_LIKELY(qs8_to_f32_cvt_config != NULL) { - assert(qs8_to_f32_cvt_config->init.qs8_f32_cvt != NULL); - qs8_to_f32_cvt_config->init.qs8_f32_cvt(¶ms, input_scale, input_zero_point); - } - - return create_unary_elementwise_nc( - flags, qs8_to_f32_cvt_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_qs8_f32, convert_op_out); -} - -enum xnn_status xnn_create_convert_nc_qs16_qs8( - float input_scale, - float output_scale, - int8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out) -{ - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, 
and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs16_qs8), input_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs16_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - const float input_output_scale = input_scale / output_scale; - if (input_output_scale < 0x1.0p-16f || input_output_scale > 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input-to-output scale ratio: scale ratio must be in [2**-16, 2**8] range", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs16_qs8), input_output_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* qs16_to_qs8_cvt_config = xnn_init_qs16_to_qs8_cvt_config(); - assert(qs16_to_qs8_cvt_config != NULL); - - struct xnn_qs16_qs8_cvt_params params; - assert(qs16_to_qs8_cvt_config->init.qs16_qs8_cvt != NULL); - qs16_to_qs8_cvt_config->init.qs16_qs8_cvt(¶ms, input_output_scale, output_zero_point); - - return create_unary_elementwise_nc( - flags, qs16_to_qs8_cvt_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_qs16_qs8, convert_op_out); -} - -enum xnn_status xnn_create_convert_nc_qu8( - float input_scale, - uint8_t input_zero_point, - float output_scale, - uint8_t output_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out) -{ - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qu8), input_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - const float input_output_scale = input_scale / output_scale; - if (input_output_scale < 0x1.0p-8f || input_output_scale > 0x1.0p+7f) { - xnn_log_error( - "failed to create %s operator with %.7g input-to-output scale ratio: scale ratio must be in [2**-8, 2**7] range", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qu8), input_output_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* qu8_cvt_config = xnn_init_qu8_cvt_config(); - assert(qu8_cvt_config != NULL); - - struct xnn_qu8_cvt_params params; - assert(qu8_cvt_config->init.qu8_cvt != NULL); - qu8_cvt_config->init.qu8_cvt(¶ms, input_output_scale, input_zero_point, output_zero_point); - - return create_unary_elementwise_nc( - flags, qu8_cvt_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_qu8, convert_op_out); -} - -enum xnn_status xnn_create_convert_nc_qu8_f32( - float input_scale, - uint8_t input_zero_point, - uint32_t flags, - xnn_operator_t* convert_op_out) -{ - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qu8_f32), input_scale); - return xnn_status_invalid_parameter; - 
} - - const struct xnn_unary_elementwise_config* qu8_to_f32_cvt_config = xnn_init_qu8_to_f32_cvt_config(); - - struct xnn_qu8_f32_cvt_params params; - if XNN_LIKELY(qu8_to_f32_cvt_config != NULL) { - assert(qu8_to_f32_cvt_config->init.qu8_f32_cvt != NULL); - qu8_to_f32_cvt_config->init.qu8_f32_cvt(¶ms, input_scale, input_zero_point); - } - - return create_unary_elementwise_nc( - flags, qu8_to_f32_cvt_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_convert_nc_qu8_f32, convert_op_out); -} - -enum xnn_status xnn_create_copy_nc_x8( - uint32_t flags, - xnn_operator_t* copy_op_out) -{ - return create_unary_elementwise_nc( - flags, xnn_init_xx_copy_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_copy_nc_x8, copy_op_out); -} - -enum xnn_status xnn_create_copy_nc_x16( - uint32_t flags, - xnn_operator_t* copy_op_out) -{ - return create_unary_elementwise_nc( - flags, xnn_init_xx_copy_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_copy_nc_x16, copy_op_out); -} - -enum xnn_status xnn_create_copy_nc_x32( - uint32_t flags, - xnn_operator_t* copy_op_out) -{ - return create_unary_elementwise_nc( - flags, xnn_init_xx_copy_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_copy_nc_x32, copy_op_out); -} - -enum xnn_status xnn_create_elu_nc_f16( - float alpha, - uint32_t flags, - xnn_operator_t* elu_op_out) -{ - const xnn_float16 alpha_as_half = xnn_float16_from_float(alpha); - alpha = xnn_float16_to_float(alpha_as_half); - if (alpha <= 0.0f || !isnormal(alpha)) { - xnn_log_error( - "failed to create %s operator with %.7g alpha parameter: alpha must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_elu_nc_f16), alpha); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* f16_elu_config = xnn_init_f16_elu_config(); - - struct xnn_f16_elu_params params; - if XNN_LIKELY(f16_elu_config != NULL) { - assert(f16_elu_config->init.f16_elu != NULL); - f16_elu_config->init.f16_elu(¶ms, - /*prescale=*/xnn_float16_from_float(1.0f), alpha_as_half, /*beta=*/xnn_float16_from_float(1.0f)); - } - - return create_unary_elementwise_nc( - flags, f16_elu_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_elu_nc_f16, elu_op_out); -} - -enum xnn_status xnn_create_elu_nc_f32( - float alpha, - uint32_t flags, - xnn_operator_t* elu_op_out) -{ - if (alpha <= 0.0f || !isnormal(alpha)) { - xnn_log_error( - "failed to create %s operator with %.7g alpha parameter: alpha must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_elu_nc_f32), alpha); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* f32_elu_config = xnn_init_f32_elu_config(); - - struct xnn_f32_elu_params params; - if XNN_LIKELY(f32_elu_config != NULL) { - assert(f32_elu_config->init.f32_elu != NULL); - f32_elu_config->init.f32_elu(¶ms, 1.0f /* prescale */, alpha, 1.0f /* beta */); - } - - return create_unary_elementwise_nc( - flags, f32_elu_config, /*rminmax_config=*/NULL, - ¶ms, sizeof(params), - xnn_operator_type_elu_nc_f32, elu_op_out); -} - -enum xnn_status xnn_create_floor_nc_f16( - uint32_t flags, - xnn_operator_t* floor_op_out) -{ - return create_unary_elementwise_nc( - flags, xnn_init_f16_rndd_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_floor_nc_f16, floor_op_out); -} - -enum xnn_status 
- -enum xnn_status xnn_create_floor_nc_f16( - uint32_t flags, - xnn_operator_t* floor_op_out) -{ - return create_unary_elementwise_nc( - flags, xnn_init_f16_rndd_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_floor_nc_f16, floor_op_out); -} - -enum xnn_status xnn_create_floor_nc_f32( - uint32_t flags, - xnn_operator_t* floor_op_out) -{ - const struct xnn_unary_elementwise_config* f32_rndd_config = xnn_init_f32_rndd_config(); - - struct xnn_f32_rnd_params params; - if XNN_LIKELY(f32_rndd_config != NULL && f32_rndd_config->init.f32_rnd != NULL) { - f32_rndd_config->init.f32_rnd(&params); - } - - return create_unary_elementwise_nc( - flags, f32_rndd_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_floor_nc_f32, floor_op_out); -} - -enum xnn_status xnn_create_gelu_nc_f32(uint32_t flags, - xnn_operator_t* gelu_op_out) { - const struct xnn_unary_elementwise_config* f32_gelu_config = - xnn_init_f32_gelu_config(); - - struct xnn_f32_default_params params; - if XNN_LIKELY (f32_gelu_config != NULL) { - if (f32_gelu_config->init.f32_default != NULL) { - f32_gelu_config->init.f32_default(&params); - } - } - - return create_unary_elementwise_nc( - flags, f32_gelu_config, /*rminmax_config=*/NULL, &params, sizeof(params), - xnn_operator_type_gelu_nc_f32, gelu_op_out); -} - -enum xnn_status xnn_create_hardswish_nc_f16( - uint32_t flags, - xnn_operator_t* hardswish_op_out) -{ - const struct xnn_unary_elementwise_config* f16_hswish_config = xnn_init_f16_hswish_config(); - - struct xnn_f16_hswish_params params; - if XNN_LIKELY(f16_hswish_config != NULL && f16_hswish_config->init.f16_hswish != NULL) { - f16_hswish_config->init.f16_hswish(&params); - } - - return create_unary_elementwise_nc( - flags, f16_hswish_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_hardswish_nc_f16, hardswish_op_out); -} - -enum xnn_status xnn_create_hardswish_nc_f32( - uint32_t flags, - xnn_operator_t* hardswish_op_out) -{ - const struct xnn_unary_elementwise_config* f32_hswish_config = xnn_init_f32_hswish_config(); - - struct xnn_f32_hswish_params params; - if XNN_LIKELY(f32_hswish_config != NULL && f32_hswish_config->init.f32_hswish != NULL) { - f32_hswish_config->init.f32_hswish(&params); - } - - return create_unary_elementwise_nc( - flags, f32_hswish_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_hardswish_nc_f32, hardswish_op_out); -} - -enum xnn_status xnn_create_leaky_relu_nc_f16( - float negative_slope, - uint32_t flags, - xnn_operator_t* leaky_relu_op_out) -{ - const xnn_float16 negative_slope_as_half = xnn_float16_from_float(negative_slope); - negative_slope = xnn_float16_to_float(negative_slope_as_half); - if (!isfinite(negative_slope)) { - xnn_log_error( - "failed to create %s operator with %f negative slope: finite number expected", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_f16), - negative_slope); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* f16_lrelu_config = xnn_init_f16_lrelu_config(); - - struct xnn_f16_lrelu_params params; - if XNN_LIKELY(f16_lrelu_config != NULL) { - assert(f16_lrelu_config->init.f16_lrelu != NULL); - f16_lrelu_config->init.f16_lrelu(&params, negative_slope_as_half); - } - - return create_unary_elementwise_nc( - flags, f16_lrelu_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_leaky_relu_nc_f16, leaky_relu_op_out); -} - -enum xnn_status xnn_create_leaky_relu_nc_f32( - float negative_slope, - uint32_t flags, - xnn_operator_t* leaky_relu_op_out) -{ - if (!isfinite(negative_slope)) { - xnn_log_error( - "failed to create %s operator with %f negative slope: finite number expected", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_f32), - negative_slope); - return xnn_status_invalid_parameter; -
} - - const struct xnn_unary_elementwise_config* f32_lrelu_config = xnn_init_f32_lrelu_config(); - - struct xnn_f32_lrelu_params params; - if XNN_LIKELY(f32_lrelu_config != NULL) { - assert(f32_lrelu_config->init.f32_lrelu != NULL); - f32_lrelu_config->init.f32_lrelu(&params, negative_slope); - } - - return create_unary_elementwise_nc( - flags, f32_lrelu_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_leaky_relu_nc_f32, leaky_relu_op_out); -} - -enum xnn_status xnn_create_log_nc_f32( - uint32_t flags, - xnn_operator_t* log_op_out) -{ - const struct xnn_unary_elementwise_config* f32_log_config = xnn_init_f32_log_config(); - - struct xnn_f32_default_params params; - if XNN_LIKELY(f32_log_config != NULL) { - if (f32_log_config->init.f32_default != NULL) { - f32_log_config->init.f32_default(&params); - } - } - - return create_unary_elementwise_nc( - flags, f32_log_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_log_nc_f32, log_op_out); -} - -enum xnn_status xnn_reshape_log_nc_f32( - xnn_operator_t log_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - log_op, xnn_operator_type_log_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &log_op->params.f32_default, sizeof(log_op->params.f32_default), - threadpool); -} - -enum xnn_status xnn_setup_log_nc_f32( - xnn_operator_t log_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - log_op, xnn_operator_type_log_nc_f32, - input, output); -}
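For reference, every per-op entry point deleted in this file followed the same four-step lifecycle, with reshape split from setup so shapes can change without re-creating the operator. A sketch using the log operator above (error handling elided; assumes XNNPACK has been initialized and NULL means single-threaded):

    // Lifecycle of the removed per-op API: create -> reshape -> setup -> run.
    xnn_operator_t log_op = NULL;
    xnn_create_log_nc_f32(/*flags=*/0, &log_op);
    xnn_reshape_log_nc_f32(log_op, /*batch_size=*/1, /*channels=*/1024,
                           /*input_stride=*/1024, /*output_stride=*/1024,
                           /*threadpool=*/NULL);
    xnn_setup_log_nc_f32(log_op, input, output);  // raw float pointers
    xnn_run_operator(log_op, /*threadpool=*/NULL);
    xnn_delete_operator(log_op);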
- -enum xnn_status xnn_create_leaky_relu_nc_qs8( - float negative_slope, - int8_t input_zero_point, - float input_scale, - int8_t output_zero_point, - float output_scale, - uint32_t flags, - xnn_operator_t* leaky_relu_op_out) -{ - if (!isfinite(negative_slope)) { - xnn_log_error( - "failed to create %s operator with %f negative slope: finite number expected", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qs8), - negative_slope); - return xnn_status_invalid_parameter; - } - - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qs8), input_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - const float positive_input_output_scale = input_scale / output_scale; - if (positive_input_output_scale < 0x1.0p-8f || positive_input_output_scale > 0x1.0p+7f) { - xnn_log_error( - "failed to create %s operator with %.7g positive-input-to-output scale ratio: scale ratio must be in [2**-8, 2**7] range", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qs8), positive_input_output_scale); - return xnn_status_invalid_parameter; - } - - const float negative_input_output_scale = positive_input_output_scale * negative_slope; - if (negative_input_output_scale < -0x1.FFFC00p+6f || negative_input_output_scale > 0x1.0p+7f) { - xnn_log_error( - "failed to create %s operator with %.7g negative-input-to-output scale ratio: scale ratio must be in (-2**7, 2**7] range", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qs8), negative_input_output_scale); - return xnn_status_invalid_parameter; - } - - if (fabsf(negative_input_output_scale) < 0x1.0p-8f) { - xnn_log_error( - "failed to create %s operator with %.7g negative-input-to-output scale ratio: scale ratio must be at least 2**-8 in absolute value", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qs8), negative_input_output_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* qs8_lrelu_config = xnn_init_qs8_lrelu_config(); - assert(qs8_lrelu_config != NULL); - - struct xnn_qs8_lrelu_params params; - assert(qs8_lrelu_config->init.qs8_lrelu != NULL); - qs8_lrelu_config->init.qs8_lrelu(&params, positive_input_output_scale, negative_input_output_scale, input_zero_point, output_zero_point); - - return create_unary_elementwise_nc( - flags, qs8_lrelu_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_leaky_relu_nc_qs8, leaky_relu_op_out); -} - -enum xnn_status xnn_create_leaky_relu_nc_qu8( - float negative_slope, - uint8_t input_zero_point, - float input_scale, - uint8_t output_zero_point, - float output_scale, - uint32_t flags, - xnn_operator_t* leaky_relu_op_out) -{ - if (!isfinite(negative_slope)) { - xnn_log_error( - "failed to create %s operator with %f negative slope: finite number expected", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qu8), - negative_slope); - return xnn_status_invalid_parameter; - } - - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qu8), input_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - const float positive_input_output_scale = input_scale / output_scale; - if (positive_input_output_scale < 0x1.0p-8f || positive_input_output_scale > 0x1.0p+7f) { - xnn_log_error( - "failed to create %s operator with %.7g positive-input-to-output scale ratio: scale ratio must be in [2**-8, 2**7] range", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qu8), positive_input_output_scale); - return xnn_status_invalid_parameter; - } - - const float negative_input_output_scale = positive_input_output_scale * negative_slope; - if (negative_input_output_scale < -0x1.FFFC00p+6f || negative_input_output_scale > 0x1.0p+7f) { - xnn_log_error( - "failed to create %s operator with %.7g negative-input-to-output scale ratio: scale ratio must be in (-2**7, 2**7] range", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qu8), negative_input_output_scale); - return xnn_status_invalid_parameter; - } - - if (fabsf(negative_input_output_scale) < 0x1.0p-8f) { - xnn_log_error( - "failed to create %s operator with %.7g negative-input-to-output scale ratio: scale ratio must be at least 2**-8 in absolute value", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_qu8), negative_input_output_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* qu8_lrelu_config = xnn_init_qu8_lrelu_config(); - assert(qu8_lrelu_config != NULL); - - struct xnn_qu8_lrelu_params params; - assert(qu8_lrelu_config->init.qu8_lrelu != NULL); - qu8_lrelu_config->init.qu8_lrelu(&params, positive_input_output_scale, negative_input_output_scale, input_zero_point, output_zero_point); - - return create_unary_elementwise_nc( - flags, qu8_lrelu_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_leaky_relu_nc_qu8, leaky_relu_op_out); -}
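Both quantized leaky-relu creators fold quantization into two fixed ratios: positive_input_output_scale = input_scale / output_scale for non-negative inputs, and negative_input_output_scale = that ratio times negative_slope for negative inputs, each range-checked so the fixed-point kernels can represent it. A worked check under illustrative values:

    // input_scale = 0.1f, output_scale = 0.2f, negative_slope = 0.01f
    const float pos = 0.1f / 0.2f;   // 0.5f, inside [2**-8, 2**7]
    const float neg = pos * 0.01f;   // 0.005f
    // |neg| >= 2**-8 (~0.0039f) and neg <= 2**7, so creation succeeds;
    // negative_slope = 0.001f would give |neg| = 0.0005f and be rejected.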
- -enum xnn_status xnn_create_negate_nc_f16( - uint32_t flags, - xnn_operator_t* negate_op_out) -{ - const struct xnn_unary_elementwise_config* f16_neg_config = xnn_init_f16_neg_config(); - - struct xnn_f16_default_params params; - if XNN_LIKELY(f16_neg_config != NULL && f16_neg_config->init.f16_default != NULL) { - f16_neg_config->init.f16_default(&params); - } - - return create_unary_elementwise_nc( - flags, f16_neg_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_negate_nc_f16, negate_op_out); -} - -enum xnn_status xnn_create_negate_nc_f32( - uint32_t flags, - xnn_operator_t* negate_op_out) -{ - const struct xnn_unary_elementwise_config* f32_neg_config = xnn_init_f32_neg_config(); - - struct xnn_f32_default_params params; - if XNN_LIKELY(f32_neg_config != NULL && f32_neg_config->init.f32_default != NULL) { - f32_neg_config->init.f32_default(&params); - } - - return create_unary_elementwise_nc( - flags, f32_neg_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_negate_nc_f32, negate_op_out); -} - -enum xnn_status xnn_create_sigmoid_nc_f16( - uint32_t flags, - xnn_operator_t* sigmoid_op_out) -{ - const struct xnn_unary_elementwise_config* f16_sigmoid_config = xnn_init_f16_sigmoid_config(); - - struct xnn_f16_sigmoid_params params; - if XNN_LIKELY(f16_sigmoid_config != NULL && f16_sigmoid_config->init.f16_sigmoid != NULL) { - f16_sigmoid_config->init.f16_sigmoid(&params); - } - - return create_unary_elementwise_nc( - flags, f16_sigmoid_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_sigmoid_nc_f16, sigmoid_op_out); -} - -enum xnn_status xnn_create_sigmoid_nc_f32( - uint32_t flags, - xnn_operator_t* sigmoid_op_out) -{ - const struct xnn_unary_elementwise_config* f32_sigmoid_config = xnn_init_f32_sigmoid_config(); - - struct xnn_f32_sigmoid_params params; - if XNN_LIKELY(f32_sigmoid_config != NULL && f32_sigmoid_config->init.f32_sigmoid != NULL) { - f32_sigmoid_config->init.f32_sigmoid(&params); - } - - return create_unary_elementwise_nc( - flags, f32_sigmoid_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_sigmoid_nc_f32, sigmoid_op_out); -} - -enum xnn_status xnn_create_square_nc_f16( - uint32_t flags, - xnn_operator_t* square_op_out) -{ - return create_unary_elementwise_nc( - flags, xnn_init_f16_sqr_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_square_nc_f16, square_op_out); -} - -enum xnn_status xnn_create_square_nc_f32( - uint32_t flags, - xnn_operator_t* square_op_out) -{ - const struct xnn_unary_elementwise_config* f32_sqr_config = xnn_init_f32_sqr_config(); - - struct xnn_f32_default_params params; - if XNN_LIKELY(f32_sqr_config != NULL && f32_sqr_config->init.f32_default != NULL) { - f32_sqr_config->init.f32_default(&params); - } - - return create_unary_elementwise_nc( - flags, f32_sqr_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_square_nc_f32, square_op_out); -} - -enum xnn_status xnn_create_square_root_nc_f16( - uint32_t flags, - xnn_operator_t* sqrt_op_out)
-{ - return create_unary_elementwise_nc( - flags, xnn_init_f16_sqrt_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_square_root_nc_f16, sqrt_op_out); -} - -enum xnn_status xnn_create_square_root_nc_f32( - uint32_t flags, - xnn_operator_t* sqrt_op_out) -{ - const struct xnn_unary_elementwise_config* f32_sqrt_config = xnn_init_f32_sqrt_config(); - - struct xnn_f32_sqrt_params params; - if XNN_LIKELY(f32_sqrt_config != NULL && f32_sqrt_config->init.f32_sqrt != NULL) { - f32_sqrt_config->init.f32_sqrt(&params); - } - - return create_unary_elementwise_nc( - flags, f32_sqrt_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_square_root_nc_f32, sqrt_op_out); -} - -enum xnn_status xnn_create_reciprocal_square_root_nc_f16( - uint32_t flags, - xnn_operator_t* rsqrt_op_out) -{ - return create_unary_elementwise_nc( - flags, xnn_init_f16_rsqrt_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_reciprocal_square_root_nc_f16, rsqrt_op_out); -} - -enum xnn_status xnn_create_reciprocal_square_root_nc_f32( - uint32_t flags, xnn_operator_t* rsqrt_op_out) { - const struct xnn_unary_elementwise_config* f32_rsqrt_config = - xnn_init_f32_rsqrt_config(); - - struct xnn_f32_rsqrt_params params; - if XNN_LIKELY (f32_rsqrt_config != NULL && - f32_rsqrt_config->init.f32_rsqrt != NULL) { - f32_rsqrt_config->init.f32_rsqrt(&params); - } - - return create_unary_elementwise_nc( - flags, f32_rsqrt_config, /*rminmax_config=*/NULL, &params, sizeof(params), - xnn_operator_type_reciprocal_square_root_nc_f32, rsqrt_op_out); -} - -enum xnn_status xnn_create_tanh_nc_f16( - uint32_t flags, - xnn_operator_t* tanh_op_out) -{ - const struct xnn_unary_elementwise_config* f16_tanh_config = xnn_init_f16_tanh_config(); - - union xnn_f16_tanh_params params; - if XNN_LIKELY(f16_tanh_config != NULL && f16_tanh_config->init.f16_tanh != NULL) { - f16_tanh_config->init.f16_tanh(&params); - } - - return create_unary_elementwise_nc( - flags, f16_tanh_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_tanh_nc_f16, tanh_op_out); -} - -enum xnn_status xnn_create_tanh_nc_f32( - uint32_t flags, - xnn_operator_t* tanh_op_out) -{ - const struct xnn_unary_elementwise_config* f32_tanh_config = xnn_init_f32_tanh_config(); - - union xnn_f32_tanh_params params; - if XNN_LIKELY(f32_tanh_config != NULL && f32_tanh_config->init.f32_tanh != NULL) { - f32_tanh_config->init.f32_tanh(&params); - } - - return create_unary_elementwise_nc( - flags, f32_tanh_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_tanh_nc_f32, tanh_op_out); -} - -enum xnn_status xnn_create_truncation_nc_f16( - uint32_t flags, - xnn_operator_t* truncation_op_out) -{ - return create_unary_elementwise_nc( - flags, xnn_init_f16_rndz_config(), /*rminmax_config=*/NULL, - /*params=*/NULL, /*params_size=*/0, - xnn_operator_type_truncation_nc_f16, truncation_op_out); -} - -enum xnn_status xnn_create_truncation_nc_f32( - uint32_t flags, - xnn_operator_t* truncation_op_out) -{ - const struct xnn_unary_elementwise_config* f32_rndz_config = xnn_init_f32_rndz_config(); - - struct xnn_f32_rnd_params params; - if XNN_LIKELY(f32_rndz_config != NULL && f32_rndz_config->init.f32_rnd != NULL) { - f32_rndz_config->init.f32_rnd(&params); - } - - return create_unary_elementwise_nc( - flags, f32_rndz_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_truncation_nc_f32, truncation_op_out); -} - -enum xnn_status xnn_reshape_abs_nc_f16( - xnn_operator_t
abs_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - abs_op, xnn_operator_type_abs_nc_f16, - batch_size, - channels, - input_stride, - output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &abs_op->params.f16_default, sizeof(abs_op->params.f16_default), - threadpool); -} - -enum xnn_status xnn_reshape_abs_nc_f32( - xnn_operator_t abs_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - abs_op, xnn_operator_type_abs_nc_f32, - batch_size, - channels, - input_stride, - output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &abs_op->params.f32_default, sizeof(abs_op->params.f32_default), - threadpool); -} - -enum xnn_status xnn_reshape_bankers_rounding_nc_f16( - xnn_operator_t rounding_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - rounding_op, xnn_operator_type_bankers_rounding_nc_f16, - batch_size, - channels, - input_stride, - output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_bankers_rounding_nc_f32( - xnn_operator_t rounding_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - rounding_op, xnn_operator_type_bankers_rounding_nc_f32, - batch_size, - channels, - input_stride, - output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &rounding_op->params.f32_rnd, sizeof(rounding_op->params.f32_rnd), - threadpool); -} - -enum xnn_status xnn_reshape_ceiling_nc_f16( - xnn_operator_t ceiling_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - ceiling_op, xnn_operator_type_ceiling_nc_f16, - batch_size, - channels, - input_stride, - output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_ceiling_nc_f32( - xnn_operator_t ceiling_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - ceiling_op, xnn_operator_type_ceiling_nc_f32, - batch_size, - channels, - input_stride, - output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &ceiling_op->params.f32_rnd, sizeof(ceiling_op->params.f32_rnd), - threadpool); -} - -enum xnn_status xnn_reshape_clamp_nc_f16( - xnn_operator_t clamp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - clamp_op, xnn_operator_type_clamp_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &clamp_op->params.f16_minmax, sizeof(clamp_op->params.f16_minmax), - threadpool); -} - -enum xnn_status xnn_reshape_clamp_nc_f32( - xnn_operator_t 
clamp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - clamp_op, xnn_operator_type_clamp_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &clamp_op->params.f32_minmax, sizeof(clamp_op->params.f32_minmax), - threadpool); -} - -enum xnn_status xnn_reshape_clamp_nc_s8( - xnn_operator_t clamp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - clamp_op, xnn_operator_type_clamp_nc_s8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_INT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_INT8_T, - &clamp_op->params.s8_minmax, sizeof(clamp_op->params.s8_minmax), - threadpool); -} - -enum xnn_status xnn_reshape_clamp_nc_u8( - xnn_operator_t clamp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - clamp_op, xnn_operator_type_clamp_nc_u8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT8_T, - &clamp_op->params.u8_minmax, sizeof(clamp_op->params.u8_minmax), - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_f16_f32( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_f16_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_f32_f16( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_f32_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_f16_qd8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - if (convert_op->type != xnn_operator_type_convert_nc_f16_qd8) { - xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f16_qd8), - xnn_operator_type_to_string(convert_op->type)); - return xnn_status_invalid_parameter; - } - convert_op->state = xnn_run_state_invalid; - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to setup %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f16_qd8)); - return xnn_status_uninitialized; - } - - if (batch_size == 0) { - convert_op->state = xnn_run_state_skip; - return xnn_status_success; - } - - convert_op->batch_size = batch_size; - - convert_op->context.f16_qd8_convert = (struct f16_qd8_convert_context) { - .n = channels * sizeof(uint16_t), - .x_stride = input_stride * sizeof(uint16_t), - 
.y_stride = output_stride, - .batch_size = batch_size, - .rminmax_ukernel = convert_op->rminmax_config->ukernel, - .convert_ukernel = convert_op->unary_elementwise_config->ukernel, - .init_params = convert_op->unary_elementwise_config->init.f16_qs8_cvt, - }; - memcpy(&convert_op->context.f16_qd8_convert.params, &convert_op->params.f16_default, sizeof(convert_op->params.f16_default)); - - convert_op->compute[0].type = xnn_parallelization_type_1d; - convert_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_f16_qd8_convert; - convert_op->compute[0].range[0] = batch_size; - - convert_op->compute[1].type = xnn_parallelization_type_1d; - convert_op->compute[1].task_1d = (pthreadpool_task_1d_t) xnn_compute_pad_qd8_params; - convert_op->compute[1].range[0] = 1; - - convert_op->state = xnn_run_state_needs_setup; - - return xnn_status_success; -} - -enum xnn_status xnn_reshape_convert_nc_f32_qd8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - if (convert_op->type != xnn_operator_type_convert_nc_f32_qd8) { - xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qd8), - xnn_operator_type_to_string(convert_op->type)); - return xnn_status_invalid_parameter; - } - convert_op->state = xnn_run_state_invalid; - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to setup %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qd8)); - return xnn_status_uninitialized; - } - - if (batch_size == 0) { - convert_op->state = xnn_run_state_skip; - return xnn_status_success; - } - - convert_op->batch_size = batch_size; - - convert_op->context.f32_qd8_convert = (struct f32_qd8_convert_context) { - .n = channels * sizeof(float), - .x_stride = input_stride * sizeof(float), - .y_stride = output_stride, - .batch_size = batch_size, - .rminmax_ukernel = convert_op->rminmax_config->ukernel, - .convert_ukernel = convert_op->unary_elementwise_config->ukernel, - .init_params = convert_op->unary_elementwise_config->init.f32_qs8_cvt, - }; - memcpy(&convert_op->context.f32_qd8_convert.params, &convert_op->params.f32_default, sizeof(convert_op->params.f32_default)); - - convert_op->compute[0].type = xnn_parallelization_type_1d; - convert_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_f32_qd8_convert; - convert_op->compute[0].range[0] = batch_size; - - convert_op->compute[1].type = xnn_parallelization_type_1d; - convert_op->compute[1].task_1d = (pthreadpool_task_1d_t) xnn_compute_pad_qd8_params; - convert_op->compute[1].range[0] = 1; - - convert_op->state = xnn_run_state_needs_setup; - - return xnn_status_success; -}
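The qd8 reshape paths above stash two microkernels in the convert context: an rminmax kernel that scans each batch row for its value range, and a convert kernel that quantizes the row using parameters derived from that range; a second single-element task (xnn_compute_pad_qd8_params) then pads the per-row quantization params. Roughly, each per-row task does something like the following illustrative scalar model (not the actual ukernels; XNNPACK's exact scale/zero-point derivation may differ):

    #include <math.h>
    #include <stddef.h>
    #include <stdint.h>

    // Illustrative per-row dynamic quantization: find the row's range, derive
    // a scale/zero-point, then quantize to int8. Assumes n > 0.
    static void qd8_row_sketch(const float* x, int8_t* y, size_t n,
                               float* out_scale, int32_t* out_zero_point) {
      float min = x[0], max = x[0];
      for (size_t i = 1; i < n; i++) {   // the rminmax pass
        min = fminf(min, x[i]);
        max = fmaxf(max, x[i]);
      }
      min = fminf(min, 0.0f);            // keep 0.0f exactly representable
      max = fmaxf(max, 0.0f);
      const float scale = (max - min) / 255.0f;
      const float inv_scale = scale > 0.0f ? 1.0f / scale : 0.0f;
      const float zero_point = -128.0f - min * inv_scale;
      for (size_t i = 0; i < n; i++) {   // the convert pass
        const float v = fminf(fmaxf(x[i] * inv_scale + zero_point, -128.0f), 127.0f);
        y[i] = (int8_t) lrintf(v);
      }
      *out_scale = scale;
      *out_zero_point = (int32_t) lrintf(zero_point);
    }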
- -enum xnn_status xnn_reshape_convert_nc_f32_qp8(xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - pthreadpool_t threadpool) { - if (convert_op->type != xnn_operator_type_convert_nc_f32_qp8) { - xnn_log_error( - "failed to setup operator: operator type mismatch (expected %s, got " - "%s)", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qp8), - xnn_operator_type_to_string(convert_op->type)); - return xnn_status_invalid_parameter; - } - convert_op->state = xnn_run_state_invalid; - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error( - "failed to setup %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qp8)); - return xnn_status_uninitialized; - } - - if (batch_size == 0) { - convert_op->state = xnn_run_state_skip; - return xnn_status_success; - } - - convert_op->batch_size = batch_size; - - const struct xnn_gemm_config* gemm_config = - xnn_init_qp8_f32_qc4w_gemm_config(); - const uint32_t mr_packed = batch_size == 1 ? 1 : gemm_config->mr_packed; - const uint32_t kr = UINT32_C(1) << gemm_config->log2_kr; - const uint32_t sr = UINT32_C(1) << gemm_config->log2_sr; - - convert_op->context.f32_qp8_convert = (struct f32_qp8_convert_context){ - .m = batch_size, - .k = channels, - .mr = mr_packed, - .kr = kr, - .sr = sr, - .lhs_stride = input_stride * sizeof(float), - .packq_ukernel = (xnn_x8_packq_f32qp8_ukernel_fn) - convert_op->unary_elementwise_config->ukernel, - }; - - // TODO(b/340399245) - Ideally, this should parallelize along `batch` in - // groups of `mr`. - convert_op->compute[0].type = xnn_parallelization_type_1d; - convert_op->compute[0].task_1d = - (pthreadpool_task_1d_t)xnn_compute_f32_qp8_convert; - convert_op->compute[0].range[0] = batch_size; - - convert_op->state = xnn_run_state_needs_setup; - - return xnn_status_success; -} - -enum xnn_status xnn_reshape_convert_nc_f32_qs8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_f32_qs8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_INT8_T, - &convert_op->params.f32_qs8_cvt, sizeof(convert_op->params.f32_qs8_cvt), - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_f32_qu8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_f32_qu8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT8_T, - &convert_op->params.f32_qu8_cvt, sizeof(convert_op->params.f32_qu8_cvt), - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_qs8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qs8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_INT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_INT8_T, - &convert_op->params.qs8_cvt, sizeof(convert_op->params.qs8_cvt), - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_qs16_qs8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qs16_qs8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_INT16_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_INT8_T, - &convert_op->params.qs16_qs8_cvt, sizeof(convert_op->params.qs16_qs8_cvt), - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_qs8_f16( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return
reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qs8_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_INT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &convert_op->params.qs8_f16_cvt, sizeof(convert_op->params.qs8_f16_cvt), - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_qs8_f32( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qs8_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_INT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &convert_op->params.qs8_f32_cvt, sizeof(convert_op->params.qs8_f32_cvt), - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_qu8( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qu8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT8_T, - &convert_op->params.qu8_cvt, sizeof(convert_op->params.qu8_cvt), - threadpool); -} - -enum xnn_status xnn_reshape_convert_nc_qu8_f32( - xnn_operator_t convert_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qu8_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &convert_op->params.qu8_f32_cvt, sizeof(convert_op->params.qu8_f32_cvt), - threadpool); -} - -enum xnn_status xnn_reshape_copy_nc_x8( - xnn_operator_t copy_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - copy_op, xnn_operator_type_copy_nc_x8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT8_T, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_copy_nc_x16( - xnn_operator_t copy_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - copy_op, xnn_operator_type_copy_nc_x16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT16_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT16_T, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_copy_nc_x32( - xnn_operator_t copy_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - copy_op, xnn_operator_type_copy_nc_x32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT32_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT32_T, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_elu_nc_f16( - xnn_operator_t elu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - 
elu_op, xnn_operator_type_elu_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &elu_op->params.f16_elu, sizeof(elu_op->params.f16_elu), - threadpool); -} - -enum xnn_status xnn_reshape_elu_nc_f32( - xnn_operator_t elu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - elu_op, xnn_operator_type_elu_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &elu_op->params.f32_elu, sizeof(elu_op->params.f32_elu), - threadpool); -} - -enum xnn_status xnn_create_exp_nc_f32( - uint32_t flags, - xnn_operator_t* exp_op_out) -{ - const struct xnn_unary_elementwise_config* f32_exp_config = xnn_init_f32_exp_config(); - - struct xnn_f32_default_params params; - if XNN_LIKELY(f32_exp_config != NULL) { - if (f32_exp_config->init.f32_default != NULL) { - f32_exp_config->init.f32_default(&params); - } - } - - return create_unary_elementwise_nc( - flags, f32_exp_config, /*rminmax_config=*/NULL, - &params, sizeof(params), - xnn_operator_type_exp_nc_f32, exp_op_out); -} - -enum xnn_status xnn_reshape_exp_nc_f32( - xnn_operator_t exp_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - exp_op, xnn_operator_type_exp_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &exp_op->params.f32_default, sizeof(exp_op->params.f32_default), - threadpool); -} - -enum xnn_status xnn_setup_exp_nc_f32( - xnn_operator_t exp_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - exp_op, xnn_operator_type_exp_nc_f32, - input, output); -} - -enum xnn_status xnn_reshape_floor_nc_f16( - xnn_operator_t floor_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - floor_op, xnn_operator_type_floor_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_floor_nc_f32( - xnn_operator_t floor_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - floor_op, xnn_operator_type_floor_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &floor_op->params.f32_rnd, sizeof(floor_op->params.f32_rnd), - threadpool); -} - -enum xnn_status xnn_reshape_gelu_nc_f32(xnn_operator_t gelu_op, - size_t batch_size, size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) { - return reshape_unary_elementwise_nc( - gelu_op, xnn_operator_type_gelu_nc_f32, batch_size, channels, - input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, &gelu_op->params.f32_default, - sizeof(gelu_op->params.f32_default), threadpool); -}
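Throughout these reshape wrappers, element sizes are passed as log2 values (XNN_LOG2_SIZEOF_FLOAT = 2, XNN_LOG2_SIZEOF_HALF = 1, XNN_LOG2_SIZEOF_INT8_T = 0, and so on), which lets the shared reshape path derive byte sizes with shifts rather than multiplies, and lets converts mix element widths by passing different input and output values. An illustrative derivation:

    // Row sizes as the shared reshape path would derive them for an
    // f16 -> f32 convert with contiguous rows (channels == stride):
    size_t input_row_bytes  = channels << /*XNN_LOG2_SIZEOF_HALF=*/1;   // f16 in
    size_t output_row_bytes = channels << /*XNN_LOG2_SIZEOF_FLOAT=*/2;  // f32 out

For example, channels = 1024 gives 2048 input bytes and 4096 output bytes per row.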
- -enum xnn_status xnn_reshape_hardswish_nc_f16( - xnn_operator_t hardswish_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - hardswish_op, xnn_operator_type_hardswish_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &hardswish_op->params.f16_hswish, sizeof(hardswish_op->params.f16_hswish), - threadpool); -} - -enum xnn_status xnn_reshape_hardswish_nc_f32( - xnn_operator_t hardswish_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - hardswish_op, xnn_operator_type_hardswish_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &hardswish_op->params.f32_hswish, sizeof(hardswish_op->params.f32_hswish), - threadpool); -} - -enum xnn_status xnn_reshape_leaky_relu_nc_f16( - xnn_operator_t leaky_relu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - leaky_relu_op, xnn_operator_type_leaky_relu_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &leaky_relu_op->params.f16_lrelu, sizeof(leaky_relu_op->params.f16_lrelu), - threadpool); -} - -enum xnn_status xnn_reshape_leaky_relu_nc_f32( - xnn_operator_t leaky_relu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - leaky_relu_op, xnn_operator_type_leaky_relu_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &leaky_relu_op->params.f32_lrelu, sizeof(leaky_relu_op->params.f32_lrelu), - threadpool); -} - -enum xnn_status xnn_reshape_leaky_relu_nc_qs8( - xnn_operator_t leaky_relu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - leaky_relu_op, xnn_operator_type_leaky_relu_nc_qs8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_INT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_INT8_T, - &leaky_relu_op->params.qs8_lrelu, sizeof(leaky_relu_op->params.qs8_lrelu), - threadpool); -} - -enum xnn_status xnn_reshape_leaky_relu_nc_qu8( - xnn_operator_t leaky_relu_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - leaky_relu_op, xnn_operator_type_leaky_relu_nc_qu8, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT8_T, - &leaky_relu_op->params.qu8_lrelu, sizeof(leaky_relu_op->params.qu8_lrelu), - threadpool); -} - -enum xnn_status xnn_reshape_negate_nc_f16( - xnn_operator_t negate_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - negate_op, xnn_operator_type_negate_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &negate_op->params.f16_default,
sizeof(negate_op->params.f16_default), - threadpool); -} - -enum xnn_status xnn_reshape_negate_nc_f32( - xnn_operator_t negate_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - negate_op, xnn_operator_type_negate_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &negate_op->params.f32_default, sizeof(negate_op->params.f32_default), - threadpool); -} - -enum xnn_status xnn_reshape_reciprocal_square_root_nc_f16( - xnn_operator_t rsqrt_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - rsqrt_op, xnn_operator_type_reciprocal_square_root_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_reciprocal_square_root_nc_f32( - xnn_operator_t rsqrt_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - rsqrt_op, xnn_operator_type_reciprocal_square_root_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &rsqrt_op->params.f32_rsqrt, sizeof(rsqrt_op->params.f32_rsqrt), - threadpool); -} - -enum xnn_status xnn_reshape_sigmoid_nc_f16( - xnn_operator_t sigmoid_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - sigmoid_op, xnn_operator_type_sigmoid_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &sigmoid_op->params.f16_sigmoid, sizeof(sigmoid_op->params.f16_sigmoid), - threadpool); -} - -enum xnn_status xnn_reshape_sigmoid_nc_f32( - xnn_operator_t sigmoid_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - sigmoid_op, xnn_operator_type_sigmoid_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &sigmoid_op->params.f32_sigmoid, sizeof(sigmoid_op->params.f32_sigmoid), - threadpool); -} - -enum xnn_status xnn_reshape_square_nc_f16( - xnn_operator_t square_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - square_op, xnn_operator_type_square_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_square_nc_f32( - xnn_operator_t square_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - square_op, xnn_operator_type_square_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, 
- &square_op->params.f32_default, sizeof(square_op->params.f32_default), - threadpool); -} - -enum xnn_status xnn_reshape_square_root_nc_f16( - xnn_operator_t sqrt_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - sqrt_op, xnn_operator_type_square_root_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_square_root_nc_f32( - xnn_operator_t sqrt_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - sqrt_op, xnn_operator_type_square_root_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &sqrt_op->params.f32_sqrt, sizeof(sqrt_op->params.f32_sqrt), - threadpool); -} - -enum xnn_status xnn_reshape_tanh_nc_f16( - xnn_operator_t tanh_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - tanh_op, xnn_operator_type_tanh_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &tanh_op->params.f16_tanh, sizeof(tanh_op->params.f16_tanh), - threadpool); -} - -enum xnn_status xnn_reshape_tanh_nc_f32( - xnn_operator_t tanh_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - tanh_op, xnn_operator_type_tanh_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &tanh_op->params.f32_tanh, sizeof(tanh_op->params.f32_tanh), - threadpool); -} - -enum xnn_status xnn_reshape_truncation_nc_f16( - xnn_operator_t truncation_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - truncation_op, xnn_operator_type_truncation_nc_f16, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - /*params=*/NULL, /*params_size=*/0, - threadpool); -} - -enum xnn_status xnn_reshape_truncation_nc_f32( - xnn_operator_t truncation_op, - size_t batch_size, - size_t channels, - size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) -{ - return reshape_unary_elementwise_nc( - truncation_op, xnn_operator_type_truncation_nc_f32, - batch_size, - channels, input_stride, output_stride, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &truncation_op->params.f32_rnd, sizeof(truncation_op->params.f32_rnd), - threadpool); -} - -enum xnn_status xnn_setup_abs_nc_f16( - xnn_operator_t abs_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - abs_op, xnn_operator_type_abs_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_abs_nc_f32( - xnn_operator_t abs_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - abs_op, xnn_operator_type_abs_nc_f32, - input, output); -} - -enum xnn_status 
xnn_setup_bankers_rounding_nc_f16( - xnn_operator_t rounding_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - rounding_op, xnn_operator_type_bankers_rounding_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_bankers_rounding_nc_f32( - xnn_operator_t rounding_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - rounding_op, xnn_operator_type_bankers_rounding_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_ceiling_nc_f16( - xnn_operator_t ceiling_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - ceiling_op, xnn_operator_type_ceiling_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_ceiling_nc_f32( - xnn_operator_t ceiling_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - ceiling_op, xnn_operator_type_ceiling_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_clamp_nc_f16( - xnn_operator_t clamp_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - clamp_op, xnn_operator_type_clamp_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_clamp_nc_f32( - xnn_operator_t clamp_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - clamp_op, xnn_operator_type_clamp_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_clamp_nc_s8( - xnn_operator_t clamp_op, - const int8_t* input, - int8_t* output) -{ - return setup_unary_elementwise_nc( - clamp_op, xnn_operator_type_clamp_nc_s8, - input, output); -} - -enum xnn_status xnn_setup_clamp_nc_u8( - xnn_operator_t clamp_op, - const uint8_t* input, - uint8_t* output) -{ - return setup_unary_elementwise_nc( - clamp_op, xnn_operator_type_clamp_nc_u8, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_f16_f32( - xnn_operator_t convert_op, - const void* input, - float* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_f16_f32, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_f32_f16( - xnn_operator_t convert_op, - const float* input, - void* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_f32_f16, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_f16_qd8( - xnn_operator_t convert_op, - const void* input, - int8_t* output, - struct xnn_quantization_params* quantization_params) -{ - if (convert_op->type != xnn_operator_type_convert_nc_f16_qd8) { - xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f16_qd8), - xnn_operator_type_to_string(convert_op->type)); - return xnn_status_invalid_parameter; - } - - switch (convert_op->state) { - case xnn_run_state_skip: - return xnn_status_success; - case xnn_run_state_invalid: - xnn_log_error( - "failed to setup %s operator: operator has not been reshaped yet", - xnn_operator_type_to_string(convert_op->type)); - return xnn_status_invalid_state; - case xnn_run_state_needs_setup: - // Operator has been reshaped, but not setup, continue with setup. - case xnn_run_state_ready: - // Operator has been reshaped, and we are setting up with different pointers. 
- break; - } - - convert_op->context.f16_qd8_convert.x = input; - convert_op->context.f16_qd8_convert.y = output; - convert_op->context.f16_qd8_convert.quantization_params = (struct xnn_qd8_quantization_params*) quantization_params; - convert_op->state = xnn_run_state_ready; - - return xnn_status_success; -} - -enum xnn_status xnn_setup_convert_nc_f32_qd8( - xnn_operator_t convert_op, - const float* input, - int8_t* output, - struct xnn_quantization_params* quantization_params) -{ - if (convert_op->type != xnn_operator_type_convert_nc_f32_qd8) { - xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qd8), - xnn_operator_type_to_string(convert_op->type)); - return xnn_status_invalid_parameter; - } - - switch (convert_op->state) { - case xnn_run_state_skip: - return xnn_status_success; - case xnn_run_state_invalid: - xnn_log_error( - "failed to setup %s operator: operator has not been reshaped yet", - xnn_operator_type_to_string(convert_op->type)); - return xnn_status_invalid_state; - case xnn_run_state_needs_setup: - // Operator has been reshaped, but not setup, continue with setup. - case xnn_run_state_ready: - // Operator has been reshaped, and we are setting up with different pointers. - break; - } - - convert_op->context.f32_qd8_convert.x = input; - convert_op->context.f32_qd8_convert.y = output; - convert_op->context.f32_qd8_convert.quantization_params = (struct xnn_qd8_quantization_params*) quantization_params; - convert_op->state = xnn_run_state_ready; - - return xnn_status_success; -} - -enum xnn_status xnn_setup_convert_nc_f32_qp8(xnn_operator_t convert_op, - const float* input, - int8_t* output) { - enum xnn_status status = - check_op_type(convert_op, xnn_operator_type_convert_nc_f32_qp8); - if (status != xnn_status_success) { - return status; - } - - switch (convert_op->state) { - case xnn_run_state_skip: - return xnn_status_success; - case xnn_run_state_invalid: - xnn_log_error( - "failed to setup %s operator: operator has not been reshaped yet", - xnn_operator_type_to_string(convert_op->type)); - return xnn_status_invalid_state; - case xnn_run_state_needs_setup: - // Operator has been reshaped, but not setup, continue with setup. - case xnn_run_state_ready: - // Operator has been reshaped, and we are setting up with different - // pointers.
- break; - } - - convert_op->context.f32_qp8_convert.lhs = input; - convert_op->context.f32_qp8_convert.lhs_packed = output; - convert_op->state = xnn_run_state_ready; - - return xnn_status_success; -} - -enum xnn_status xnn_setup_convert_nc_f32_qs8( - xnn_operator_t convert_op, - const float* input, - int8_t* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_f32_qs8, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_f32_qu8( - xnn_operator_t convert_op, - const float* input, - uint8_t* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_f32_qu8, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_qs8( - xnn_operator_t convert_op, - const int8_t* input, - int8_t* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qs8, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_qs16_qs8( - xnn_operator_t convert_op, - const int16_t* input, - int8_t* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qs16_qs8, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_qs8_f16( - xnn_operator_t convert_op, - const int8_t* input, - void* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qs8_f16, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_qs8_f32( - xnn_operator_t convert_op, - const int8_t* input, - float* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qs8_f32, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_qu8( - xnn_operator_t convert_op, - const uint8_t* input, - uint8_t* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qu8, - input, output); -} - -enum xnn_status xnn_setup_convert_nc_qu8_f32( - xnn_operator_t convert_op, - const uint8_t* input, - float* output) -{ - return setup_unary_elementwise_nc( - convert_op, xnn_operator_type_convert_nc_qu8_f32, - input, output); -} - -enum xnn_status xnn_setup_copy_nc_x8( - xnn_operator_t copy_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - copy_op, xnn_operator_type_copy_nc_x8, - input, output); -} - -enum xnn_status xnn_setup_copy_nc_x16( - xnn_operator_t copy_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - copy_op, xnn_operator_type_copy_nc_x16, - input, output); -} - -enum xnn_status xnn_setup_copy_nc_x32( - xnn_operator_t copy_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - copy_op, xnn_operator_type_copy_nc_x32, - input, output); -} - -enum xnn_status xnn_setup_elu_nc_f16( - xnn_operator_t elu_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - elu_op, xnn_operator_type_elu_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_elu_nc_f32( - xnn_operator_t elu_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - elu_op, xnn_operator_type_elu_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_floor_nc_f16( - xnn_operator_t floor_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - floor_op, xnn_operator_type_floor_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_floor_nc_f32( - xnn_operator_t floor_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - floor_op, xnn_operator_type_floor_nc_f32, - input, output); -} - -enum xnn_status 
xnn_setup_gelu_nc_f32(xnn_operator_t gelu_op, - const float* input, float* output) { - return setup_unary_elementwise_nc(gelu_op, xnn_operator_type_gelu_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_hardswish_nc_f16( - xnn_operator_t hardswish_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - hardswish_op, xnn_operator_type_hardswish_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_hardswish_nc_f32( - xnn_operator_t hardswish_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - hardswish_op, xnn_operator_type_hardswish_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_leaky_relu_nc_f16( - xnn_operator_t leaky_relu_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - leaky_relu_op, xnn_operator_type_leaky_relu_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_leaky_relu_nc_f32( - xnn_operator_t leaky_relu_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - leaky_relu_op, xnn_operator_type_leaky_relu_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_leaky_relu_nc_qs8( - xnn_operator_t leaky_relu_op, - const int8_t* input, - int8_t* output) -{ - return setup_unary_elementwise_nc( - leaky_relu_op, xnn_operator_type_leaky_relu_nc_qs8, - input, output); -} - -enum xnn_status xnn_setup_leaky_relu_nc_qu8( - xnn_operator_t leaky_relu_op, - const uint8_t* input, - uint8_t* output) -{ - return setup_unary_elementwise_nc( - leaky_relu_op, xnn_operator_type_leaky_relu_nc_qu8, - input, output); -} - -enum xnn_status xnn_setup_negate_nc_f16( - xnn_operator_t negate_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - negate_op, xnn_operator_type_negate_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_negate_nc_f32( - xnn_operator_t negate_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - negate_op, xnn_operator_type_negate_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_reciprocal_square_root_nc_f16( - xnn_operator_t rsqrt_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - rsqrt_op, xnn_operator_type_reciprocal_square_root_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_reciprocal_square_root_nc_f32( - xnn_operator_t rsqrt_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - rsqrt_op, xnn_operator_type_reciprocal_square_root_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_sigmoid_nc_f16( - xnn_operator_t sigmoid_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - sigmoid_op, xnn_operator_type_sigmoid_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_sigmoid_nc_f32( - xnn_operator_t sigmoid_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - sigmoid_op, xnn_operator_type_sigmoid_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_square_nc_f16( - xnn_operator_t square_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - square_op, xnn_operator_type_square_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_square_nc_f32( - xnn_operator_t square_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - square_op, xnn_operator_type_square_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_square_root_nc_f16( - xnn_operator_t sqrt_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - sqrt_op, 
xnn_operator_type_square_root_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_square_root_nc_f32( - xnn_operator_t sqrt_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - sqrt_op, xnn_operator_type_square_root_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_tanh_nc_f16( - xnn_operator_t tanh_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - tanh_op, xnn_operator_type_tanh_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_tanh_nc_f32( - xnn_operator_t tanh_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - tanh_op, xnn_operator_type_tanh_nc_f32, - input, output); -} - -enum xnn_status xnn_setup_truncation_nc_f16( - xnn_operator_t truncation_op, - const void* input, - void* output) -{ - return setup_unary_elementwise_nc( - truncation_op, xnn_operator_type_truncation_nc_f16, - input, output); -} - -enum xnn_status xnn_setup_truncation_nc_f32( - xnn_operator_t truncation_op, - const float* input, - float* output) -{ - return setup_unary_elementwise_nc( - truncation_op, xnn_operator_type_truncation_nc_f32, - input, output); -} - -static enum xnn_status run_unary_elementwise_nc( - enum xnn_operator_type operator_type, - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const void* input, - void* output, - const struct xnn_unary_elementwise_config* unary_elementwise_config, - const void* params, - size_t params_size, - uint32_t log2_input_size, - uint32_t log2_output_size, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (unary_elementwise_config == NULL) { - xnn_log_error( - "failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(operator_type)); - return xnn_status_unsupported_hardware; - } - - if (channels == 0) { - xnn_log_error( - "failed to run %s operator with %zu channels: number of channels must be non-zero", - xnn_operator_type_to_string(operator_type), channels); - return xnn_status_invalid_parameter; - } - - if (input_stride < channels) { - xnn_log_error( - "failed to run %s operator with input element stride of %zu: " - "stride must be at least as large as the number of channels (%zu)", - xnn_operator_type_to_string(operator_type), input_stride, channels); - return xnn_status_invalid_parameter; - } - - if (output_stride < channels) { - xnn_log_error( - "failed to run %s operator with output element stride of %zu: " - "stride must be at least as large as the number of channels (%zu)", - xnn_operator_type_to_string(operator_type), output_stride, channels); - return xnn_status_invalid_parameter; - } - - struct xnn_operator unary_elementwise_op; - memset(&unary_elementwise_op, 0, sizeof(unary_elementwise_op)); - - init_unary_elementwise_nc( - flags, /*params=*/NULL, /*params_size=*/0, - operator_type, unary_elementwise_config, /*rminmax_config=*/NULL, &unary_elementwise_op); - - enum xnn_status status = reshape_unary_elementwise_nc( - &unary_elementwise_op, operator_type, - batch_size, channels, input_stride, output_stride, - log2_input_size, log2_output_size, - params, params_size, - threadpool); - if (status != xnn_status_success){ - return status; - } - - status = setup_unary_elementwise_nc(&unary_elementwise_op, operator_type, input, output); - if (status != xnn_status_success){ - return status; - } - - return xnn_run_operator(&unary_elementwise_op, threadpool); -} - -enum xnn_status xnn_run_abs_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - 
size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_abs_config = xnn_init_f32_abs_config(); - - struct xnn_f32_default_params params; - if XNN_LIKELY(f32_abs_config != NULL && f32_abs_config->init.f32_default != NULL) { - f32_abs_config->init.f32_default(¶ms); - } - - return run_unary_elementwise_nc( - xnn_operator_type_abs_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_abs_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_bankers_rounding_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_rndne_config = xnn_init_f32_rndne_config(); - - struct xnn_f32_rnd_params params; - if XNN_LIKELY(f32_rndne_config != NULL && f32_rndne_config->init.f32_rnd != NULL) { - f32_rndne_config->init.f32_rnd(¶ms); - } - - return run_unary_elementwise_nc( - xnn_operator_type_bankers_rounding_nc_f32, - channels, - input_stride, output_stride, - batch_size, - input, output, - f32_rndne_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_ceiling_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_rndu_config = xnn_init_f32_rndu_config(); - - struct xnn_f32_rnd_params params; - if XNN_LIKELY(f32_rndu_config != NULL && f32_rndu_config->init.f32_rnd != NULL) { - f32_rndu_config->init.f32_rnd(¶ms); - } - - return run_unary_elementwise_nc( - xnn_operator_type_ceiling_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_rndu_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_clamp_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - float output_min, - float output_max, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (isnan(output_min)) { - xnn_log_error( - "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32)); - return xnn_status_invalid_parameter; - } - - if (isnan(output_max)) { - xnn_log_error( - "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32)); - return xnn_status_invalid_parameter; - } - - if (output_min > output_max) { - xnn_log_error( - "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be less than or equal to upper bound", - xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), output_min, output_max); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* f32_clamp_config = xnn_init_f32_clamp_config(); - const struct xnn_unary_elementwise_config* f32_relu_config = xnn_init_f32_relu_config(); - - const struct xnn_unary_elementwise_config* config = f32_clamp_config; - const 
bool relu_activation = (output_max == INFINITY) && (output_min == 0.0f); - if (relu_activation && f32_relu_config->ukernel != NULL) { - config = f32_relu_config; - } - - union xnn_f32_minmax_params params; - if XNN_LIKELY(f32_clamp_config != NULL) { - assert(f32_clamp_config->init.f32_minmax != NULL); - f32_clamp_config->init.f32_minmax(¶ms, output_min, output_max); - } - - return run_unary_elementwise_nc( - xnn_operator_type_clamp_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_convert_nc_f16_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const void* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f16_to_f32_cvt_config = xnn_init_f16_to_f32_cvt_config(); - - return run_unary_elementwise_nc( - xnn_operator_type_convert_nc_f16_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f16_to_f32_cvt_config, /*params=*/NULL, /*params_size=*/0, - /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_convert_nc_f32_f16( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - void* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_to_f16_cvt_config = xnn_init_f32_to_f16_cvt_config(); - - return run_unary_elementwise_nc( - xnn_operator_type_convert_nc_f32_f16, - channels, input_stride, output_stride, batch_size, - input, output, - f32_to_f16_cvt_config, /*params=*/NULL, /*params_size=*/0, - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - flags, - threadpool); -} - -enum xnn_status xnn_run_convert_nc_f32_qs8( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - int8_t* output, - float output_scale, - int8_t output_zero_point, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* f32_to_qs8_cvt_config = xnn_init_f32_to_qs8_cvt_config(); - - struct xnn_f32_qs8_cvt_params params; - if XNN_LIKELY(f32_to_qs8_cvt_config != NULL) { - assert(f32_to_qs8_cvt_config->init.f32_qs8_cvt != NULL); - f32_to_qs8_cvt_config->init.f32_qs8_cvt(¶ms, 1.0f / output_scale, output_zero_point); - } - - return run_unary_elementwise_nc( - xnn_operator_type_convert_nc_f32_qs8, - channels, input_stride, output_stride, batch_size, - input, output, - f32_to_qs8_cvt_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_INT8_T, - flags, - threadpool); -} - -enum xnn_status xnn_run_convert_nc_f32_qu8( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - uint8_t* output, - float output_scale, - uint8_t output_zero_point, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (output_scale <= 0.0f || !isnormal(output_scale)) { - 
xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_qu8), output_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* f32_to_qu8_cvt_config = xnn_init_f32_to_qu8_cvt_config(); - - struct xnn_f32_qu8_cvt_params params; - if XNN_LIKELY(f32_to_qu8_cvt_config != NULL) { - assert(f32_to_qu8_cvt_config->init.f32_qu8_cvt != NULL); - f32_to_qu8_cvt_config->init.f32_qu8_cvt(¶ms, 1.0f / output_scale, output_zero_point); - } - - return run_unary_elementwise_nc( - xnn_operator_type_convert_nc_f32_qu8, - channels, input_stride, output_stride, batch_size, - input, output, - f32_to_qu8_cvt_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT8_T, - flags, - threadpool); -} - -enum xnn_status xnn_run_convert_nc_qs8_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const int8_t* input, - float* output, - float input_scale, - int8_t input_zero_point, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs8_f32), input_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* qs8_to_f32_cvt_config = xnn_init_qs8_to_f32_cvt_config(); - - struct xnn_qs8_f32_cvt_params params; - if XNN_LIKELY(qs8_to_f32_cvt_config != NULL) { - assert(qs8_to_f32_cvt_config->init.qs8_f32_cvt != NULL); - qs8_to_f32_cvt_config->init.qs8_f32_cvt(¶ms, input_scale, input_zero_point); - } - - return run_unary_elementwise_nc( - xnn_operator_type_convert_nc_qs8_f32, - channels, input_stride, output_stride, batch_size, - input, output, - qs8_to_f32_cvt_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_INT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_convert_nc_qs16_qs8( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const int16_t* input, - int8_t* output, - float input_scale, - float output_scale, - int8_t output_zero_point, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs16_qs8), input_scale); - return xnn_status_invalid_parameter; - } - - if (output_scale <= 0.0f || !isnormal(output_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g output scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs16_qs8), output_scale); - return xnn_status_invalid_parameter; - } - - const float input_output_scale = input_scale / output_scale; - if (input_output_scale < 0x1.0p-16f || input_output_scale > 0x1.0p+8f) { - xnn_log_error( - "failed to create %s operator with %.7g input-to-output scale ratio: scale ratio must be in [2**-16, 2**8] range", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qs16_qs8), input_output_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* 
qs16_to_qs8_cvt_config = xnn_init_qs16_to_qs8_cvt_config(); - assert(qs16_to_qs8_cvt_config != NULL); - - struct xnn_qs16_qs8_cvt_params params; - assert(qs16_to_qs8_cvt_config->init.qs16_qs8_cvt != NULL); - qs16_to_qs8_cvt_config->init.qs16_qs8_cvt(¶ms, input_output_scale, output_zero_point); - - return run_unary_elementwise_nc( - xnn_operator_type_convert_nc_qs16_qs8, - channels, input_stride, output_stride, batch_size, - input, output, - qs16_to_qs8_cvt_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_INT16_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_INT8_T, - flags, - threadpool); -} - -enum xnn_status xnn_run_convert_nc_qu8_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const uint8_t* input, - float* output, - float input_scale, - uint8_t input_zero_point, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (input_scale <= 0.0f || !isnormal(input_scale)) { - xnn_log_error( - "failed to create %s operator with %.7g input scale parameter: scale must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_convert_nc_qu8_f32), input_scale); - return xnn_status_invalid_parameter; - } - - const struct xnn_unary_elementwise_config* qu8_to_f32_cvt_config = xnn_init_qu8_to_f32_cvt_config(); - - struct xnn_qu8_f32_cvt_params params; - if XNN_LIKELY(qu8_to_f32_cvt_config != NULL) { - assert(qu8_to_f32_cvt_config->init.qu8_f32_cvt != NULL); - qu8_to_f32_cvt_config->init.qu8_f32_cvt(¶ms, input_scale, input_zero_point); - } - - return run_unary_elementwise_nc( - xnn_operator_type_convert_nc_qu8_f32, - channels, input_stride, output_stride, batch_size, - input, output, - qu8_to_f32_cvt_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT8_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_copy_nc_x32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const uint32_t* input, - uint32_t* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - return run_unary_elementwise_nc( - xnn_operator_type_copy_nc_x32, - channels, input_stride, output_stride, batch_size, - input, output, - xnn_init_xx_copy_config(), NULL, 0, - /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT32_T, - /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT32_T, - flags, - threadpool); -} - -enum xnn_status xnn_run_elu_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - float alpha, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (alpha <= 0.0f || !isnormal(alpha)) { - xnn_log_error( - "failed to create %s operator with %.7g alpha parameter: alpha must be finite, normalized, and positive", - xnn_operator_type_to_string(xnn_operator_type_elu_nc_f32), alpha); - return xnn_status_invalid_parameter; +enum xnn_status xnn_setup_convert_nc_f32_qp8(xnn_operator_t convert_op, + const float* input, + int8_t* output) { + xnn_status_t status = + check_op_type(convert_op, xnn_operator_type_convert_nc_f32_qp8); + if (status != xnn_status_success) { + return status; } - const struct xnn_unary_elementwise_config* f32_elu_config = xnn_init_f32_elu_config(); - - struct xnn_f32_elu_params params; - if XNN_LIKELY(f32_elu_config != NULL) { - assert(f32_elu_config->init.f32_elu != NULL); - f32_elu_config->init.f32_elu(¶ms, /*prescale=*/1.0f, alpha, /*beta=*/1.0f); + switch (convert_op->state) { + case xnn_run_state_skip: + return xnn_status_success; + case xnn_run_state_invalid: + 
xnn_log_error( + "failed to setup %s operator: operator has not been reshaped yet", + xnn_operator_type_to_string(convert_op->type)); + return xnn_status_invalid_state; + case xnn_run_state_needs_setup: + // Operator has been reshaped, but not setup, continue with setup. + case xnn_run_state_ready: + // Operator has been reshaped, and we are setting up with different + // pointers. + break; } - return run_unary_elementwise_nc( - xnn_operator_type_elu_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_elu_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); + convert_op->context.f32_qp8_convert.lhs = input; + convert_op->context.f32_qp8_convert.lhs_packed = output; + convert_op->state = xnn_run_state_ready; + + return xnn_status_success; } -enum xnn_status xnn_run_floor_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) +enum xnn_status xnn_setup_copy_nc_x8( + xnn_operator_t copy_op, + const void* input, + void* output) { - const struct xnn_unary_elementwise_config* f32_rndd_config = xnn_init_f32_rndd_config(); - - struct xnn_f32_rnd_params params; - if XNN_LIKELY(f32_rndd_config != NULL && f32_rndd_config->init.f32_rnd != NULL) { - f32_rndd_config->init.f32_rnd(¶ms); - } - - return run_unary_elementwise_nc( - xnn_operator_type_floor_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_rndd_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); + return setup_unary_elementwise_nc( + copy_op, xnn_operator_type_copy_nc_x8, + input, output); } -enum xnn_status xnn_run_gelu_nc_f32(size_t channels, size_t input_stride, - size_t output_stride, size_t batch_size, - const float* input, float* output, - uint32_t flags, pthreadpool_t threadpool) { - const struct xnn_unary_elementwise_config* f32_gelu_config = - xnn_init_f32_gelu_config(); - - struct xnn_f32_default_params params; - if XNN_LIKELY (f32_gelu_config != NULL) { - if (f32_gelu_config->init.f32_default != NULL) { - f32_gelu_config->init.f32_default(¶ms); - } - } +enum xnn_status xnn_setup_copy_nc_x16( + xnn_operator_t copy_op, + const void* input, + void* output) +{ + return setup_unary_elementwise_nc( + copy_op, xnn_operator_type_copy_nc_x16, + input, output); +} - return run_unary_elementwise_nc( - xnn_operator_type_gelu_nc_f32, channels, input_stride, output_stride, - batch_size, input, output, f32_gelu_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, flags, threadpool); +enum xnn_status xnn_setup_copy_nc_x32( + xnn_operator_t copy_op, + const void* input, + void* output) +{ + return setup_unary_elementwise_nc( + copy_op, xnn_operator_type_copy_nc_x32, + input, output); } -enum xnn_status xnn_run_hardswish_nc_f32( +static enum xnn_status run_unary_elementwise_nc( + enum xnn_operator_type operator_type, size_t channels, size_t input_stride, size_t output_stride, size_t batch_size, - const float* input, - float* output, + const void* input, + void* output, + const struct xnn_unary_elementwise_config* unary_elementwise_config, + const void* params, + size_t params_size, + uint32_t log2_input_size, + uint32_t log2_output_size, uint32_t flags, pthreadpool_t threadpool) { - const struct 
xnn_unary_elementwise_config* f32_hswish_config = xnn_init_f32_hswish_config(); - - struct xnn_f32_hswish_params params; - if XNN_LIKELY(f32_hswish_config != NULL && f32_hswish_config->init.f32_hswish != NULL) { - f32_hswish_config->init.f32_hswish(¶ms); + if (unary_elementwise_config == NULL) { + xnn_log_error( + "failed to create %s operator: unsupported hardware configuration", + xnn_operator_type_to_string(operator_type)); + return xnn_status_unsupported_hardware; } - return run_unary_elementwise_nc( - xnn_operator_type_hardswish_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_hswish_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_leaky_relu_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - float negative_slope, - uint32_t flags, - pthreadpool_t threadpool) -{ - if (!isfinite(negative_slope)) { + if (channels == 0) { xnn_log_error( - "failed to create %s operator with %f negative slope: finite number expected", - xnn_operator_type_to_string(xnn_operator_type_leaky_relu_nc_f32), - negative_slope); + "failed to run %s operator with %zu channels: number of channels must be non-zero", + xnn_operator_type_to_string(operator_type), channels); return xnn_status_invalid_parameter; } - const struct xnn_unary_elementwise_config* f32_lrelu_config = xnn_init_f32_lrelu_config(); - - struct xnn_f32_lrelu_params params; - if XNN_LIKELY(f32_lrelu_config != NULL) { - assert(f32_lrelu_config->init.f32_lrelu != NULL); - f32_lrelu_config->init.f32_lrelu(¶ms, negative_slope); - } - - return run_unary_elementwise_nc( - xnn_operator_type_leaky_relu_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_lrelu_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_negate_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_neg_config = xnn_init_f32_neg_config(); - - struct xnn_f32_default_params params; - if XNN_LIKELY(f32_neg_config != NULL && f32_neg_config->init.f32_default != NULL) { - f32_neg_config->init.f32_default(¶ms); - } - - return run_unary_elementwise_nc( - xnn_operator_type_negate_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_neg_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_reciprocal_square_root_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_rsqrt_config = xnn_init_f32_rsqrt_config(); - - struct xnn_f32_rsqrt_params params; - if XNN_LIKELY(f32_rsqrt_config != NULL && f32_rsqrt_config->init.f32_rsqrt != NULL) { - f32_rsqrt_config->init.f32_rsqrt(¶ms); + if (input_stride < channels) { + xnn_log_error( + "failed to run %s operator with input element stride of %zu: " + "stride must be at least as large as the number of channels (%zu)", + xnn_operator_type_to_string(operator_type), 
input_stride, channels); + return xnn_status_invalid_parameter; } - return run_unary_elementwise_nc( - xnn_operator_type_reciprocal_square_root_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_rsqrt_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_sigmoid_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_sigmoid_config = xnn_init_f32_sigmoid_config(); - - struct xnn_f32_sigmoid_params params; - if XNN_LIKELY(f32_sigmoid_config != NULL && f32_sigmoid_config->init.f32_sigmoid != NULL) { - f32_sigmoid_config->init.f32_sigmoid(¶ms); + if (output_stride < channels) { + xnn_log_error( + "failed to run %s operator with output element stride of %zu: " + "stride must be at least as large as the number of channels (%zu)", + xnn_operator_type_to_string(operator_type), output_stride, channels); + return xnn_status_invalid_parameter; } - return run_unary_elementwise_nc( - xnn_operator_type_sigmoid_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_sigmoid_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum xnn_status xnn_run_square_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_sqr_config = xnn_init_f32_sqr_config(); + struct xnn_operator unary_elementwise_op; + memset(&unary_elementwise_op, 0, sizeof(unary_elementwise_op)); - struct xnn_f32_default_params params; - if XNN_LIKELY(f32_sqr_config != NULL && f32_sqr_config->init.f32_default != NULL) { - f32_sqr_config->init.f32_default(¶ms); - } + init_unary_elementwise_nc( + flags, /*params=*/NULL, /*params_size=*/0, + operator_type, unary_elementwise_config, &unary_elementwise_op); - return run_unary_elementwise_nc( - xnn_operator_type_square_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_sqr_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, + enum xnn_status status = reshape_unary_elementwise_nc( + &unary_elementwise_op, operator_type, + batch_size, channels, input_stride, output_stride, + log2_input_size, log2_output_size, + params, params_size, threadpool); -} - -enum xnn_status xnn_run_square_root_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_sqrt_config = xnn_init_f32_sqrt_config(); - - struct xnn_f32_sqrt_params params; - if XNN_LIKELY(f32_sqrt_config != NULL && f32_sqrt_config->init.f32_sqrt != NULL) { - f32_sqrt_config->init.f32_sqrt(¶ms); + if (status != xnn_status_success){ + return status; } - return run_unary_elementwise_nc( - xnn_operator_type_square_root_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_sqrt_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); -} - -enum 
xnn_status xnn_run_tanh_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) -{ - const struct xnn_unary_elementwise_config* f32_tanh_config = xnn_init_f32_tanh_config(); - - union xnn_f32_tanh_params params; - if XNN_LIKELY(f32_tanh_config != NULL && f32_tanh_config->init.f32_tanh != NULL) { - f32_tanh_config->init.f32_tanh(¶ms); + status = setup_unary_elementwise_nc(&unary_elementwise_op, operator_type, input, output); + if (status != xnn_status_success){ + return status; } - return run_unary_elementwise_nc( - xnn_operator_type_tanh_nc_f32, - channels, input_stride, output_stride, batch_size, - input, output, - f32_tanh_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - flags, - threadpool); + return xnn_run_operator(&unary_elementwise_op, threadpool); } -enum xnn_status xnn_run_truncation_nc_f32( - size_t channels, - size_t input_stride, - size_t output_stride, - size_t batch_size, - const float* input, - float* output, - uint32_t flags, - pthreadpool_t threadpool) +enum xnn_status xnn_run_copy_nc_x32( + size_t channels, + size_t input_stride, + size_t output_stride, + size_t batch_size, + const uint32_t* input, + uint32_t* output, + uint32_t flags, + pthreadpool_t threadpool) { - const struct xnn_unary_elementwise_config* f32_rndz_config = xnn_init_f32_rndz_config(); - - struct xnn_f32_rnd_params params; - if XNN_LIKELY(f32_rndz_config != NULL && f32_rndz_config->init.f32_rnd != NULL) { - f32_rndz_config->init.f32_rnd(¶ms); - } - return run_unary_elementwise_nc( - xnn_operator_type_truncation_nc_f32, + xnn_operator_type_copy_nc_x32, channels, input_stride, output_stride, batch_size, input, output, - f32_rndz_config, ¶ms, sizeof(params), - /*log2_input_size=*/XNN_LOG2_SIZEOF_FLOAT, - /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, + xnn_init_xx_copy_config(), NULL, 0, + /*log2_input_size=*/XNN_LOG2_SIZEOF_UINT32_T, + /*log2_output_size=*/XNN_LOG2_SIZEOF_UINT32_T, flags, threadpool); } diff --git a/src/s8-vclamp/s8-vclamp.h b/src/s8-vclamp/s8-vclamp.h index a7aa50b642fd..6f7b83c46040 100644 --- a/src/s8-vclamp/s8-vclamp.h +++ b/src/s8-vclamp/s8-vclamp.h @@ -17,31 +17,31 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_s8_vclamp_ukernel__neon_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_s8_vclamp_ukernel__neon_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__sse2_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_s8_vclamp_ukernel__sse41_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s8_vclamp_ukernel__avx2_u128, 128, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__sse2_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_s8_vclamp_ukernel__sse41_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, 
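Note on the replacement API: every per-operator xnn_run_*_nc_* entry point deleted above funnels into the single parameterized runner that later hunks of this patch call directly. A minimal sketch of the new call follows; the argument roles are inferred from the xnn_subgraph_rewrite_for_fp16 call site below (the positional scalars there appear to be flags, batch size, channels, and the input/output strides), and the buffer names and sizes are purely illustrative:

    // Compute |x| over 128 contiguous f32 elements in one call. The operator
    // enum and the two datatypes select the kernel; abs takes no extra
    // parameters, so the params/quantization pointers stay NULL.
    float input[128], output[128];
    enum xnn_status status = xnn_run_unary_elementwise_nc(
        xnn_unary_abs, xnn_datatype_fp32, xnn_datatype_fp32,
        /*params=*/NULL, /*input_quantization=*/NULL, /*output_quantization=*/NULL,
        /*flags=*/0, /*batch_size=*/128, /*channels=*/1,
        /*input_stride=*/1, /*output_stride=*/1,
        /*threadpool=*/NULL, input, output);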
diff --git a/src/s8-vclamp/s8-vclamp.h b/src/s8-vclamp/s8-vclamp.h
index a7aa50b642fd..6f7b83c46040 100644
--- a/src/s8-vclamp/s8-vclamp.h
+++ b/src/s8-vclamp/s8-vclamp.h
@@ -17,31 +17,31 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_s8_vclamp_ukernel__neon_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_s8_vclamp_ukernel__neon_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__sse2_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_s8_vclamp_ukernel__sse41_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s8_vclamp_ukernel__avx2_u128, 128, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__sse2_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_sse4_1, xnn_s8_vclamp_ukernel__sse41_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_s8_vclamp_ukernel__avx2_u128, 128, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_s8_vclamp_ukernel__avx512skx_u256, 256, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_s8_vclamp_ukernel__avx512skx_u256, 256, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
 #endif  // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64)
 
 #if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u4v, 4, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
-XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u8v, 8, true, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u4v, 4, true, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_s8_vclamp_ukernel__rvv_u8v, 8, true, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
 #endif  // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV
 
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__wasmsimd_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__wasmsimd_u64, 64, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
-XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__scalar_u4, 4, false, int8_t, struct xnn_s8_minmax_params, xnn_init_s8_minmax_scalar_params)
+XNN_UKERNEL_WITH_PARAMS(0, xnn_s8_vclamp_ukernel__scalar_u4, 4, false, int8_t, struct xnn_s8_minmax_params, xnn_init_qs8_clamp_scalar_params)
 
 #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS
 #undef XNN_DEFINED_UKERNEL_WITH_PARAMS
diff --git a/src/subgraph.c b/src/subgraph.c
index 6e8735ac47e9..2b394f42e06e 100644
--- a/src/subgraph.c
+++ b/src/subgraph.c
@@ -71,7 +71,10 @@ enum xnn_status xnn_insert_clamp_node(xnn_subgraph_t subgraph, float output_min,
   node->outputs[0] = new_id;
   node->activation.output_min = -INFINITY;
   node->activation.output_max = INFINITY;
-  return xnn_define_clamp(subgraph, output_min, output_max, new_id, output_id, /*flags=*/0);
+  union xnn_unary_params params;
+  params.clamp.min = output_min;
+  params.clamp.max = output_max;
+  return xnn_define_unary(subgraph, xnn_unary_clamp, &params, new_id, output_id, /*flags=*/0);
 }
 
 enum xnn_status xnn_insert_pack_lh_node(xnn_subgraph_t subgraph, const struct xnn_value* input, uint32_t input_id, uint32_t *new_id) {
@@ -1019,7 +1022,10 @@ bool xnn_subgraph_rewrite_for_fp16(xnn_subgraph_t subgraph)
       assert(value->datatype == xnn_datatype_fp32);
       if (xnn_value_is_static(value)) {
         const size_t num_elements = xnn_shape_multiply_all_dims(&value->shape);
-        xnn_run_convert_nc_f32_f16(1, 1, 1, num_elements, value->data, value->fp16_temp_data, 0, NULL);
+        xnn_run_unary_elementwise_nc(xnn_unary_convert, xnn_datatype_fp32, xnn_datatype_fp16,
+                                     /*params=*/NULL, /*input_quantization=*/NULL,
+                                     /*output_quantization=*/NULL, 0, num_elements, 1, 1, 1,
+                                     NULL, value->data, value->fp16_temp_data);
         // Remember pointer to the original fp32 data, nodes like convolution need fp32 weights/biases.
         value->fp32_data = value->data;
         value->data = value->fp16_temp_data;
@@ -1210,6 +1216,10 @@ enum xnn_status xnn_subgraph_fusion(
             math_max_f32(producer->activation.output_min, consumer->activation.output_min);
         producer->activation.output_max =
             math_min_f32(producer->activation.output_max, consumer->activation.output_max);
+        producer->params.unary.clamp.min =
+            math_max_f32(producer->params.unary.clamp.min, consumer->params.unary.clamp.min);
+        producer->params.unary.clamp.max =
+            math_min_f32(producer->params.unary.clamp.max, consumer->params.unary.clamp.max);
 
         xnn_node_clear(consumer);
         xnn_value_clear(value);
@@ -1437,6 +1447,93 @@ enum xnn_status xnn_delete_subgraph(
   return xnn_status_success;
 }
 
+
+enum xnn_unary_operator xnn_node_type_to_unary_operator(enum xnn_node_type node_type) {
+  switch (node_type) {
+    case xnn_node_type_abs:
+      return xnn_unary_abs;
+    case xnn_node_type_bankers_rounding:
+      return xnn_unary_bankers_rounding;
+    case xnn_node_type_ceiling:
+      return xnn_unary_ceiling;
+    case xnn_node_type_clamp:
+      return xnn_unary_clamp;
+    case xnn_node_type_convert:
+      return xnn_unary_convert;
+    case xnn_node_type_elu:
+      return xnn_unary_elu;
+    case xnn_node_type_exp:
+      return xnn_unary_exp;
+    case xnn_node_type_floor:
+      return xnn_unary_floor;
+    case xnn_node_type_gelu:
+      return xnn_unary_gelu;
+    case xnn_node_type_hardswish:
+      return xnn_unary_hardswish;
+    case xnn_node_type_leaky_relu:
+      return xnn_unary_leaky_relu;
+    case xnn_node_type_log:
+      return xnn_unary_log;
+    case xnn_node_type_negate:
+      return xnn_unary_negate;
+    case xnn_node_type_reciprocal_square_root:
+      return xnn_unary_reciprocal_square_root;
+    case xnn_node_type_sigmoid:
+      return xnn_unary_sigmoid;
+    case xnn_node_type_square:
+      return xnn_unary_square;
+    case xnn_node_type_square_root:
+      return xnn_unary_square_root;
+    case xnn_node_type_tanh:
+      return xnn_unary_tanh;
+    default:
+      return xnn_unary_invalid;
+  }
+}
+
+enum xnn_node_type xnn_unary_operator_to_node_type(enum xnn_unary_operator op) {
+  switch (op) {
+    case xnn_unary_abs:
+      return xnn_node_type_abs;
+    case xnn_unary_bankers_rounding:
+      return xnn_node_type_bankers_rounding;
+    case xnn_unary_ceiling:
+      return xnn_node_type_ceiling;
+    case xnn_unary_clamp:
+      return xnn_node_type_clamp;
+    case xnn_unary_convert:
+      return xnn_node_type_convert;
+    case xnn_unary_elu:
+      return xnn_node_type_elu;
+    case xnn_unary_exp:
+      return xnn_node_type_exp;
+    case xnn_unary_floor:
+      return xnn_node_type_floor;
+    case xnn_unary_gelu:
+      return xnn_node_type_gelu;
+    case xnn_unary_hardswish:
+      return xnn_node_type_hardswish;
+    case xnn_unary_leaky_relu:
+      return xnn_node_type_leaky_relu;
+    case xnn_unary_log:
+      return xnn_node_type_log;
+    case xnn_unary_negate:
+      return xnn_node_type_negate;
+    case xnn_unary_reciprocal_square_root:
+      return xnn_node_type_reciprocal_square_root;
+    case xnn_unary_sigmoid:
+      return xnn_node_type_sigmoid;
+    case xnn_unary_square:
+      return xnn_node_type_square;
+    case xnn_unary_square_root:
+      return xnn_node_type_square_root;
+    case xnn_unary_tanh:
+      return xnn_node_type_tanh;
+    default:
+      return xnn_node_type_invalid;
+  }
+}
+
 enum xnn_node_type xnn_binary_operator_to_node_type(enum xnn_binary_operator type) {
   switch (type) {
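The xnn_insert_clamp_node hunk above is the template for the whole subgraph-level change: operators that previously had dedicated xnn_define_* entry points are now described by an xnn_unary_operator enum plus a union xnn_unary_params passed to xnn_define_unary. A sketch, assuming an already-built subgraph and valid input_id/output_id values (the 0/6 bounds are illustrative):

    // Define a ReLU6-style clamp node on an existing subgraph.
    union xnn_unary_params params;
    params.clamp.min = 0.0f;
    params.clamp.max = 6.0f;
    enum xnn_status status = xnn_define_unary(
        subgraph, xnn_unary_clamp, &params, input_id, output_id, /*flags=*/0);

The fusion hunk above also makes the clamp-combination rule explicit: when two clamps fuse, the new lower bound is the max of the two mins and the new upper bound is the min of the two maxes, so [0, 6] fused with [-1, 1] yields [0, 1].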
diff --git a/src/subgraph/abs.c b/src/subgraph/abs.c
deleted file mode 100644
index 8041b57003c9..000000000000
--- a/src/subgraph/abs.c
+++ /dev/null
@@ -1,207 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include <inttypes.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "xnnpack.h"
-#include "xnnpack/common.h"
-#include "xnnpack/log.h"
-#include "xnnpack/node-type.h"
-#include "xnnpack/operator-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/reshape-helpers.h"
-#include "xnnpack/subgraph-validation.h"
-#include "xnnpack/subgraph.h"
-#include "pthreadpool.h"
-
-static enum xnn_status create_abs_operator(
-  const struct xnn_node* node,
-  const struct xnn_value* values,
-  size_t num_values,
-  struct xnn_operator_data* opdata,
-  struct xnn_code_cache* code_cache,
-  xnn_weights_cache_t weights_cache)
-{
-  assert(node->num_inputs == 1);
-  assert(node->num_outputs == 1);
-
-  enum xnn_status status;
-  const uint32_t input_id = opdata->inputs[0];
-  assert(input_id < num_values);
-  const struct xnn_value *input_value = &values[input_id];
-  switch (input_value->datatype) {
-    case xnn_datatype_fp32:
-      status = xnn_create_abs_nc_f32(
-        node->flags,
-        &opdata->operator_objects[0]);
-      break;
-    case xnn_datatype_fp16:
-      status = xnn_create_abs_nc_f16(
-        node->flags,
-        &opdata->operator_objects[0]);
-      break;
-    default:
-      XNN_UNREACHABLE;
-  }
-  return status;
-}
-
-static enum xnn_status reshape_abs_operator(
-  struct xnn_operator_data* opdata,
-  struct xnn_value* values,
-  size_t num_values,
-  pthreadpool_t threadpool)
-{
-  const uint32_t input_id = opdata->inputs[0];
-  assert(input_id < num_values);
-  const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape);
-  const size_t old_workspace_size = opdata->workspace_size;
-  enum xnn_status status = xnn_status_invalid_state;
-  const size_t num_input_dims = values[input_id].shape.num_dims;
-  const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1];
-  switch (opdata->operator_objects[0]->type) {
-    case xnn_operator_type_abs_nc_f32:
-      status = xnn_reshape_abs_nc_f32(
-        opdata->operator_objects[0],
-        batch_size,
-        channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
-        threadpool);
-      break;
-    case xnn_operator_type_abs_nc_f16:
-      status = xnn_reshape_abs_nc_f16(
-        opdata->operator_objects[0],
-        batch_size,
-        channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
-        threadpool);
-      break;
-    default:
-      XNN_UNREACHABLE;
-  }
-  if (status != xnn_status_success) {
-    return status;
-  }
-  return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool);
-}
-
-static enum xnn_status setup_abs_operator(
-  const struct xnn_operator_data* opdata,
-  const struct xnn_value* values,
-  size_t num_values,
-  pthreadpool_t threadpool)
-{
-  const uint32_t input_id = opdata->inputs[0];
-  assert(input_id != XNN_INVALID_VALUE_ID);
-  assert(input_id < num_values);
-
-  const uint32_t output_id = opdata->outputs[0];
-  assert(output_id != XNN_INVALID_VALUE_ID);
-  assert(output_id < num_values);
-
-  const struct xnn_value* input_value = values + input_id;
-  const void* input_data = input_value->data;
-  assert(input_data != NULL);
-
-  const struct xnn_value* output_value = values + output_id;
-  void* output_data = output_value->data;
-  assert(output_data != NULL);
-
-  switch (opdata->operator_objects[0]->type) {
-    case xnn_operator_type_abs_nc_f32:
-      return xnn_setup_abs_nc_f32(
-        opdata->operator_objects[0],
-        input_data,
-        output_data);
-    case xnn_operator_type_abs_nc_f16:
-      return xnn_setup_abs_nc_f16(
-        opdata->operator_objects[0],
-        input_data,
-        output_data);
-    default:
-      XNN_UNREACHABLE;
-  }
-}
-
-enum xnn_status xnn_define_abs(
-  xnn_subgraph_t subgraph,
-  uint32_t input_id,
-  uint32_t output_id,
-  uint32_t flags)
-{
-  enum xnn_status status;
-  if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_abs)) != xnn_status_success) {
-    return status;
-  }
-
-  if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_abs, input_id, subgraph->num_values)) != xnn_status_success) {
-    return status;
-  }
-
-  const struct xnn_value* input_value = &subgraph->values[input_id];
-  status = xnn_subgraph_check_input_type_dense(xnn_node_type_abs, input_id, input_value);
-  if (status != xnn_status_success) {
-    return status;
-  }
-
-  switch (input_value->datatype) {
-    case xnn_datatype_fp16:
-    case xnn_datatype_fp32:
-      break;
-    default:
-      xnn_log_error(
-        "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)",
-        xnn_node_type_to_string(xnn_node_type_abs), input_id,
-        xnn_datatype_to_string(input_value->datatype), input_value->datatype);
-      return xnn_status_invalid_parameter;
-  }
-
-  status = xnn_subgraph_check_output_node_id(xnn_node_type_abs, output_id, subgraph->num_values);
-  if (status != xnn_status_success) {
-    return status;
-  }
-
-  const struct xnn_value* output_value = &subgraph->values[output_id];
-  status = xnn_subgraph_check_output_type_dense(xnn_node_type_abs, output_id, output_value);
-  if (status != xnn_status_success) {
-    return status;
-  }
-
-  enum xnn_compute_type compute_type = xnn_compute_type_invalid;
-  switch (output_value->datatype) {
-    case xnn_datatype_fp16:
-      compute_type = xnn_compute_type_fp16;
-      break;
-    case xnn_datatype_fp32:
-      compute_type = xnn_compute_type_fp32;
-      break;
-    default:
-      xnn_log_error(
-        "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)",
-        xnn_node_type_to_string(xnn_node_type_abs), output_id,
-        xnn_datatype_to_string(output_value->datatype), output_value->datatype);
-      return xnn_status_invalid_parameter;
-  }
-
-  struct xnn_node* node = xnn_subgraph_new_node(subgraph);
-  if (node == NULL) {
-    return xnn_status_out_of_memory;
-  }
-
-  node->type = xnn_node_type_abs;
-  node->compute_type = compute_type;
-  node->num_inputs = 1;
-  node->inputs[0] = input_id;
-  node->num_outputs = 1;
-  node->outputs[0] = output_id;
-  node->flags = flags;
-
-  node->create = create_abs_operator;
-  node->reshape = reshape_abs_operator;
-  node->setup = setup_abs_operator;
-
-  return xnn_status_success;
-}
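src/subgraph/abs.c above is the first of the per-operator subgraph files this patch deletes; each one only varied the operator type threaded through otherwise identical create/reshape/setup/define boilerplate. For a parameterless operator like abs, the equivalent node under the new API would presumably be created as below (the NULL-params convention is an assumption; the patch only shows the parameterized clamp case explicitly):

    enum xnn_status status = xnn_define_unary(
        subgraph, xnn_unary_abs, /*params=*/NULL, input_id, output_id, /*flags=*/0);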
diff --git a/src/subgraph/bankers-rounding.c b/src/subgraph/bankers-rounding.c
deleted file mode 100644
index 01b174ba2e28..000000000000
--- a/src/subgraph/bankers-rounding.c
+++ /dev/null
@@ -1,210 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include <inttypes.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "xnnpack.h"
-#include "xnnpack/common.h"
-#include "xnnpack/log.h"
-#include "xnnpack/node-type.h"
-#include "xnnpack/operator-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/reshape-helpers.h"
-#include "xnnpack/subgraph-validation.h"
-#include "xnnpack/subgraph.h"
-#include "pthreadpool.h"
-
-static enum xnn_status create_bankers_rounding_operator(
-  const struct xnn_node* node,
-  const struct xnn_value* values,
-  size_t num_values,
-  struct xnn_operator_data* opdata,
-  struct xnn_code_cache* code_cache,
-  xnn_weights_cache_t weights_cache)
-{
-  assert(node->num_inputs == 1);
-  assert(node->num_outputs == 1);
-
-  const uint32_t input_id = opdata->inputs[0];
-  assert(input_id < num_values);
-  const struct xnn_value *input_value = &values[input_id];
-
-  enum xnn_status status;
-  switch (input_value->datatype) {
-    case xnn_datatype_fp32:
-      status = xnn_create_bankers_rounding_nc_f32(
-        node->flags,
-        &opdata->operator_objects[0]);
-      break;
-    case xnn_datatype_fp16:
-      status = xnn_create_bankers_rounding_nc_f16(
-        node->flags,
-        &opdata->operator_objects[0]);
-      break;
-    default:
-      XNN_UNREACHABLE;
-  }
-  return status;
-}
-
-static enum xnn_status reshape_bankers_rounding_operator(
-  struct xnn_operator_data* opdata,
-  struct xnn_value* values,
-  size_t num_values,
-  pthreadpool_t threadpool)
-{
-  const uint32_t input_id = opdata->inputs[0];
-  assert(input_id < num_values);
-  const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape);
-  const size_t num_input_dims = values[input_id].shape.num_dims;
-  const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1];
-  const size_t old_workspace_size = opdata->workspace_size;
-  enum xnn_status status = xnn_status_invalid_state;
-
-  switch (opdata->operator_objects[0]->type) {
-    case xnn_operator_type_bankers_rounding_nc_f32:
-      status = xnn_reshape_bankers_rounding_nc_f32(
-        opdata->operator_objects[0],
-        batch_size,
-        channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
-        threadpool);
-      break;
-    case xnn_operator_type_bankers_rounding_nc_f16:
-      status = xnn_reshape_bankers_rounding_nc_f16(
-        opdata->operator_objects[0],
-        batch_size,
-        channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */,
-        threadpool);
-      break;
-    default:
-      XNN_UNREACHABLE;
-  }
-  if (status != xnn_status_success) {
-    return status;
-  }
-  return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool);
-}
-
-static enum xnn_status setup_bankers_rounding_operator(
-  const struct xnn_operator_data* opdata,
-  const struct xnn_value* values,
-  size_t num_values,
-  pthreadpool_t threadpool)
-{
-  const uint32_t input_id = opdata->inputs[0];
-  assert(input_id != XNN_INVALID_VALUE_ID);
-  assert(input_id < num_values);
-
-  const uint32_t output_id = opdata->outputs[0];
-  assert(output_id != XNN_INVALID_VALUE_ID);
-  assert(output_id < num_values);
-
-  const struct xnn_value* input_value = values + input_id;
-  const void* input_data = input_value->data;
-  assert(input_data != NULL);
-
-  const struct xnn_value* output_value = values + output_id;
-  void* output_data = output_value->data;
-  assert(output_data != NULL);
-
-  switch (opdata->operator_objects[0]->type) {
-    case xnn_operator_type_bankers_rounding_nc_f32:
-      return xnn_setup_bankers_rounding_nc_f32(
-        opdata->operator_objects[0],
-        input_data,
-        output_data);
-    case xnn_operator_type_bankers_rounding_nc_f16:
-      return xnn_setup_bankers_rounding_nc_f16(
-        opdata->operator_objects[0],
-        input_data,
-        output_data);
-    default:
-      XNN_UNREACHABLE;
-  }
-}
-
-enum xnn_status xnn_define_bankers_rounding(
-  xnn_subgraph_t subgraph,
-  uint32_t input_id,
-  uint32_t output_id,
-  uint32_t flags)
-{
-  enum xnn_status status;
-  if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_bankers_rounding)) != xnn_status_success) {
-    return status;
-  }
-
-  if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_bankers_rounding, input_id, subgraph->num_values)) !=
-      xnn_status_success) {
-    return status;
-  }
-
-  const struct xnn_value* input_value = &subgraph->values[input_id];
-  status = xnn_subgraph_check_input_type_dense(xnn_node_type_bankers_rounding, input_id, input_value);
-  if (status != xnn_status_success) {
-    return status;
-  }
-
-  switch (input_value->datatype) {
-    case xnn_datatype_fp16:
-    case xnn_datatype_fp32:
-      break;
-    default:
-      xnn_log_error(
-        "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)",
-        xnn_node_type_to_string(xnn_node_type_bankers_rounding), input_id,
-        xnn_datatype_to_string(input_value->datatype), input_value->datatype);
-      return xnn_status_invalid_parameter;
-  }
-
-  status = xnn_subgraph_check_output_node_id(xnn_node_type_bankers_rounding, output_id, subgraph->num_values);
-  if (status != xnn_status_success) {
-    return status;
-  }
-
-  const struct xnn_value* output_value = &subgraph->values[output_id];
-  status = xnn_subgraph_check_output_type_dense(xnn_node_type_bankers_rounding, output_id, output_value);
-  if (status != xnn_status_success) {
-    return status;
-  }
-
-  enum xnn_compute_type compute_type = xnn_compute_type_invalid;
-  switch (output_value->datatype) {
-    case xnn_datatype_fp16:
-      compute_type = xnn_compute_type_fp16;
-      break;
-    case xnn_datatype_fp32:
-      compute_type = xnn_compute_type_fp32;
-      break;
-    default:
-      xnn_log_error(
-        "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)",
-        xnn_node_type_to_string(xnn_node_type_bankers_rounding), output_id,
-        xnn_datatype_to_string(output_value->datatype), output_value->datatype);
-      return xnn_status_invalid_parameter;
-  }
-
-  struct xnn_node* node = xnn_subgraph_new_node(subgraph);
-  if (node == NULL) {
-    return xnn_status_out_of_memory;
-  }
-
-  node->type = xnn_node_type_bankers_rounding;
-  node->compute_type = compute_type;
-  node->num_inputs = 1;
-  node->inputs[0] = input_id;
-  node->num_outputs = 1;
-  node->outputs[0] = output_id;
-  node->flags = flags;
-
-  node->create = create_bankers_rounding_operator;
-  node->reshape = reshape_bankers_rounding_operator;
-  node->setup = setup_bankers_rounding_operator;
-
-  return xnn_status_success;
-}
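The deleted reshape helpers all flatten the tensor the same way: the last dimension becomes the channel count and every other dimension multiplies into the batch. A worked example of that arithmetic (shape values chosen for illustration):

    // For an input of shape [2, 3, 8]:
    //   batch_size  = 2 * 3 = 6  (xnn_shape_multiply_non_channel_dims)
    //   channel_dim = 8          (last dimension; 1 for a rank-0 tensor)
    // channels, input stride, and output stride are all channel_dim in the
    // dense case, as in the xnn_reshape_*_nc_* calls above.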
1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_ceiling_nc_f32: - status = xnn_reshape_ceiling_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_ceiling_nc_f16: - status = xnn_reshape_ceiling_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_ceiling_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_ceiling_nc_f32: - return xnn_setup_ceiling_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_ceiling_nc_f16: - return xnn_setup_ceiling_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_ceiling( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_ceiling)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_ceiling, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_ceiling, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_ceiling), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_ceiling, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_ceiling, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - 
compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_ceiling), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_ceiling; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_ceiling_operator; - node->reshape = reshape_ceiling_operator; - node->setup = setup_ceiling_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/clamp.c b/src/subgraph/clamp.c deleted file mode 100644 index 63686166003c..000000000000 --- a/src/subgraph/clamp.c +++ /dev/null @@ -1,292 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/requantization.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_clamp_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - assert(node->num_outputs == 1); - const uint32_t output_id = node->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - enum xnn_status status; - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - switch (input_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_clamp_nc_f16( - node->activation.output_min, - node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_clamp_nc_f32( - node->activation.output_min, - node->activation.output_max, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_qint8: - { - const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const int8_t output_min = xnn_qs8_quantize(node->activation.output_min, output_scale, output_zero_point); - const int8_t output_max = xnn_qs8_quantize(node->activation.output_max, output_scale, output_zero_point); - status = xnn_create_clamp_nc_s8( - output_min, - output_max, - node->flags, - &opdata->operator_objects[0]); - break; - } - case xnn_datatype_quint8: - { - const float output_scale = values[output_id].quantization.scale; - const int32_t output_zero_point = values[output_id].quantization.zero_point; - const uint8_t output_min = xnn_qu8_quantize(node->activation.output_min, output_scale, output_zero_point); - const uint8_t output_max = xnn_qu8_quantize(node->activation.output_max, 
output_scale, output_zero_point); - status = xnn_create_clamp_nc_u8( - output_min, - output_max, - node->flags, - &opdata->operator_objects[0]); - break; - } - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_clamp_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_clamp_nc_f16: - status = xnn_reshape_clamp_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_clamp_nc_f32: - status = xnn_reshape_clamp_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_clamp_nc_s8: - status = xnn_reshape_clamp_nc_s8( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_clamp_nc_u8: - status = xnn_reshape_clamp_nc_u8( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_clamp_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_clamp_nc_f16: - return xnn_setup_clamp_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_clamp_nc_f32: - return xnn_setup_clamp_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_clamp_nc_s8: - return xnn_setup_clamp_nc_s8( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_clamp_nc_u8: - return xnn_setup_clamp_nc_u8( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_clamp( - xnn_subgraph_t subgraph, - float output_min, - float output_max, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status
status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_clamp)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_clamp, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_clamp, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_clamp), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_clamp, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_clamp, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - case xnn_datatype_qint8: - compute_type = xnn_compute_type_qs8; - break; - case xnn_datatype_quint8: - compute_type = xnn_compute_type_qu8; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_clamp), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - assert(compute_type != xnn_compute_type_invalid); - - status = xnn_subgraph_check_datatype_matches(xnn_node_type_clamp, input_id, input_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - status = xnn_subgraph_check_quantization_parameter_matches( - xnn_node_type_clamp, input_id, input_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_clamp; - node->compute_type = compute_type; - node->activation.output_min = output_min; - node->activation.output_max = output_max; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_clamp_operator; - node->reshape = reshape_clamp_operator; - node->setup = setup_clamp_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/deprecated.c b/src/subgraph/deprecated.c index a86e5a682062..601793b4655b 100644 --- a/src/subgraph/deprecated.c +++ b/src/subgraph/deprecated.c @@ -212,3 +212,126 @@ enum xnn_status xnn_define_global_sum_pooling_2d( return xnn_status_success; } + +enum xnn_status xnn_define_convert(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_convert, NULL, input_id, output_id, + flags); +} + +enum 
xnn_status xnn_define_abs(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_abs, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_bankers_rounding(xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_bankers_rounding, NULL, input_id, + output_id, flags); +} + +enum xnn_status xnn_define_ceiling(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_ceiling, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_clamp(xnn_subgraph_t subgraph, float output_min, + float output_max, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + union xnn_unary_params params; + params.clamp.min = output_min; + params.clamp.max = output_max; + return xnn_define_unary(subgraph, xnn_unary_clamp, ¶ms, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_elu(xnn_subgraph_t subgraph, float alpha, + uint32_t input_id, uint32_t output_id, + uint32_t flags) { + union xnn_unary_params params; + params.elu.alpha = alpha; + return xnn_define_unary(subgraph, xnn_unary_elu, ¶ms, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_exp(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_exp, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_floor(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_floor, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_gelu(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_gelu, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_hardswish(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_hardswish, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_leaky_relu(xnn_subgraph_t subgraph, + float negative_slope, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + union xnn_unary_params params; + params.leaky_relu.negative_slope = negative_slope; + return xnn_define_unary(subgraph, xnn_unary_leaky_relu, ¶ms, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_log(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_log, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_negate(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_negate, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_sigmoid(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_sigmoid, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_square(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_square, NULL, input_id, output_id, + flags); +} + +enum xnn_status xnn_define_square_root(xnn_subgraph_t subgraph, + uint32_t input_id, uint32_t output_id, + uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_square_root, NULL, input_id, output_id, + 
flags); +} + +enum xnn_status xnn_define_reciprocal_square_root(xnn_subgraph_t subgraph, + uint32_t input_id, + uint32_t output_id, + uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_reciprocal_square_root, NULL, input_id, + output_id, flags); +} + +enum xnn_status xnn_define_tanh(xnn_subgraph_t subgraph, uint32_t input_id, + uint32_t output_id, uint32_t flags) { + return xnn_define_unary(subgraph, xnn_unary_tanh, NULL, input_id, output_id, + flags); +} diff --git a/src/subgraph/elu.c b/src/subgraph/elu.c deleted file mode 100644 index 3e408b797ade..000000000000 --- a/src/subgraph/elu.c +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_elu_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_outputs == 1); - const uint32_t output_id = node->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - enum xnn_status status; - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - switch (input_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_elu_nc_f16( - node->params.elu.alpha, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_elu_nc_f32( - node->params.elu.alpha, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_qint8: - status = xnn_create_elu_nc_qs8( - node->params.elu.alpha, - (int8_t) values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - (int8_t) values[output_id].quantization.zero_point, - values[output_id].quantization.scale, - INT8_MIN, INT8_MAX, - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_elu_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 
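Every legacy entry point above now funnels into the single parameterized API. A minimal sketch of defining the same clamp node by calling xnn_define_unary directly, assuming a subgraph and tensor IDs created in the usual way (this belongs inside a function; error handling is abbreviated, and the param field names are the ones the wrappers above fill in):

  #include "xnnpack.h"

  // Equivalent of the deprecated xnn_define_clamp(subgraph, 0.0f, 6.0f, ...):
  // fill the unary params union, then call the parameterized entry point.
  union xnn_unary_params params;
  params.clamp.min = 0.0f;
  params.clamp.max = 6.0f;
  enum xnn_status status = xnn_define_unary(
      subgraph, xnn_unary_clamp, &params, input_id, output_id, /*flags=*/0);
  if (status != xnn_status_success) {
    // handle the error
  }

Operators without parameters (abs, ceiling, tanh, and so on) pass NULL for the params pointer, exactly as the wrappers above do.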
1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_elu_nc_f16: - status = xnn_reshape_elu_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_elu_nc_f32: - status = xnn_reshape_elu_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_elu_nc_qs8: - status = xnn_reshape_elu_nc_qs8( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_elu_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_elu_nc_f16: - return xnn_setup_elu_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_elu_nc_f32: - return xnn_setup_elu_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_elu_nc_qs8: - return xnn_setup_elu_nc_qs8( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_elu( - xnn_subgraph_t subgraph, - float alpha, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_elu)) != xnn_status_success) { - return status; - } - - if (alpha <= 0.0f || !isnormal(alpha)) { - xnn_log_error( - "failed to define %s operator with %.7g alpha parameter: alpha must be finite, normalized, and positive", - xnn_node_type_to_string(xnn_node_type_elu), alpha); - return xnn_status_invalid_parameter; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_elu, input_id, subgraph->num_values)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_elu, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_elu), input_id, - 
xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_elu, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_elu, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - case xnn_datatype_qint8: - compute_type = xnn_compute_type_qs8; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_elu), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_datatype_matches(xnn_node_type_elu, input_id, input_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_elu; - node->compute_type = compute_type; - node->params.elu.alpha = alpha; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_elu_operator; - node->reshape = reshape_elu_operator; - node->setup = setup_elu_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/exp.c b/src/subgraph/exp.c deleted file mode 100644 index 7fc3916ec599..000000000000 --- a/src/subgraph/exp.c +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
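The alpha validation in the deleted xnn_define_elu above is stricter than a plain finiteness test: isnormal also rejects zero and subnormal values, not just infinities and NaN. The same predicate, isolated as a hypothetical helper:

  #include <math.h>
  #include <stdbool.h>

  // Mirrors the deleted check `alpha <= 0.0f || !isnormal(alpha)`: a valid
  // ELU alpha is positive, finite, and normal (no zero, subnormals, inf, NaN).
  static bool is_valid_elu_alpha(float alpha) {
    return alpha > 0.0f && isnormal(alpha);
  }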
- -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_exp_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - - assert(node->num_outputs == 1); - - const uint32_t input_id = node->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - enum xnn_status status; - switch (input_value->datatype) { - case xnn_datatype_fp32: - status = xnn_create_exp_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_exp_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_exp_nc_f32: - status = xnn_reshape_exp_nc_f32( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_exp_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_exp_nc_f32: - return xnn_setup_exp_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_exp( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_exp)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_exp, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - 
status = xnn_subgraph_check_input_type_dense(xnn_node_type_exp, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_exp), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_exp, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_exp, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_exp), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - assert(compute_type != xnn_compute_type_invalid); - - status = xnn_subgraph_check_datatype_matches(xnn_node_type_exp, input_id, input_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_exp; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_exp_operator; - node->reshape = reshape_exp_operator; - node->setup = setup_exp_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/floor.c b/src/subgraph/floor.c deleted file mode 100644 index 8e822cc817b6..000000000000 --- a/src/subgraph/floor.c +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
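Each deleted reshape callback repeats the same shape bookkeeping: the innermost dimension is treated as the channel count (1 for a rank-0 tensor) and all remaining dimensions fold into the batch, which is what xnn_shape_multiply_non_channel_dims computes. A self-contained sketch, with a hypothetical struct standing in for the internal xnn_shape:

  #include <stddef.h>

  // Hypothetical stand-in for the internal xnn_shape struct.
  struct shape { size_t num_dims; size_t dim[6]; };

  // channels = last dimension (1 for scalars); batch = product of the rest.
  static void split_batch_and_channels(
      const struct shape* s, size_t* batch_size, size_t* channels) {
    *channels = s->num_dims == 0 ? 1 : s->dim[s->num_dims - 1];
    *batch_size = 1;
    for (size_t i = 0; i + 1 < s->num_dims; i++) {
      *batch_size *= s->dim[i];
    }
  }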
- -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_floor_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - assert(node->num_outputs == 1); - - const uint32_t input_id = node->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - enum xnn_status status; - switch (input_value->datatype) { - case xnn_datatype_fp32: - status = xnn_create_floor_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp16: - status = xnn_create_floor_nc_f16( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_floor_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_floor_nc_f32: - status = xnn_reshape_floor_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_floor_nc_f16: - status = xnn_reshape_floor_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_floor_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_floor_nc_f32: - return xnn_setup_floor_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_floor_nc_f16: - return xnn_setup_floor_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - default: - 
XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_floor( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_floor)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_floor, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_floor, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_floor), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_floor, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_floor, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_floor), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_floor; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_floor_operator; - node->reshape = reshape_floor_operator; - node->setup = setup_floor_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/gelu.c b/src/subgraph/gelu.c deleted file mode 100644 index a338ad251c33..000000000000 --- a/src/subgraph/gelu.c +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
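In the deleted create_clamp_operator shown earlier, the float clamp limits are re-expressed in the output tensor's quantized domain before the operator is created. A numeric sketch with a hypothetical helper in place of xnn_qs8_quantize (the library's exact rounding and saturation behavior may differ):

  #include <math.h>
  #include <stdint.h>

  // Hypothetical equivalent of xnn_qs8_quantize: map a real value onto the
  // nearest representable int8 for the given scale and zero point.
  static int8_t quantize_s8(float value, float scale, int32_t zero_point) {
    const float q = nearbyintf(value / scale) + (float) zero_point;
    return (int8_t) fmaxf(fminf(q, 127.0f), -128.0f);
  }

  // Example: a [0.0, 6.0] clamp on an output with scale 0.05 and zero point
  // -10 becomes the integer range [-10, 110].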
- -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_gelu_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - assert(node->num_outputs == 1); - - enum xnn_status status; - switch (node->compute_type) { - case xnn_compute_type_fp32: - status = xnn_create_gelu_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_gelu_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_gelu_nc_f32: - status = xnn_reshape_gelu_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_gelu_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_gelu_nc_f32: - return xnn_setup_gelu_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_gelu( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_gelu)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_gelu, input_id, subgraph->num_values)) != xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_gelu, input_id, input_value); - if (status != 
xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_gelu), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_gelu, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_gelu, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_gelu), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_datatype_matches(xnn_node_type_gelu, input_id, input_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_gelu; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_gelu_operator; - node->reshape = reshape_gelu_operator; - node->setup = setup_gelu_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/hardswish.c b/src/subgraph/hardswish.c deleted file mode 100644 index a712b87d870e..000000000000 --- a/src/subgraph/hardswish.c +++ /dev/null @@ -1,210 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
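Every deleted define function ends with an identical switch that translates the output datatype into a compute type, reporting unsupported types as xnn_status_invalid_parameter. Condensed into one hypothetical helper (enum names as they appear in this diff; the internal headers declaring them are assumed):

  // Condensed form of the datatype-to-compute-type switch repeated above;
  // callers treat xnn_compute_type_invalid as an unsupported datatype.
  static enum xnn_compute_type compute_type_for(enum xnn_datatype datatype) {
    switch (datatype) {
      case xnn_datatype_fp16:   return xnn_compute_type_fp16;
      case xnn_datatype_fp32:   return xnn_compute_type_fp32;
      case xnn_datatype_qint8:  return xnn_compute_type_qs8;
      case xnn_datatype_quint8: return xnn_compute_type_qu8;
      default:                  return xnn_compute_type_invalid;
    }
  }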
- -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_hardswish_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - assert(node->num_outputs == 1); - - const uint32_t input_id = node->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - enum xnn_status status; - switch (input_value->datatype) { - case xnn_datatype_fp32: - status = xnn_create_hardswish_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp16: - status = xnn_create_hardswish_nc_f16( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_hardswish_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_hardswish_nc_f32: - status = xnn_reshape_hardswish_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_hardswish_nc_f16: - status = xnn_reshape_hardswish_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_hardswish_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_hardswish_nc_f32: - return xnn_setup_hardswish_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_hardswish_nc_f16: - return xnn_setup_hardswish_nc_f16( - opdata->operator_objects[0], 
- input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_hardswish( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_hardswish)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_hardswish, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_hardswish, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - break; - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_hardswish), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_hardswish, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_hardswish, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_hardswish), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_hardswish; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_hardswish_operator; - node->reshape = reshape_hardswish_operator; - node->setup = setup_hardswish_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/leaky-relu.c b/src/subgraph/leaky-relu.c deleted file mode 100644 index dd76a587b751..000000000000 --- a/src/subgraph/leaky-relu.c +++ /dev/null @@ -1,305 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
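For the quantized paths, xnn_define_leaky_relu below rejects quantization parameter combinations whose derived scale ratios fall outside fixed ranges. A numeric sketch of those three checks, using the constants from the code that follows:

  #include <math.h>
  #include <stdbool.h>

  // The three constraints checked below for qs8/qu8 leaky relu:
  //   positive ratio = input_scale / output_scale       in [2**-8, 2**7]
  //   negative ratio = positive ratio * negative_slope  in (-2**7, 2**7]
  //   |negative ratio| must be at least 2**-8
  static bool leaky_relu_scales_ok(
      float input_scale, float output_scale, float negative_slope) {
    const float pos = input_scale / output_scale;
    if (pos < 0x1.0p-8f || pos > 0x1.0p+7f) return false;
    const float neg = pos * negative_slope;
    if (neg < -0x1.FFFC00p+6f || neg > 0x1.0p+7f) return false;
    return fabsf(neg) >= 0x1.0p-8f;
  }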
- -#include -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_leaky_relu_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - const uint32_t input_id = node->inputs[0]; - assert(input_id < num_values); - - assert(node->num_outputs == 1); - const uint32_t output_id = node->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value *input_value = &values[input_id]; - enum xnn_status status; - switch (input_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_leaky_relu_nc_f16( - node->params.leaky_relu.negative_slope, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_leaky_relu_nc_f32( - node->params.leaky_relu.negative_slope, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_qint8: - status = xnn_create_leaky_relu_nc_qs8( - node->params.leaky_relu.negative_slope, - (int8_t) values[input_id].quantization.zero_point, values[input_id].quantization.scale, - (int8_t) values[output_id].quantization.zero_point, values[output_id].quantization.scale, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_quint8: - status = xnn_create_leaky_relu_nc_qu8( - node->params.leaky_relu.negative_slope, - (uint8_t) values[input_id].quantization.zero_point, values[input_id].quantization.scale, - (uint8_t) values[output_id].quantization.zero_point, values[output_id].quantization.scale, - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_leaky_relu_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 
1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_leaky_relu_nc_f16: - status = xnn_reshape_leaky_relu_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_leaky_relu_nc_f32: - status = xnn_reshape_leaky_relu_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_leaky_relu_nc_qs8: - status = xnn_reshape_leaky_relu_nc_qs8( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_leaky_relu_nc_qu8: - status = xnn_reshape_leaky_relu_nc_qu8( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_leaky_relu_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_leaky_relu_nc_f16: - return xnn_setup_leaky_relu_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_leaky_relu_nc_f32: - return xnn_setup_leaky_relu_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_leaky_relu_nc_qs8: - return xnn_setup_leaky_relu_nc_qs8( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_leaky_relu_nc_qu8: - return xnn_setup_leaky_relu_nc_qu8( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_leaky_relu( - xnn_subgraph_t subgraph, - float negative_slope, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_leaky_relu)) != xnn_status_success) { - return status; - } - - if (!isfinite(negative_slope)) { - xnn_log_error( - "failed to create %s operator with %f negative slope: finite number expected", - xnn_node_type_to_string(xnn_node_type_leaky_relu), - negative_slope); - return xnn_status_invalid_parameter; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_leaky_relu, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = 
&subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_leaky_relu, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_leaky_relu), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_leaky_relu, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_leaky_relu, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - case xnn_datatype_qint8: - compute_type = xnn_compute_type_qs8; - break; - case xnn_datatype_quint8: - compute_type = xnn_compute_type_qu8; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_leaky_relu), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - assert(compute_type != xnn_compute_type_invalid); - - status = xnn_subgraph_check_datatype_matches(xnn_node_type_leaky_relu, input_id, input_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - if (compute_type == xnn_compute_type_qs8 || compute_type == xnn_compute_type_qu8) { - const float positive_input_output_scale = input_value->quantization.scale / output_value->quantization.scale; - if (positive_input_output_scale < 0x1.0p-8f || positive_input_output_scale > 0x1.0p+7f) { - xnn_log_error( - "failed to define %s operator with %.7g positive-input-to-output scale ratio: scale ratio must be in [2**-8, 2**7] range", - xnn_node_type_to_string(xnn_node_type_leaky_relu), positive_input_output_scale); - return xnn_status_invalid_parameter; - } - - const float negative_input_output_scale = positive_input_output_scale * negative_slope; - if (negative_input_output_scale < -0x1.FFFC00p+6f || negative_input_output_scale > 0x1.0p+7f) { - xnn_log_error( - "failed to define %s operator with %.7g negative-input-to-output scale ratio: scale ratio must be in (-2**7, 2**7] range", - xnn_node_type_to_string(xnn_node_type_leaky_relu), negative_input_output_scale); - return xnn_status_invalid_parameter; - } - - if (fabsf(negative_input_output_scale) < 0x1.0p-8f) { - xnn_log_error( - "failed to define %s operator with %.7g negative-input-to-output scale ratio: scale ratio must be at least 2**-8 in absolute value", - xnn_node_type_to_string(xnn_node_type_leaky_relu), negative_input_output_scale); - return xnn_status_invalid_parameter; - } - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_leaky_relu; - node->compute_type = compute_type; -
node->params.leaky_relu.negative_slope = negative_slope; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_leaky_relu_operator; - node->reshape = reshape_leaky_relu_operator; - node->setup = setup_leaky_relu_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/log.c b/src/subgraph/log.c deleted file mode 100644 index d64bf803df99..000000000000 --- a/src/subgraph/log.c +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> -#include <inttypes.h> -#include <stddef.h> -#include <stdint.h> - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_log_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - - assert(node->num_outputs == 1); - - const uint32_t input_id = node->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - enum xnn_status status; - switch (input_value->datatype) { - case xnn_datatype_fp32: - status = xnn_create_log_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_log_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ?
1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_log_nc_f32: - status = xnn_reshape_log_nc_f32( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_log_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_log_nc_f32: - return xnn_setup_log_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_log( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_log)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_log, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_log, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_log), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_log, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_log, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_log), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - assert(compute_type != xnn_compute_type_invalid); - - status = 
xnn_subgraph_check_datatype_matches(xnn_node_type_log, input_id, input_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_log; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_log_operator; - node->reshape = reshape_log_operator; - node->setup = setup_log_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/negate.c b/src/subgraph/negate.c deleted file mode 100644 index f6a1994a2c09..000000000000 --- a/src/subgraph/negate.c +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> -#include <inttypes.h> -#include <stddef.h> -#include <stdint.h> - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_negate_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - assert(node->num_outputs == 1); - - enum xnn_status status; - const uint32_t input_id = node->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - switch (input_value->datatype) { - case xnn_datatype_fp32: - status = xnn_create_negate_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp16: - status = xnn_create_negate_nc_f16( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_negate_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ?
1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_negate_nc_f32: - status = xnn_reshape_negate_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_negate_nc_f16: - status = xnn_reshape_negate_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_negate_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_negate_nc_f32: - return xnn_setup_negate_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_negate_nc_f16: - return xnn_setup_negate_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_negate( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_negate)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_negate, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_negate, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_negate), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_negate, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_negate, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = 
xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_negate), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_negate; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_negate_operator; - node->reshape = reshape_negate_operator; - node->setup = setup_negate_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/reciprocal-square-root.c b/src/subgraph/reciprocal-square-root.c deleted file mode 100644 index df8e919b32b2..000000000000 --- a/src/subgraph/reciprocal-square-root.c +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> -#include <inttypes.h> -#include <stddef.h> -#include <stdint.h> - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_reciprocal_square_root_operator( - const struct xnn_node* node, const struct xnn_value* values, - size_t num_values, struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, xnn_weights_cache_t weights_cache) { - assert(node->num_inputs == 1); - assert(node->num_outputs == 1); - - enum xnn_status status; - const uint32_t input_id = node->inputs[0]; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - switch (input_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_reciprocal_square_root_nc_f16( - node->flags, &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_reciprocal_square_root_nc_f32( - node->flags, &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_reciprocal_square_root_operator( - struct xnn_operator_data* opdata, struct xnn_value* values, - size_t num_values, pthreadpool_t threadpool) { - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = - xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = - num_input_dims == 0 ?
1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_reciprocal_square_root_nc_f16: - status = xnn_reshape_reciprocal_square_root_nc_f16( - opdata->operator_objects[0], batch_size, /*channels=*/channel_dim, - /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - case xnn_operator_type_reciprocal_square_root_nc_f32: - status = xnn_reshape_reciprocal_square_root_nc_f32( - opdata->operator_objects[0], batch_size, /*channels=*/channel_dim, - /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, - old_workspace_size, threadpool); -} - -static enum xnn_status setup_reciprocal_square_root_operator( - const struct xnn_operator_data* opdata, const struct xnn_value* values, - size_t num_values, pthreadpool_t threadpool) { - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_reciprocal_square_root_nc_f16: - return xnn_setup_reciprocal_square_root_nc_f16( - opdata->operator_objects[0], input_data, output_data); - case xnn_operator_type_reciprocal_square_root_nc_f32: - return xnn_setup_reciprocal_square_root_nc_f32( - opdata->operator_objects[0], input_data, output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_reciprocal_square_root(xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) { - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized( - xnn_node_type_reciprocal_square_root)) != xnn_status_success) { - return status; - } - - if (input_id >= subgraph->num_values) { - xnn_log_error("failed to define %s operator with input ID #%" PRIu32 - ": invalid Value ID", - xnn_node_type_to_string(xnn_node_type_reciprocal_square_root), - input_id); - return xnn_status_invalid_parameter; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense( - xnn_node_type_reciprocal_square_root, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 - ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_reciprocal_square_root), - input_id, xnn_datatype_to_string(input_value->datatype), - input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id( - xnn_node_type_reciprocal_square_root, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = 
&subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense( - xnn_node_type_reciprocal_square_root, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 - ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_reciprocal_square_root), - output_id, xnn_datatype_to_string(output_value->datatype), - output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_reciprocal_square_root; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_reciprocal_square_root_operator; - node->reshape = reshape_reciprocal_square_root_operator; - node->setup = setup_reciprocal_square_root_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/sigmoid.c b/src/subgraph/sigmoid.c deleted file mode 100644 index a7cdd3a80f22..000000000000 --- a/src/subgraph/sigmoid.c +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> -#include <inttypes.h> -#include <stddef.h> -#include <stdint.h> - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_sigmoid_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - const uint32_t input_id = node->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - assert(node->num_outputs == 1); - const uint32_t output_id = node->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - enum xnn_status status; - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - switch (input_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_sigmoid_nc_f16( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_sigmoid_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_qint8: - { - status = xnn_create_sigmoid_nc_qs8( - (int8_t) values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - (int8_t) values[output_id].quantization.zero_point, - values[output_id].quantization.scale, - INT8_MIN, INT8_MAX, - node->flags, - &opdata->operator_objects[0]); - break; - } - case xnn_datatype_quint8: - { - status = xnn_create_sigmoid_nc_qu8( - (uint8_t) values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - (uint8_t)
values[output_id].quantization.zero_point, - values[output_id].quantization.scale, - 0, UINT8_MAX, - node->flags, - &opdata->operator_objects[0]); - break; - } - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_sigmoid_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_sigmoid_nc_f16: - status = xnn_reshape_sigmoid_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_sigmoid_nc_f32: - status = xnn_reshape_sigmoid_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_sigmoid_nc_qs8: - status = xnn_reshape_sigmoid_nc_qs8( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_sigmoid_nc_qu8: - status = xnn_reshape_sigmoid_nc_qu8( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_sigmoid_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_sigmoid_nc_f16: - return xnn_setup_sigmoid_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_sigmoid_nc_f32: - return xnn_setup_sigmoid_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_sigmoid_nc_qs8: - return xnn_setup_sigmoid_nc_qs8( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_sigmoid_nc_qu8: - return xnn_setup_sigmoid_nc_qu8( - opdata->operator_objects[0], - input_data, - output_data); - break; - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_sigmoid( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum 
xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_sigmoid)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_sigmoid, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_sigmoid, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_sigmoid), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_sigmoid, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_sigmoid, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - case xnn_datatype_qint8: - compute_type = xnn_compute_type_qs8; - break; - case xnn_datatype_quint8: - compute_type = xnn_compute_type_qu8; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_sigmoid), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_datatype_matches(xnn_node_type_sigmoid, input_id, input_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_sigmoid; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_sigmoid_operator; - node->reshape = reshape_sigmoid_operator; - node->setup = setup_sigmoid_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/square-root.c b/src/subgraph/square-root.c deleted file mode 100644 index c48628a335ac..000000000000 --- a/src/subgraph/square-root.c +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree.
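As a point of reference for the quantized create calls deleted just above, here is a minimal sketch of how a QS8 sigmoid could be created through the parameterized API this patch introduces. The xnn_unary_sigmoid enumerator is an assumption made by analogy with the xnn_unary_convert value used later in this diff; the struct xnn_quantization_params initializers mirror the fields (.scale, .zero_point) populated in the new create_unary_operator below, and the numeric values are illustrative only:

  // Assumed sketch: QS8 sigmoid via the unified unary entry point,
  // replacing xnn_create_sigmoid_nc_qs8 with explicit quantization params.
  xnn_operator_t sigmoid_op = NULL;
  const struct xnn_quantization_params input_quantization = {
      .scale = 0.0235f, .zero_point = -3};  // example input quantization
  const struct xnn_quantization_params output_quantization = {
      .scale = 1.0f / 256.0f, .zero_point = -128};  // example output quantization
  enum xnn_status status = xnn_create_unary_elementwise_nc(
      xnn_unary_sigmoid,  // assumed enumerator, by analogy with xnn_unary_convert
      xnn_datatype_qint8, xnn_datatype_qint8,
      /*params=*/NULL,    // sigmoid carries no extra operator parameters
      &input_quantization, &output_quantization,
      /*flags=*/0, &sigmoid_op);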
- -#include <assert.h> -#include <inttypes.h> -#include <stddef.h> -#include <stdint.h> - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_square_root_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - assert(node->num_outputs == 1); - - enum xnn_status status; - const uint32_t input_id = node->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - const struct xnn_value *input_value = &values[input_id]; - switch (input_value->datatype) { - case xnn_datatype_fp32: - status = xnn_create_square_root_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp16: - status = xnn_create_square_root_nc_f16( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_square_root_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_square_root_nc_f32: - status = xnn_reshape_square_root_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_square_root_nc_f16: - status = xnn_reshape_square_root_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_square_root_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_square_root_nc_f32: - return xnn_setup_square_root_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_square_root_nc_f16:
- return xnn_setup_square_root_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_square_root( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_square_root)) != xnn_status_success) { - return status; - } - - if (input_id >= subgraph->num_values) { - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": invalid Value ID", - xnn_node_type_to_string(xnn_node_type_square_root), input_id); - return xnn_status_invalid_parameter; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_square_root, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_square_root), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_square_root, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_square_root, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_square_root), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_square_root; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_square_root_operator; - node->reshape = reshape_square_root_operator; - node->setup = setup_square_root_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/square.c b/src/subgraph/square.c deleted file mode 100644 index 6592917a69f6..000000000000 --- a/src/subgraph/square.c +++ /dev/null @@ -1,206 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
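Alongside the operator-level consolidation, the per-operator xnn_define_* wrappers deleted in this patch collapse into a single subgraph entry point (see src/subgraph/unary.c below). As a hedged sketch only: the xnn_define_unary name, the xnn_unary_square_root enumerator, and the shape of its params argument are assumptions inferred from the parameterized xnn_create_unary_elementwise_nc call shown later in this diff, not signatures quoted from it:

  // Assumed sketch: defining a square-root node through one generic
  // entry point keyed on a unary-operator enum.
  enum xnn_status define_sqrt_node(xnn_subgraph_t subgraph, uint32_t input_id,
                                   uint32_t output_id, uint32_t flags) {
    // NULL params: square root, like most unary ops, needs no extra state.
    return xnn_define_unary(subgraph, xnn_unary_square_root, /*params=*/NULL,
                            input_id, output_id, flags);
  }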
- -#include <assert.h> -#include <inttypes.h> -#include <stddef.h> -#include <stdint.h> - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_square_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - assert(node->num_outputs == 1); - - enum xnn_status status; - switch (node->compute_type) { - case xnn_compute_type_fp32: - status = xnn_create_square_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_compute_type_fp16: - status = xnn_create_square_nc_f16( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_square_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ? 1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_square_nc_f32: - status = xnn_reshape_square_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_square_nc_f16: - status = xnn_reshape_square_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_square_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_square_nc_f32: - return xnn_setup_square_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_square_nc_f16: - return xnn_setup_square_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_square( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t
output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_square)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_square, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_square, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - break; - default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_square), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_square, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_square, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_square), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_square; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_square_operator; - node->reshape = reshape_square_operator; - node->setup = setup_square_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/tanh.c b/src/subgraph/tanh.c deleted file mode 100644 index d67082b4352b..000000000000 --- a/src/subgraph/tanh.c +++ /dev/null @@ -1,276 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> -#include <inttypes.h> -#include <stddef.h> -#include <stdint.h> - -#include "xnnpack.h" -#include "xnnpack/common.h" -#include "xnnpack/log.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/reshape-helpers.h" -#include "xnnpack/subgraph-validation.h" -#include "xnnpack/subgraph.h" -#include "pthreadpool.h" - -static enum xnn_status create_tanh_operator( - const struct xnn_node* node, - const struct xnn_value* values, - size_t num_values, - struct xnn_operator_data* opdata, - struct xnn_code_cache* code_cache, - xnn_weights_cache_t weights_cache) -{ - assert(node->num_inputs == 1); - const uint32_t input_id = node->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - assert(node->num_outputs == 1); - const uint32_t output_id = node->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - enum xnn_status status; - const struct xnn_value *input_value = &values[input_id]; - switch (input_value->datatype) { - case xnn_datatype_fp16: - status = xnn_create_tanh_nc_f16( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_tanh_nc_f32( - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_qint8: - { - status = xnn_create_tanh_nc_qs8( - (int8_t) values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - (int8_t) values[output_id].quantization.zero_point, - values[output_id].quantization.scale, - INT8_MIN, INT8_MAX, - node->flags, - &opdata->operator_objects[0]); - break; - } - case xnn_datatype_quint8: - { - status = xnn_create_tanh_nc_qu8( - (uint8_t) values[input_id].quantization.zero_point, - values[input_id].quantization.scale, - (uint8_t) values[output_id].quantization.zero_point, - values[output_id].quantization.scale, - 0, UINT8_MAX, - node->flags, - &opdata->operator_objects[0]); - break; - } - default: - XNN_UNREACHABLE; - } - return status; -} - -static enum xnn_status reshape_tanh_operator( - struct xnn_operator_data* opdata, - struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id < num_values); - const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); - const size_t num_input_dims = values[input_id].shape.num_dims; - const size_t channel_dim = num_input_dims == 0 ?
1 : values[input_id].shape.dim[num_input_dims - 1]; - const size_t old_workspace_size = opdata->workspace_size; - enum xnn_status status = xnn_status_invalid_state; - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_tanh_nc_f16: - status = xnn_reshape_tanh_nc_f16( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_tanh_nc_f32: - status = xnn_reshape_tanh_nc_f32( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_tanh_nc_qs8: - status = xnn_reshape_tanh_nc_qs8( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - case xnn_operator_type_tanh_nc_qu8: - status = xnn_reshape_tanh_nc_qu8( - opdata->operator_objects[0], - batch_size, - channel_dim /* channels */, channel_dim /* input stride */, channel_dim /* output stride */, - threadpool); - break; - default: - XNN_UNREACHABLE; - } - if (status != xnn_status_success) { - return status; - } - return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); -} - -static enum xnn_status setup_tanh_operator( - const struct xnn_operator_data* opdata, - const struct xnn_value* values, - size_t num_values, - pthreadpool_t threadpool) -{ - const uint32_t input_id = opdata->inputs[0]; - assert(input_id != XNN_INVALID_VALUE_ID); - assert(input_id < num_values); - - const uint32_t output_id = opdata->outputs[0]; - assert(output_id != XNN_INVALID_VALUE_ID); - assert(output_id < num_values); - - const struct xnn_value* input_value = values + input_id; - const void* input_data = input_value->data; - assert(input_data != NULL); - - const struct xnn_value* output_value = values + output_id; - void* output_data = output_value->data; - assert(output_data != NULL); - - switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_tanh_nc_f16: - return xnn_setup_tanh_nc_f16( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_tanh_nc_f32: - return xnn_setup_tanh_nc_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_tanh_nc_qs8: - return xnn_setup_tanh_nc_qs8( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_tanh_nc_qu8: - return xnn_setup_tanh_nc_qu8( - opdata->operator_objects[0], - input_data, - output_data); - break; - default: - XNN_UNREACHABLE; - } -} - -enum xnn_status xnn_define_tanh( - xnn_subgraph_t subgraph, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_tanh)) != xnn_status_success) { - return status; - } - - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_tanh, input_id, subgraph->num_values)) != - xnn_status_success) { - return status; - } - - const struct xnn_value* input_value = &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_tanh, input_id, input_value); - if (status != xnn_status_success) { - return status; - } - - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; - default: - xnn_log_error( - "failed to define %s operator 
with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_tanh), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_output_node_id(xnn_node_type_tanh, output_id, subgraph->num_values); - if (status != xnn_status_success) { - return status; - } - - const struct xnn_value* output_value = &subgraph->values[output_id]; - status = xnn_subgraph_check_output_type_dense(xnn_node_type_tanh, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - enum xnn_compute_type compute_type = xnn_compute_type_invalid; - switch (output_value->datatype) { - case xnn_datatype_fp16: - compute_type = xnn_compute_type_fp16; - break; - case xnn_datatype_fp32: - compute_type = xnn_compute_type_fp32; - break; - case xnn_datatype_qint8: - compute_type = xnn_compute_type_qs8; - break; - case xnn_datatype_quint8: - compute_type = xnn_compute_type_qu8; - break; - default: - xnn_log_error( - "failed to define %s operator with output ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_tanh), output_id, - xnn_datatype_to_string(output_value->datatype), output_value->datatype); - return xnn_status_invalid_parameter; - } - - status = xnn_subgraph_check_datatype_matches(xnn_node_type_tanh, input_id, input_value, output_id, output_value); - if (status != xnn_status_success) { - return status; - } - - struct xnn_node* node = xnn_subgraph_new_node(subgraph); - if (node == NULL) { - return xnn_status_out_of_memory; - } - - node->type = xnn_node_type_tanh; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_tanh_operator; - node->reshape = reshape_tanh_operator; - node->setup = setup_tanh_operator; - - return xnn_status_success; -} diff --git a/src/subgraph/convert.c b/src/subgraph/unary.c similarity index 50% rename from src/subgraph/convert.c rename to src/subgraph/unary.c index b722dee5e4b8..1bf2cbd9562a 100644 --- a/src/subgraph/convert.c +++ b/src/subgraph/unary.c @@ -1,4 +1,4 @@ -// Copyright 2021 Google LLC +// Copyright 2020 Google LLC // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree.
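The rewritten convert path in the hunks below keeps dedicated handling only for the qd8/qp8 cases and routes everything else through the parameterized unary operator API. For orientation, a compact sketch of that operator lifecycle, using the create/reshape/setup entry points exactly as they are called in this file; xnn_initialize, xnn_run_operator, and xnn_delete_operator are the pre-existing XNNPACK operator APIs, and the batch and channel sizes are illustrative:

  // Minimal F32 -> F16 convert through the parameterized unary API,
  // mirroring the fallback path in create/reshape/setup_convert_operator.
  // Assumes xnn_initialize(NULL) has already succeeded.
  enum xnn_status convert_f32_to_f16(const float* input, void* output) {
    xnn_operator_t op = NULL;
    enum xnn_status status = xnn_create_unary_elementwise_nc(
        xnn_unary_convert, xnn_datatype_fp32, xnn_datatype_fp16,
        /*params=*/NULL, /*input_quantization=*/NULL,
        /*output_quantization=*/NULL, /*flags=*/0, &op);
    if (status == xnn_status_success) {
      // 1024 rows of 256 densely packed channels.
      status = xnn_reshape_unary_elementwise_nc(
          op, /*batch_size=*/1024, /*channels=*/256,
          /*input_stride=*/256, /*output_stride=*/256, /*threadpool=*/NULL);
    }
    if (status == xnn_status_success) {
      status = xnn_setup_unary_elementwise_nc(op, input, output);
    }
    if (status == xnn_status_success) {
      status = xnn_run_operator(op, /*threadpool=*/NULL);
    }
    if (op != NULL) {
      xnn_delete_operator(op);
    }
    return status;
  }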
@@ -13,6 +13,7 @@ #include "xnnpack/config.h" #include "xnnpack/internal.h" #include "xnnpack/log.h" +#include "xnnpack/microparams.h" #include "xnnpack/node-type.h" #include "xnnpack/operator-type.h" #include "xnnpack/operator.h" @@ -21,6 +22,7 @@ #include "xnnpack/subgraph.h" #include "pthreadpool.h" + static enum xnn_status create_convert_operator( const struct xnn_node* node, const struct xnn_value* values, @@ -41,16 +43,11 @@ static enum xnn_status create_convert_operator( const struct xnn_value* output_value = values + output_id; enum xnn_status status = xnn_status_uninitialized; - const enum xnn_datatype input_datatype = values[input_id].datatype; - const enum xnn_datatype output_datatype = values[output_id].datatype; + const enum xnn_datatype input_datatype = input_value->datatype; + const enum xnn_datatype output_datatype = output_value->datatype; switch (input_datatype) { case xnn_datatype_fp32: switch (output_datatype) { - case xnn_datatype_fp16: - status = xnn_create_convert_nc_f32_f16( - node->flags, - &opdata->operator_objects[0]); - break; case xnn_datatype_qdint8: status = xnn_create_convert_nc_f32_qd8( node->flags, @@ -60,22 +57,8 @@ static enum xnn_status create_convert_operator( status = xnn_create_convert_nc_f32_qp8(node->flags, &opdata->operator_objects[0]); break; - case xnn_datatype_qint8: - status = xnn_create_convert_nc_f32_qs8( - output_value->quantization.scale, - (int8_t) output_value->quantization.zero_point, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_quint8: - status = xnn_create_convert_nc_f32_qu8( - output_value->quantization.scale, - (uint8_t) output_value->quantization.zero_point, - node->flags, - &opdata->operator_objects[0]); - break; default: - XNN_UNREACHABLE; + break; } break; case xnn_datatype_fp16: @@ -85,68 +68,18 @@ static enum xnn_status create_convert_operator( node->flags, &opdata->operator_objects[0]); break; - case xnn_datatype_fp32: - status = xnn_create_convert_nc_f16_f32( - node->flags, - &opdata->operator_objects[0]); - break; - default: - XNN_UNREACHABLE; - } - break; - case xnn_datatype_qint8: - switch (output_datatype) { - case xnn_datatype_qint8: - status = xnn_create_convert_nc_qs8( - input_value->quantization.scale, - (int8_t) input_value->quantization.zero_point, - output_value->quantization.scale, - (int8_t) output_value->quantization.zero_point, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp16: - status = xnn_create_convert_nc_qs8_f16( - input_value->quantization.scale, - (int8_t) input_value->quantization.zero_point, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_convert_nc_qs8_f32( - input_value->quantization.scale, - (int8_t) input_value->quantization.zero_point, - node->flags, - &opdata->operator_objects[0]); - break; default: - XNN_UNREACHABLE; - } - break; - case xnn_datatype_quint8: - switch (output_datatype) { - case xnn_datatype_quint8: - status = xnn_create_convert_nc_qu8( - input_value->quantization.scale, - (uint8_t) input_value->quantization.zero_point, - output_value->quantization.scale, - (uint8_t) output_value->quantization.zero_point, - node->flags, - &opdata->operator_objects[0]); - break; - case xnn_datatype_fp32: - status = xnn_create_convert_nc_qu8_f32( - input_value->quantization.scale, - (uint8_t) input_value->quantization.zero_point, - node->flags, - &opdata->operator_objects[0]); break; - default: - XNN_UNREACHABLE; } break; default: - XNN_UNREACHABLE; + break; + } + if (status == 
xnn_status_uninitialized) { + status = xnn_create_unary_elementwise_nc(xnn_unary_convert, input_datatype, + output_datatype, NULL, NULL, NULL, + node->flags, + &opdata->operator_objects[0]); } return status; } @@ -167,13 +100,6 @@ static enum xnn_status reshape_convert_operator( enum xnn_status status = xnn_status_invalid_state; switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_convert_nc_f32_f16: - status = xnn_reshape_convert_nc_f32_f16( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; case xnn_operator_type_convert_nc_f16_qd8: { // Channel stride depends on number of non batch dims. const uint32_t output_id = opdata->outputs[0]; @@ -216,64 +142,13 @@ static enum xnn_status reshape_convert_operator( threadpool); break; } - case xnn_operator_type_convert_nc_f32_qs8: - status = xnn_reshape_convert_nc_f32_qs8( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - case xnn_operator_type_convert_nc_f32_qu8: - status = xnn_reshape_convert_nc_f32_qu8( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - case xnn_operator_type_convert_nc_f16_f32: - status = xnn_reshape_convert_nc_f16_f32( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - case xnn_operator_type_convert_nc_qs8: - status = xnn_reshape_convert_nc_qs8( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - case xnn_operator_type_convert_nc_qs8_f16: - status = xnn_reshape_convert_nc_qs8_f16( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - case xnn_operator_type_convert_nc_qs8_f32: - status = xnn_reshape_convert_nc_qs8_f32( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - case xnn_operator_type_convert_nc_qu8: - status = xnn_reshape_convert_nc_qu8( - opdata->operator_objects[0], - batch_size, - /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, - threadpool); - break; - case xnn_operator_type_convert_nc_qu8_f32: - status = xnn_reshape_convert_nc_qu8_f32( - opdata->operator_objects[0], - batch_size, + default: + status = xnn_reshape_unary_elementwise_nc( + opdata->operator_objects[0], + batch_size, /*channels=*/channel_dim, /*input_stride=*/channel_dim, /*output_stride=*/channel_dim, threadpool); break; - default: - XNN_UNREACHABLE; } if (status != xnn_status_success) { return status; @@ -304,11 +179,6 @@ static enum xnn_status setup_convert_operator( assert(output_data != NULL); switch (opdata->operator_objects[0]->type) { - case xnn_operator_type_convert_nc_f32_f16: - return xnn_setup_convert_nc_f32_f16( - opdata->operator_objects[0], - input_data, - output_data); case xnn_operator_type_convert_nc_f16_qd8: { void* quantization_params = output_value->quantization.dynamic_params; @@ -332,49 +202,111 @@ static enum xnn_status setup_convert_operator( case xnn_operator_type_convert_nc_f32_qp8: return 
xnn_setup_convert_nc_f32_qp8(opdata->operator_objects[0], input_data, output_data); - case xnn_operator_type_convert_nc_f32_qs8: - return xnn_setup_convert_nc_f32_qs8( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_convert_nc_f32_qu8: - return xnn_setup_convert_nc_f32_qu8( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_convert_nc_f16_f32: - return xnn_setup_convert_nc_f16_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_convert_nc_qs8: - return xnn_setup_convert_nc_qs8( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_convert_nc_qs8_f16: - return xnn_setup_convert_nc_qs8_f16( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_convert_nc_qs8_f32: - return xnn_setup_convert_nc_qs8_f32( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_convert_nc_qu8: - return xnn_setup_convert_nc_qu8( - opdata->operator_objects[0], - input_data, - output_data); - case xnn_operator_type_convert_nc_qu8_f32: - return xnn_setup_convert_nc_qu8_f32( - opdata->operator_objects[0], - input_data, - output_data); default: - XNN_UNREACHABLE; + return xnn_setup_unary_elementwise_nc(opdata->operator_objects[0], input_data, output_data); + } +} + +void xnn_init_convert_node( + struct xnn_node* node, + enum xnn_compute_type compute_type, + uint32_t input_id, + uint32_t output_id, + uint32_t flags) +{ + node->type = xnn_node_type_convert; + node->compute_type = compute_type; + node->num_inputs = 1; + node->inputs[0] = input_id; + node->num_outputs = 1; + node->outputs[0] = output_id; + node->flags = flags; + + node->create = create_convert_operator; + node->reshape = reshape_convert_operator; + node->setup = setup_convert_operator; +} + +static enum xnn_status create_unary_operator( + const struct xnn_node* node, + const struct xnn_value* values, + size_t num_values, + struct xnn_operator_data* opdata, + struct xnn_code_cache* code_cache, + xnn_weights_cache_t weights_cache) +{ + assert(node->num_inputs == 1); + assert(node->num_outputs == 1); + + const struct xnn_value* value_in = &values[node->inputs[0]]; + const struct xnn_value* value_out = &values[node->outputs[0]]; + + struct xnn_quantization_params in_quantization = { + .scale = value_in->quantization.scale, + .zero_point = value_in->quantization.zero_point, + }; + struct xnn_quantization_params out_quantization = { + .scale = value_out->quantization.scale, + .zero_point = value_out->quantization.zero_point, + }; + + return xnn_create_unary_elementwise_nc( + xnn_node_type_to_unary_operator(node->type), + value_in->datatype, + value_out->datatype, + &node->params.unary, + &in_quantization, + &out_quantization, + node->flags, + &opdata->operator_objects[0]); +} + +static enum xnn_status reshape_unary_operator( + struct xnn_operator_data* opdata, + struct xnn_value* values, + size_t num_values, + pthreadpool_t threadpool) +{ + xnn_operator_t op = opdata->operator_objects[0]; + const uint32_t input_id = opdata->inputs[0]; + assert(input_id < num_values); + const size_t batch_size = xnn_shape_multiply_non_channel_dims(&values[input_id].shape); + const size_t num_input_dims = values[input_id].shape.num_dims; + const size_t channel_dim = num_input_dims == 0 ? 
1 : values[input_id].shape.dim[num_input_dims - 1]; + const size_t old_workspace_size = opdata->workspace_size; + + enum xnn_status status = xnn_reshape_unary_elementwise_nc(op, batch_size, channel_dim, channel_dim, channel_dim, threadpool); + if (status != xnn_status_success) { + return status; } + return resize_unary_elementwise_output_tensor(opdata, values, num_values, old_workspace_size, threadpool); +} + +static enum xnn_status setup_unary_operator( + const struct xnn_operator_data* opdata, + const struct xnn_value* values, + size_t num_values, + pthreadpool_t threadpool) +{ + const uint32_t input_id = opdata->inputs[0]; + assert(input_id != XNN_INVALID_VALUE_ID); + assert(input_id < num_values); + + const uint32_t output_id = opdata->outputs[0]; + assert(output_id != XNN_INVALID_VALUE_ID); + assert(output_id < num_values); + + const struct xnn_value* input_value = values + input_id; + const void* input_data = input_value->data; + assert(input_data != NULL); + + const struct xnn_value* output_value = values + output_id; + void* output_data = output_value->data; + assert(output_data != NULL); + + xnn_operator_t op = opdata->operator_objects[0]; + + return xnn_setup_unary_elementwise_nc(op, input_data, output_data); } static inline enum xnn_compute_type validate_datatypes( @@ -384,6 +316,8 @@ static inline enum xnn_compute_type validate_datatypes( switch (input_datatype) { case xnn_datatype_fp32: switch (output_datatype) { + case xnn_datatype_fp32: + return xnn_compute_type_fp32; case xnn_datatype_fp16: return xnn_compute_type_fp32_to_fp16; case xnn_datatype_qdint8: @@ -400,10 +334,14 @@ static inline enum xnn_compute_type validate_datatypes( break; case xnn_datatype_fp16: switch (output_datatype) { - case (xnn_datatype_qdint8): - return xnn_compute_type_fp16_to_qd8; - case (xnn_datatype_fp32): + case xnn_datatype_fp32: return xnn_compute_type_fp16_to_fp32; + case xnn_datatype_fp16: + return xnn_compute_type_fp16; + case xnn_datatype_qint8: + return xnn_compute_type_fp16_to_qs8; + case xnn_datatype_qdint8: + return xnn_compute_type_fp16_to_qd8; default: break; } @@ -436,63 +374,46 @@ static inline enum xnn_compute_type validate_datatypes( return xnn_compute_type_invalid; } -void xnn_init_convert_node( - struct xnn_node* node, - enum xnn_compute_type compute_type, - uint32_t input_id, - uint32_t output_id, - uint32_t flags) -{ - node->type = xnn_node_type_convert; - node->compute_type = compute_type; - node->num_inputs = 1; - node->inputs[0] = input_id; - node->num_outputs = 1; - node->outputs[0] = output_id; - node->flags = flags; - - node->create = create_convert_operator; - node->reshape = reshape_convert_operator; - node->setup = setup_convert_operator; -} - -enum xnn_status xnn_define_convert( +enum xnn_status xnn_define_unary( xnn_subgraph_t subgraph, + enum xnn_unary_operator type, + const union xnn_unary_params* params, uint32_t input_id, uint32_t output_id, uint32_t flags) { + enum xnn_node_type node_type = xnn_unary_operator_to_node_type(type); + enum xnn_status status; - if ((status = xnn_subgraph_check_xnnpack_initialized(xnn_node_type_convert)) != xnn_status_success) { + if ((status = xnn_subgraph_check_xnnpack_initialized(node_type)) != xnn_status_success) { return status; } - if ((status = xnn_subgraph_check_input_node_id(xnn_node_type_convert, input_id, subgraph->num_values)) != - xnn_status_success) { + if ((status = xnn_subgraph_check_input_node_id(node_type, input_id, subgraph->num_values)) != xnn_status_success) { return status; } - const struct xnn_value* input_value 
= &subgraph->values[input_id]; - status = xnn_subgraph_check_input_type_dense(xnn_node_type_convert, input_id, input_value); - if (status != xnn_status_success) { + if ((status = xnn_subgraph_check_output_node_id(node_type, output_id, subgraph->num_values)) != xnn_status_success) { return status; } - switch (input_value->datatype) { - case xnn_datatype_fp16: - case xnn_datatype_fp32: - case xnn_datatype_qint8: - case xnn_datatype_quint8: - break; + switch (type) { + case xnn_unary_clamp: + case xnn_unary_leaky_relu: + case xnn_unary_elu: + if (!params) { + xnn_log_error( + "failed to define %s node with input ID #%" PRIu32 " and output ID #%" PRIu32 + ": missing required params", + xnn_node_type_to_string(node_type), input_id, output_id); + return xnn_status_invalid_parameter; + } default: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 ": unsupported Value datatype %s (%d)", - xnn_node_type_to_string(xnn_node_type_convert), input_id, - xnn_datatype_to_string(input_value->datatype), input_value->datatype); - return xnn_status_invalid_parameter; + break; } - status = xnn_subgraph_check_output_node_id(xnn_node_type_convert, output_id, subgraph->num_values); + const struct xnn_value* input_value = &subgraph->values[input_id]; + status = xnn_subgraph_check_input_type_dense(node_type, input_id, input_value); if (status != xnn_status_success) { return status; }
- bool pack_activation_for_qc4w = ( - (flags & XNN_FLAG_MAYBE_PACK_FOR_GEMM) && - xnn_init_qp8_f32_qc4w_gemm_config() != NULL - ); - bool pack_activation_for_qb4w = ( - (flags & XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM) && - xnn_init_qp8_f32_qb4w_gemm_config() != NULL - ); - if ((pack_activation_for_qb4w || pack_activation_for_qc4w) && - input_value->datatype == xnn_datatype_fp32 && - output_value->datatype == xnn_datatype_qdint8) { - xnn_log_debug("Coercing type of output ID #%" PRIu32 - " of %s operator from `%s` to `%s`.", - output_id, xnn_node_type_to_string(xnn_node_type_convert), - xnn_datatype_to_string(output_value->datatype), - xnn_datatype_to_string(xnn_datatype_qpint8)); - subgraph->values[output_id].datatype = xnn_datatype_qpint8; - } - enum xnn_compute_type compute_type = validate_datatypes(input_value->datatype, output_value->datatype); if (compute_type == xnn_compute_type_invalid) { xnn_log_error( "failed to define %s operator with input ID #%" PRIu32 " and output ID #%" PRIu32 - ": mismatching datatypes across input (%s) and output (%s)", + ": unsupported combination of input (%s) and output (%s) datatypes", - xnn_node_type_to_string(xnn_node_type_convert), input_id, output_id, + xnn_node_type_to_string(node_type), input_id, output_id, xnn_datatype_to_string(input_value->datatype), xnn_datatype_to_string(output_value->datatype)); return xnn_status_invalid_parameter; } - switch (compute_type) { - case xnn_compute_type_invalid: - xnn_log_error( - "failed to define %s operator with input ID #%" PRIu32 " and output ID #%" PRIu32 - ": mismatching datatypes across input (%s) and output (%s)", - xnn_node_type_to_string(xnn_node_type_convert), input_id, output_id, - xnn_datatype_to_string(input_value->datatype), - xnn_datatype_to_string(output_value->datatype)); - return xnn_status_invalid_parameter; - case xnn_compute_type_qs8: - case xnn_compute_type_qu8: - { - const float input_output_scale = input_value->quantization.scale / output_value->quantization.scale; - if (input_output_scale < 0x1.0p-8f || input_output_scale > 0x1.0p+7f) { - xnn_log_error( - "failed to define %s operator with %.7g input-to-output scale ratio (input #%"PRIu32" scale %.7g, output #%"PRIu32" scale %.7g): " - "scale ratio must be in [2**-8, 2**7] range", - xnn_node_type_to_string(xnn_node_type_convert), input_output_scale, - input_id, input_value->quantization.scale, output_id, output_value->quantization.scale); - return xnn_status_invalid_parameter; + if (type == xnn_unary_convert) { + // Some convert types are not elementwise ops; handle them here. + if (output_value->datatype == xnn_datatype_qdint8 || + // TODO(b/340399245) - Uncomment once we have full support for `qpint8`. + // output_value->datatype == xnn_datatype_qpint8 || + false) { + // Coerce the input from `xnn_datatype_qdint8` to `xnn_datatype_qpint8` if we + // know that we're converting for a GEMM and `qp8_f32_*` kernels are + // available. + // TODO(b/340399245) - Remove xnn_init_qp8_f32_qc4w_gemm_config check once we + // have full qp8 support. 
+ bool pack_activation_for_qc4w = ( + (flags & XNN_FLAG_MAYBE_PACK_FOR_GEMM) && + xnn_init_qp8_f32_qc4w_gemm_config() != NULL + ); + bool pack_activation_for_qb4w = ( + (flags & XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM) && + xnn_init_qp8_f32_qb4w_gemm_config() != NULL + ); + if ((pack_activation_for_qb4w || pack_activation_for_qc4w) && + input_value->datatype == xnn_datatype_fp32 && + output_value->datatype == xnn_datatype_qdint8) { + xnn_log_debug("Coercing type of output ID #%" PRIu32 + " of %s operator from `%s` to `%s`.", + output_id, xnn_node_type_to_string(xnn_node_type_convert), + xnn_datatype_to_string(output_value->datatype), + xnn_datatype_to_string(xnn_datatype_qpint8)); + subgraph->values[output_id].datatype = xnn_datatype_qpint8; } - break; + + struct xnn_node* node = xnn_subgraph_new_node(subgraph); + if (node == NULL) { + return xnn_status_out_of_memory; + } + + xnn_init_convert_node(node, compute_type, input_id, output_id, flags); + return xnn_status_success; } - default: - break; } struct xnn_node* node = xnn_subgraph_new_node(subgraph); @@ -587,6 +480,26 @@ enum xnn_status xnn_define_convert( return xnn_status_out_of_memory; } - xnn_init_convert_node(node, compute_type, input_id, output_id, flags); + node->type = node_type; + node->compute_type = compute_type; + node->num_inputs = 1; + node->inputs[0] = input_id; + node->num_outputs = 1; + node->outputs[0] = output_id; + node->flags = flags; + if (params) { + node->params.unary = *params; + } + + if (type == xnn_unary_clamp) { + assert(params); + node->activation.output_min = params->clamp.min; + node->activation.output_max = params->clamp.max; + } + + node->create = create_unary_operator; + node->reshape = reshape_unary_operator; + node->setup = setup_unary_operator; + return xnn_status_success; } diff --git a/src/u8-vclamp/u8-vclamp.h b/src/u8-vclamp/u8-vclamp.h index 90b0cf43be3f..e957ea790848 100644 --- a/src/u8-vclamp/u8-vclamp.h +++ b/src/u8-vclamp/u8-vclamp.h @@ -17,30 +17,30 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_u8_vclamp_ukernel__neon_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_arm_neon, xnn_u8_vclamp_ukernel__neon_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__sse2_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_u8_vclamp_ukernel__avx2_u128, 128, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__sse2_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx2, xnn_u8_vclamp_ukernel__avx2_u128, 128, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #if XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_u8_vclamp_ukernel__avx512skx_u256, 256, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_x86_avx512skx, xnn_u8_vclamp_ukernel__avx512skx_u256, 256, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) #endif // XNN_ENABLE_AVX512SKX && (XNN_ARCH_X86 || XNN_ARCH_X86_64) #if 
XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u4v, 4, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) -XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u8v, 8, true, int8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u1v, 1, true, int8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u2v, 2, true, int8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u4v, 4, true, int8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) +XNN_UKERNEL_WITH_PARAMS(xnn_arch_riscv_vector, xnn_u8_vclamp_ukernel__rvv_u8v, 8, true, int8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) #endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__wasmsimd_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__wasmsimd_u64, 64, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__scalar_u4, 4, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_u8_minmax_scalar_params) +XNN_UKERNEL_WITH_PARAMS(0, xnn_u8_vclamp_ukernel__scalar_u4, 4, false, uint8_t, struct xnn_u8_minmax_params, xnn_init_qu8_clamp_scalar_params) #ifdef XNN_DEFINED_UKERNEL_WITH_PARAMS #undef XNN_DEFINED_UKERNEL_WITH_PARAMS diff --git a/src/xnnpack/buffer.h b/src/xnnpack/buffer.h index d32deb65b790..7b8b21676908 100644 --- a/src/xnnpack/buffer.h +++ b/src/xnnpack/buffer.h @@ -8,10 +8,65 @@ #include #include +#include +#include #include +#include + +#include "xnnpack.h" +#include "xnnpack/common.h" +#include "xnnpack/math.h" namespace xnnpack { +template <typename T> +xnn_datatype datatype_of() { + if (std::is_same<T, uint8_t>::value) { + return xnn_datatype_quint8; + } else if (std::is_same<T, int8_t>::value) { + return xnn_datatype_qint8; + } else if (std::is_same<T, int16_t>::value) { + // TODO: We don't have this type... + return xnn_datatype_qint8; + } else if (std::is_same<T, xnn_float16>::value) { + return xnn_datatype_fp16; + } else if (std::is_same<T, float>::value) { + return xnn_datatype_fp32; + } else if (std::is_same<T, int32_t>::value) { + return xnn_datatype_int32; + } else if (std::is_same<T, uint32_t>::value) { + // TODO: We don't have this type... 
+ return xnn_datatype_quint8; + } else { + XNN_UNREACHABLE; + } +} + +inline bool is_quantized(xnn_datatype datatype) { + switch (datatype) { + case xnn_datatype_qint8: + case xnn_datatype_quint8: + return true; + default: + return false; + } +} + +inline size_t datatype_size(xnn_datatype datatype) { + switch (datatype) { + case xnn_datatype_qint8: + case xnn_datatype_quint8: + return 1; + case xnn_datatype_fp16: + return 2; + case xnn_datatype_fp32: + case xnn_datatype_int32: + return 4; + default: + XNN_UNREACHABLE; + } +} + // This is a container similar to std::vector, but it leaves the memory // uninitialized, supports alignment. // TODO: It would be good if this also managed padding in a way that allowed @@ -65,7 +120,7 @@ class Buffer { using const_iterator = const T*; Buffer() : data_(nullptr), size_(0) {} - Buffer(size_t size) + explicit Buffer(size_t size) : data_(reinterpret_cast<T*>(allocate(size * sizeof(T)))), size_(size) {} Buffer(size_t size, T value) : Buffer(size) { std::fill(begin(), end(), value); diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h index 1d946242f373..0c1392e23ef1 100644 --- a/src/xnnpack/compute.h +++ b/src/xnnpack/compute.h @@ -1268,37 +1268,7 @@ struct univector_strided_context { void* y; size_t y_stride; xnn_vunary_ukernel_fn ukernel; - union { - struct xnn_f16_default_params f16_default; - struct xnn_f16_hswish_params f16_hswish; - struct xnn_f16_lrelu_params f16_lrelu; - union xnn_f16_minmax_params f16_minmax; - struct xnn_f16_default_params f16_neg; - struct xnn_f16_sigmoid_params f16_sigmoid; - union xnn_f16_tanh_params f16_tanh; - struct xnn_f32_default_params f32_default; - struct xnn_f32_elu_params f32_elu; - struct xnn_f32_hswish_params f32_hswish; - struct xnn_f32_lrelu_params f32_lrelu; - union xnn_f32_minmax_params f32_minmax; - struct xnn_f32_qs8_cvt_params f32_qs8_cvt; - struct xnn_f32_qu8_cvt_params f32_qu8_cvt; - struct xnn_f32_rnd_params f32_rnd; - struct xnn_f32_sigmoid_params f32_sigmoid; - struct xnn_f32_sqrt_params f32_sqrt; - union xnn_f32_tanh_params f32_tanh; - struct xnn_qs8_cvt_params qs8_cvt; - struct xnn_qs16_qs8_cvt_params qs16_qs8_cvt; - struct xnn_qs8_f32_cvt_params qs8_f32_cvt; - union xnn_qs8_hswish_params qs8_hswish; - struct xnn_qs8_lrelu_params qs8_lrelu; - struct xnn_qu8_cvt_params qu8_cvt; - struct xnn_qu8_f32_cvt_params qu8_f32_cvt; - union xnn_qu8_hswish_params qu8_hswish; - struct xnn_qu8_lrelu_params qu8_lrelu; - struct xnn_s8_minmax_params s8_minmax; - struct xnn_u8_minmax_params u8_minmax; - } params; + union xnn_unary_uparams params; }; #ifndef __cplusplus @@ -1314,34 +1284,7 @@ struct univector_contiguous_context { uint16_t log2_xsize; uint16_t log2_ysize; xnn_vunary_ukernel_fn ukernel; - union { - struct xnn_f16_default_params f16_default; - struct xnn_f16_hswish_params f16_hswish; - struct xnn_f16_lrelu_params f16_lrelu; - union xnn_f16_minmax_params f16_minmax; - struct xnn_f16_sigmoid_params f16_sigmoid; - struct xnn_f32_default_params f32_default; - struct xnn_f32_elu_params f32_elu; - struct xnn_f32_hswish_params f32_hswish; - struct xnn_f32_lrelu_params f32_lrelu; - union xnn_f32_minmax_params f32_minmax; - struct xnn_f32_qs8_cvt_params f32_qs8_cvt; - struct xnn_f32_qu8_cvt_params f32_qu8_cvt; - struct xnn_f32_rnd_params f32_rnd; - struct xnn_f32_sigmoid_params f32_sigmoid; - struct xnn_f32_sqrt_params f32_sqrt; - struct xnn_qs8_cvt_params qs8_cvt; - struct xnn_qs16_qs8_cvt_params qs16_qs8_cvt; - struct xnn_qs8_f32_cvt_params qs8_f32_cvt; - union xnn_qs8_hswish_params qs8_hswish; - struct 
xnn_qs8_lrelu_params qs8_lrelu; - struct xnn_qu8_cvt_params qu8_cvt; - struct xnn_qu8_f32_cvt_params qu8_f32_cvt; - union xnn_qu8_hswish_params qu8_hswish; - struct xnn_qu8_lrelu_params qu8_lrelu; - struct xnn_s8_minmax_params s8_minmax; - struct xnn_u8_minmax_params u8_minmax; - } params; + union xnn_unary_uparams params; }; #ifndef __cplusplus @@ -1483,7 +1426,7 @@ struct f16_qd8_convert_context { struct xnn_qd8_quantization_params* quantization_params; xnn_reduce_ukernel_fn rminmax_ukernel; xnn_vunary_ukernel_fn convert_ukernel; - xnn_init_f16_qs8_cvt_params_fn init_params; + xnn_init_unary_uparams_fn init_params; union { struct xnn_f16_default_params f16_default; } params; @@ -1499,7 +1442,7 @@ struct f32_qd8_convert_context { struct xnn_qd8_quantization_params* quantization_params; xnn_reduce_ukernel_fn rminmax_ukernel; xnn_vunary_ukernel_fn convert_ukernel; - xnn_init_f32_qs8_cvt_params_fn init_params; + xnn_init_unary_uparams_fn init_params; union { struct xnn_f32_default_params f32_default; } params; @@ -1724,10 +1667,7 @@ struct scaled_dot_product_attention_context { struct xnn_f16_default_params f16; struct xnn_f32_default_params f32; } rmax_params; - union { - union xnn_f16_tanh_params f16; - union xnn_f32_tanh_params f32; - } tanh_params; + union xnn_unary_uparams tanh_params; // Attention uses a single workspace for multiple intermediates: // - scaled query diff --git a/src/xnnpack/config-types.h b/src/xnnpack/config-types.h index c60c268c6600..489c8ecb07f3 100644 --- a/src/xnnpack/config-types.h +++ b/src/xnnpack/config-types.h @@ -57,46 +57,7 @@ struct xnn_binary_elementwise_config { struct xnn_unary_elementwise_config { xnn_vunary_ukernel_fn ukernel; - union { - xnn_init_s32_f32_cvt_params_fn s32_f32_cvt; - xnn_init_u32_f32_cvt_params_fn u32_f32_cvt; - xnn_init_f16_qs8_cvt_params_fn f16_qs8_cvt; - xnn_init_f16_default_params_fn f16_default; - xnn_init_f16_elu_params_fn f16_elu; - xnn_init_f16_hswish_params_fn f16_hswish; - xnn_init_f16_lrelu_params_fn f16_lrelu; - xnn_init_f16_minmax_params_fn f16_minmax; - xnn_init_f16_rsqrt_params_fn f16_rsqrt; - xnn_init_f16_sigmoid_params_fn f16_sigmoid; - xnn_init_f16_sqrt_params_fn f16_sqrt; - xnn_init_f16_tanh_params_fn f16_tanh; - xnn_init_f32_default_params_fn f32_default; - xnn_init_f32_elu_params_fn f32_elu; - xnn_init_f32_exp_params_fn f32_exp; - xnn_init_f32_hswish_params_fn f32_hswish; - xnn_init_f32_log_params_fn f32_log; - xnn_init_f32_lrelu_params_fn f32_lrelu; - xnn_init_f32_minmax_params_fn f32_minmax; - xnn_init_f32_qs8_cvt_params_fn f32_qs8_cvt; - xnn_init_f32_qu8_cvt_params_fn f32_qu8_cvt; - xnn_init_f32_rnd_params_fn f32_rnd; - xnn_init_f32_rsqrt_params_fn f32_rsqrt; - xnn_init_f32_sigmoid_params_fn f32_sigmoid; - xnn_init_f32_sqrt_params_fn f32_sqrt; - xnn_init_f32_tanh_params_fn f32_tanh; - xnn_init_qs8_cvt_params_fn qs8_cvt; - xnn_init_qs8_f16_cvt_params_fn qs8_f16_cvt; - xnn_init_qs8_f32_cvt_params_fn qs8_f32_cvt; - xnn_init_qs8_hswish_params_fn qs8_hswish; - xnn_init_qs8_lrelu_params_fn qs8_lrelu; - xnn_init_qs16_qs8_cvt_params_fn qs16_qs8_cvt; - xnn_init_qu8_cvt_params_fn qu8_cvt; - xnn_init_qu8_f32_cvt_params_fn qu8_f32_cvt; - xnn_init_qu8_hswish_params_fn qu8_hswish; - xnn_init_qu8_lrelu_params_fn qu8_lrelu; - xnn_init_s8_minmax_params_fn s8_minmax; - xnn_init_u8_minmax_params_fn u8_minmax; - } init; + xnn_init_unary_uparams_fn init; }; struct xnn_reduce_config { @@ -105,7 +66,6 @@ struct xnn_reduce_config { union { xnn_init_qs8_reduce_minmax_params_fn qs8_reduce; xnn_init_qu8_reduce_minmax_params_fn 
qu8_reduce; - xnn_init_f32_qs8_cvt_params_fn f32_qs8_cvt; xnn_init_f16_f32acc_scale_params_fn f16_f32acc_scale; xnn_init_f16_default_params_fn f16_default; xnn_init_f32_default_params_fn f32_default; diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index bd1a2108b70c..25e95268e1f1 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -1644,7 +1644,7 @@ typedef void (*xnn_vunary_ukernel_fn)( size_t batch, const void* input, void* output, - const void* params); + const union xnn_unary_uparams* params); // VABS: Vector ABSolute value elementwise @@ -2384,15 +2384,11 @@ typedef size_t (*xnn_init_binary_params_fn)( const struct xnn_quantization_params* b_quantization, const struct xnn_quantization_params* output_quantization); -typedef size_t (*xnn_init_f16_qs8_cvt_params_fn)( - struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 scale, - int8_t output_zero_point); - -typedef size_t (*xnn_init_f32_qs8_cvt_params_fn)( - struct xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float scale, - int8_t output_zero_point); +typedef size_t (*xnn_init_unary_uparams_fn)( + union xnn_unary_uparams* microparams, + const union xnn_unary_params* op_params, + const struct xnn_quantization_params* input_quantization, + const struct xnn_quantization_params* output_quantization); typedef size_t (*xnn_init_qs8_reduce_minmax_params_fn)( struct xnn_qs8_reduce_minmax_params params[XNN_MIN_ELEMENTS(1)], @@ -2408,51 +2404,6 @@ typedef size_t (*xnn_init_qu8_reduce_minmax_params_fn)( uint8_t input_zero_point, uint8_t output_zero_point); -typedef size_t (*xnn_init_f32_qu8_cvt_params_fn)( - struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float scale, - uint8_t output_zero_point); - -typedef size_t (*xnn_init_s32_f32_cvt_params_fn)( - struct xnn_s32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], - int32_t zero_point); - -typedef size_t (*xnn_init_u32_f32_cvt_params_fn)( - struct xnn_u32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], - int32_t zero_point); - -typedef size_t (*xnn_init_qs8_cvt_params_fn)( - struct xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float input_output_scale, - int8_t input_zero_point, - int8_t output_zero_point); - -typedef size_t (*xnn_init_qs8_f16_cvt_params_fn)( - struct xnn_qs8_f16_cvt_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 scale, - int8_t zero_point); - -typedef size_t (*xnn_init_qs8_f32_cvt_params_fn)( - struct xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], - float scale, - int8_t zero_point); - -typedef size_t (*xnn_init_qs16_qs8_cvt_params_fn)( - struct xnn_qs16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float input_output_scale, - int8_t zero_point); - -typedef size_t (*xnn_init_qu8_cvt_params_fn)( - struct xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float input_output_scale, - uint8_t input_zero_point, - uint8_t output_zero_point); - -typedef size_t (*xnn_init_qu8_f32_cvt_params_fn)( - struct xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], - float scale, - uint8_t zero_point); - typedef size_t (*xnn_init_qs8_qc8w_conv_minmax_params_fn)( union xnn_qs8_qc8w_conv_minmax_params params[XNN_MIN_ELEMENTS(1)], int8_t output_zero_point, @@ -2544,69 +2495,6 @@ typedef size_t (*xnn_init_f32_expminus_params_fn)( typedef size_t (*xnn_init_s32_default_params_fn)( struct xnn_s32_default_params params[XNN_MIN_ELEMENTS(1)]); -typedef size_t (*xnn_init_f16_elu_params_fn)( - struct xnn_f16_elu_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 prescale, - xnn_float16 alpha, - xnn_float16 beta); - -typedef 
size_t (*xnn_init_f32_exp_params_fn)( - struct xnn_f32_default_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f32_elu_params_fn)( - struct xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)], - float prescale, - float alpha, - float beta); - -typedef size_t (*xnn_init_f16_hswish_params_fn)( - struct xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f32_hswish_params_fn)( - struct xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_qs8_hswish_params_fn)( - union xnn_qs8_hswish_params params[XNN_MIN_ELEMENTS(1)], - int16_t input_zero_point, - int16_t output_zero_point, - float input_scale, - float output_scale); - -typedef size_t (*xnn_init_qu8_hswish_params_fn)( - union xnn_qu8_hswish_params params[XNN_MIN_ELEMENTS(1)], - int16_t input_zero_point, - int16_t output_zero_point, - float input_scale, - float output_scale); - -typedef size_t (*xnn_init_f16_lrelu_params_fn)( - struct xnn_f16_lrelu_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 slope); - -typedef size_t (*xnn_init_f32_lrelu_params_fn)( - struct xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)], - float slope); - -typedef size_t (*xnn_init_f32_log_params_fn)( - struct xnn_f32_default_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f32_relu_params_fn)( - struct xnn_f32_relu_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_qs8_lrelu_params_fn)( - struct xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)], - float positive_slope, - float negative_slope, - int8_t input_zero_point, - int8_t output_zero_point); - -typedef size_t (*xnn_init_qu8_lrelu_params_fn)( - struct xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)], - float positive_slope, - float negative_slope, - uint8_t input_zero_point, - uint8_t output_zero_point); - typedef size_t (*xnn_init_bf16_minmax_params_fn)( struct xnn_bf16_minmax_params params[XNN_MIN_ELEMENTS(1)], xnn_bfloat16 min, @@ -2658,12 +2546,6 @@ typedef size_t (*xnn_init_u8_minmax_params_fn)( uint8_t min, uint8_t max); -typedef size_t (*xnn_init_f16_rnd_params_fn)( - struct xnn_f16_rnd_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f32_rnd_params_fn)( - struct xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)]); - typedef size_t (*xnn_init_f16_scale_params_fn)( struct xnn_f16_scale_params params[XNN_MIN_ELEMENTS(1)], xnn_float16 scale); @@ -2696,30 +2578,6 @@ typedef void (*xnn_update_f32_scaleminmax_params_fn)( struct xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)], float scale); -typedef size_t (*xnn_init_f16_sigmoid_params_fn)( - struct xnn_f16_sigmoid_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f32_sigmoid_params_fn)( - struct xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f16_sqrt_params_fn)( - struct xnn_f16_sqrt_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f32_sqrt_params_fn)( - struct xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f16_rsqrt_params_fn)( - struct xnn_f16_rsqrt_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f32_rsqrt_params_fn)( - struct xnn_f32_rsqrt_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f16_tanh_params_fn)( - union xnn_f16_tanh_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f32_tanh_params_fn)( - union xnn_f32_tanh_params params[XNN_MIN_ELEMENTS(1)]); - typedef void (*xnn_init_scale_params_fn)( size_t channels, size_t channels_tile, diff --git a/src/xnnpack/microparams-init.h 
b/src/xnnpack/microparams-init.h index 9da16a7375ef..5dcb7c28cde1 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -329,77 +329,63 @@ XNN_INTERNAL size_t xnn_init_f32_qb4w_minmax_scalar_params( uint8_t kernel_zero_point, size_t blocksize); -#define DECLARE_INIT_QS8_HSWISH_PARAMS_FUNCTION(fn_name) \ +#define DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(fn_name) \ XNN_INTERNAL size_t fn_name( \ - union xnn_qs8_hswish_params params[XNN_MIN_ELEMENTS(1)], \ - int16_t input_zero_point, \ - int16_t output_zero_point, \ - float input_scale, \ - float output_scale); + union xnn_unary_uparams* microparams, \ + const union xnn_unary_params* op_params, \ + const struct xnn_quantization_params* input_quantization, \ + const struct xnn_quantization_params* output_quantization); -DECLARE_INIT_QS8_HSWISH_PARAMS_FUNCTION(xnn_init_qs8_hswish_scalar_params) +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qs8_hswish_scalar_params) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_QS8_HSWISH_PARAMS_FUNCTION(xnn_init_qs8_hswish_sse2_params) + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qs8_hswish_sse2_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#define DECLARE_INIT_QU8_HSWISH_PARAMS_FUNCTION(fn_name) \ - XNN_INTERNAL size_t fn_name( \ - union xnn_qu8_hswish_params params[XNN_MIN_ELEMENTS(1)], \ - int16_t input_zero_point, \ - int16_t output_zero_point, \ - float input_scale, \ - float output_scale); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qu8_hswish_scalar_params) +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qu8_hswish_sse2_params) +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -DECLARE_INIT_QU8_HSWISH_PARAMS_FUNCTION(xnn_init_qu8_hswish_scalar_params) #if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_QU8_HSWISH_PARAMS_FUNCTION(xnn_init_qu8_hswish_sse2_params) + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f16_tanh_avx_expm1minus_rr1_p3h2_params) + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f16_tanh_avx_polynomial_p19h9t2_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -XNN_INTERNAL size_t xnn_init_f16_elu_scalar_params( - struct xnn_f16_elu_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 prescale, - xnn_float16 alpha, - xnn_float16 beta); - -XNN_INTERNAL size_t xnn_init_f32_elu_scalar_params( - struct xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)], - float prescale, - float alpha, - float beta); - -XNN_INTERNAL size_t xnn_init_f16_lrelu_scalar_params( - struct xnn_f16_lrelu_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 slope); - -XNN_INTERNAL size_t xnn_init_f32_lrelu_scalar_params( - struct xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)], - float slope); - -XNN_INTERNAL size_t xnn_init_qs8_lrelu_scalar_params( - struct xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)], - float positive_scale, - float negative_scale, - int8_t input_zero_point, - int8_t output_zero_point); - -XNN_INTERNAL size_t xnn_init_qu8_lrelu_scalar_params( - struct xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)], - float positive_scale, - float negative_scale, - uint8_t input_zero_point, - uint8_t output_zero_point); - -XNN_INTERNAL size_t xnn_init_f16_minmax_binary_params( - union xnn_f16_minmax_params uparams[XNN_MIN_ELEMENTS(1)], - const struct xnn_quantization_params* a_quantization, - const struct xnn_quantization_params* b_quantization, - const struct xnn_quantization_params* output_quantization); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_tanh_scalar_expm1minus_rr1_lut8_p4h3_params) 
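Every unary microparams initializer now shares the four-argument signature this macro declares, which is what lets struct xnn_unary_elementwise_config store a single xnn_init_unary_uparams_fn for any operator and datatype. A hedged call-site sketch, assuming identity quantization and using the scalar clamp initializer declared just below:

    union xnn_unary_uparams uparams;
    const union xnn_unary_params op_params = {.clamp = {.min = -1.0f, .max = 1.0f}};
    const struct xnn_quantization_params in_q = {.zero_point = 0, .scale = 1.0f};
    const struct xnn_quantization_params out_q = {.zero_point = 0, .scale = 1.0f};
    xnn_init_f32_clamp_scalar_params(&uparams, &op_params, &in_q, &out_q);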
+DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_tanh_scalar_expm1minus_rr1_p6h5_params) +#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_tanh_wasmsimd_expm1minus_rr1_lut8_p4h3_abs_params) + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_tanh_wasmsimd_expm1minus_rr1_p6h5_abs_params) + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_tanh_wasmsimd_expm1minus_rr1_lut8_p4h3_nabs_params) + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_tanh_wasmsimd_expm1minus_rr1_p6h5_nabs_params) +#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_ARM || XNN_ARCH_ARM64 + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_tanh_neon_expm1minus_rr1_lut8_p4h3_params) + DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_tanh_neon_expm1minus_rr1_p6h5_params) +#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_INTERNAL size_t xnn_init_f32_minmax_binary_params( - union xnn_f32_minmax_params uparams[XNN_MIN_ELEMENTS(1)], - const struct xnn_quantization_params* a_quantization, - const struct xnn_quantization_params* b_quantization, - const struct xnn_quantization_params* output_quantization); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f16_elu_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_elu_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f16_lrelu_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_lrelu_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qs8_lrelu_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qu8_lrelu_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f16_clamp_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_clamp_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qs8_clamp_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qu8_clamp_scalar_params); + +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f16_qs8_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_qs8_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_f32_qu8_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qs8_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qs16_qs8_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qs8_f16_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qs8_f32_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qu8_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_qu8_f32_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_s32_f32_cvt_scalar_params); +DECLARE_INIT_UNARY_MICROPARAMS_FUNCTION(xnn_init_u32_f32_cvt_scalar_params); XNN_INTERNAL size_t xnn_init_qs8_add_minmax_scalar_params( struct xnn_qs8_add_minmax_params uparams[XNN_MIN_ELEMENTS(1)], @@ -439,61 +425,6 @@ DECLARE_INIT_QS8_MUL_MINMAX_PARAMS_FUNCTION(xnn_init_qs8_mul_minmax_scalar_param #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_INTERNAL size_t xnn_init_f16_qs8_cvt_scalar_params( - struct xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 scale, - int8_t zero_point); - -XNN_INTERNAL size_t xnn_init_f32_qs8_cvt_scalar_params( - struct xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float scale, - int8_t zero_point); - -XNN_INTERNAL size_t xnn_init_f32_qu8_cvt_scalar_params( - struct xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float scale, - uint8_t zero_point); - -XNN_INTERNAL size_t 
xnn_init_s32_f32_cvt_scalar_params( - struct xnn_s32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], - int32_t zero_point); - -XNN_INTERNAL size_t xnn_init_u32_f32_cvt_scalar_params( - struct xnn_u32_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], - int32_t zero_point); - -XNN_INTERNAL size_t xnn_init_qs8_cvt_scalar_params( - struct xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float input_output_scale, - int8_t input_zero_point, - int8_t output_zero_point); - -XNN_INTERNAL size_t xnn_init_qs16_qs8_cvt_scalar_params( - struct xnn_qs16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float input_output_scale, - int8_t output_zero_point); - -XNN_INTERNAL size_t xnn_init_qs8_f16_cvt_scalar_params( - struct xnn_qs8_f16_cvt_params params[XNN_MIN_ELEMENTS(1)], - xnn_float16 scale, - int8_t zero_point); - -XNN_INTERNAL size_t xnn_init_qs8_f32_cvt_scalar_params( - struct xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], - float scale, - int8_t zero_point); - -XNN_INTERNAL size_t xnn_init_qu8_cvt_scalar_params( - struct xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)], - float input_output_scale, - uint8_t input_zero_point, - uint8_t output_zero_point); - -XNN_INTERNAL size_t xnn_init_qu8_f32_cvt_scalar_params( - struct xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)], - float scale, - uint8_t zero_point); - #ifdef __cplusplus } // extern "C" #endif diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index 3c48b858a71f..1fafe4fc7152 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -837,6 +837,34 @@ struct xnn_x32_packb_params { char _; // Dummy member variable to comply with the C standard }; +union xnn_unary_uparams { + struct xnn_f32_qs8_cvt_params f32_qs8_cvt; + struct xnn_f32_qu8_cvt_params f32_qu8_cvt; + struct xnn_f16_qs8_cvt_params f16_qs8_cvt; + struct xnn_qs8_f32_cvt_params qs8_f32_cvt; + struct xnn_qu8_f32_cvt_params qu8_f32_cvt; + struct xnn_qs8_f16_cvt_params qs8_f16_cvt; + struct xnn_qs16_qs8_cvt_params qs16_qs8_cvt; + struct xnn_s32_f32_cvt_params s32_f32_cvt; + struct xnn_u32_f32_cvt_params u32_f32_cvt; + struct xnn_qs8_cvt_params qs8_cvt; + struct xnn_qu8_cvt_params qu8_cvt; + struct xnn_f16_elu_params f16_elu; + struct xnn_f32_elu_params f32_elu; + union xnn_f16_tanh_params f16_tanh; + union xnn_f32_tanh_params f32_tanh; + union xnn_qs8_hswish_params qs8_hswish; + union xnn_qu8_hswish_params qu8_hswish; + struct xnn_f16_lrelu_params f16_lrelu; + struct xnn_f32_lrelu_params f32_lrelu; + struct xnn_qs8_lrelu_params qs8_lrelu; + struct xnn_qu8_lrelu_params qu8_lrelu; + union xnn_f32_minmax_params f32_minmax; + union xnn_f16_minmax_params f16_minmax; + struct xnn_s8_minmax_params s8_minmax; + struct xnn_u8_minmax_params u8_minmax; +}; + struct subconvolution_params { void* weights; size_t w_stride; diff --git a/src/xnnpack/operator-type-defs.h b/src/xnnpack/operator-type-defs.h index 91e35150ba51..9c30c3390007 100644 --- a/src/xnnpack/operator-type-defs.h +++ b/src/xnnpack/operator-type-defs.h @@ -9,42 +9,27 @@ #endif XNN_ENUM_ITEM_0(xnn_operator_type_invalid, "Invalid") -XNN_ENUM_ITEM(xnn_operator_type_abs_nc_f16, "Abs (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_abs_nc_f32, "Abs (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_abs, "Abs (NC)") XNN_ENUM_ITEM(xnn_operator_type_add, "Add (ND)") XNN_ENUM_ITEM(xnn_operator_type_argmax_pooling_nhwc_f32, "ArgMax Pooling (NHWC, F32)") XNN_ENUM_ITEM(xnn_operator_type_average_pooling_nhwc_f16, "Average Pooling (NHWC, F16)") XNN_ENUM_ITEM(xnn_operator_type_average_pooling_nhwc_f32, "Average Pooling (NHWC, F32)") 
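The xnn_unary_uparams union above is the type-erased payload those initializers fill in and that every vunary microkernel now receives; the xnn_vunary_ukernel_fn signature earlier in this patch takes a const union xnn_unary_uparams*. A dispatch sketch under stated assumptions: config points at a populated struct xnn_unary_elementwise_config, the batch argument is in bytes as for existing vunary kernels, and NULL op/quantization params are acceptable for a non-quantized kernel, mirroring the NULLs that create_convert_operator passes above:

    /* Assumed: `config`, `batch_size`, `input`, and `output` are provided by the caller. */
    union xnn_unary_uparams uparams;
    if (config->init != NULL) {
      config->init(&uparams, /*op_params=*/NULL,
                   /*input_quantization=*/NULL, /*output_quantization=*/NULL);
    }
    config->ukernel(batch_size * sizeof(float), input, output, &uparams);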
XNN_ENUM_ITEM(xnn_operator_type_average_pooling_nhwc_qu8, "Average Pooling (NHWC, QU8)") -XNN_ENUM_ITEM(xnn_operator_type_bankers_rounding_nc_f16, "Bankers Rounding (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_bankers_rounding_nc_f32, "Bankers Rounding (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_bankers_rounding, "Bankers Rounding (NC)") XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_f16, "Batch Matrix Multiply (NC, F16)") XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_f32, "Batch Matrix Multiply (NC, F32)") XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_qd8_f32_qc8w, "Batch Matrix Multiply (NC, QD8, F32, QC8W)") -XNN_ENUM_ITEM(xnn_operator_type_ceiling_nc_f16, "Ceiling (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_ceiling_nc_f32, "Ceiling (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_ceiling, "Ceiling (NC)") XNN_ENUM_ITEM(xnn_operator_type_channel_shuffle_nc_x8, "Channel Shuffle (NC, X8)") XNN_ENUM_ITEM(xnn_operator_type_channel_shuffle_nc_x32, "Channel Shuffle (NC, X32)") -XNN_ENUM_ITEM(xnn_operator_type_clamp_nc_f16, "Clamp (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_clamp_nc_f32, "Clamp (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_clamp_nc_s8, "Clamp (NC, S8)") -XNN_ENUM_ITEM(xnn_operator_type_clamp_nc_u8, "Clamp (NC, U8)") +XNN_ENUM_ITEM(xnn_operator_type_clamp, "Clamp (NC)") XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x8, "Constant Pad (ND, X8)") XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x16, "Constant Pad (ND, X16)") XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x32, "Constant Pad (ND, X32)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f16_f32, "Convert (NC, F16, F32)") +XNN_ENUM_ITEM(xnn_operator_type_convert, "Convert") XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f16_qd8, "Convert (NC, F16, QD8)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_f16, "Convert (NC, F32, F16)") XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_qd8, "Convert (NC, F32, QD8)") XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_qp8, "Convert (NC, F32, QP8)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_qs8, "Convert (NC, F32, QS8)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_f32_qu8, "Convert (NC, F32, QU8)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qs8, "Convert (NC, QS8)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qs8_f16, "Convert (NC, QS8, F16)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qs8_f32, "Convert (NC, QS8, F32)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qs16_qs8, "Convert (NC, QS16, QS8)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qu8, "Convert (NC, QU8)") -XNN_ENUM_ITEM(xnn_operator_type_convert_nc_qu8_f32, "Convert (NC, QU8, F32)") XNN_ENUM_ITEM(xnn_operator_type_convolution_nchw_f16, "Convolution (NCHW, F16)") XNN_ENUM_ITEM(xnn_operator_type_convolution_nchw_f32, "Convolution (NCHW, F32)") XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_f16, "Convolution (NHWC, F16)") @@ -72,12 +57,9 @@ XNN_ENUM_ITEM(xnn_operator_type_depth_to_space_nhwc_x32, "Depth To Space (NHWC, XNN_ENUM_ITEM(xnn_operator_type_divide, "Divide (ND)") XNN_ENUM_ITEM(xnn_operator_type_dynamic_fully_connected_nc_f16, "Dynamic Fully Connected (NC, F16)") XNN_ENUM_ITEM(xnn_operator_type_dynamic_fully_connected_nc_f32, "Dynamic Fully Connected (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_elu_nc_f16, "ELU (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_elu_nc_f32, "ELU (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_elu_nc_qs8, "ELU (NC, QS8)") -XNN_ENUM_ITEM(xnn_operator_type_exp_nc_f32, "Exp (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_floor_nc_f16, "Floor (NC, F16)") 
-XNN_ENUM_ITEM(xnn_operator_type_floor_nc_f32, "Floor (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_elu, "ELU (NC)") +XNN_ENUM_ITEM(xnn_operator_type_exp, "Exp (NC)") +XNN_ENUM_ITEM(xnn_operator_type_floor, "Floor (NC)") XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_f16, "Fully Connected (NC, F16)") XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_f32, "Fully Connected (NC, F32)") XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_f32_qc4w, "Fully Connected (NC, F32, QC4W)") @@ -95,14 +77,10 @@ XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qp8_f32_qb4w, "Fully Connecte XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qs8, "Fully Connected (NC, QS8)") XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qs8_qc8w, "Fully Connected (NC, QS8, QC8W)") XNN_ENUM_ITEM(xnn_operator_type_fully_connected_nc_qu8, "Fully Connected (NC, QU8)") -XNN_ENUM_ITEM(xnn_operator_type_gelu_nc_f32, "GELU (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_hardswish_nc_f16, "HardSwish (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_hardswish_nc_f32, "HardSwish (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_leaky_relu_nc_f16, "Leaky ReLU (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_leaky_relu_nc_f32, "Leaky ReLU (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_leaky_relu_nc_qs8, "Leaky ReLU (NC, QS8)") -XNN_ENUM_ITEM(xnn_operator_type_leaky_relu_nc_qu8, "Leaky ReLU (NC, QU8)") -XNN_ENUM_ITEM(xnn_operator_type_log_nc_f32, "Log (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_gelu, "GELU (NC)") +XNN_ENUM_ITEM(xnn_operator_type_hardswish, "HardSwish (NC)") +XNN_ENUM_ITEM(xnn_operator_type_leaky_relu, "Leaky ReLU (NC)") +XNN_ENUM_ITEM(xnn_operator_type_log, "Log (NC)") XNN_ENUM_ITEM(xnn_operator_type_max_pooling_nhwc_f16, "Max Pooling (NHWC, F16)") XNN_ENUM_ITEM(xnn_operator_type_max_pooling_nhwc_f32, "Max Pooling (NHWC, F32)") XNN_ENUM_ITEM(xnn_operator_type_max_pooling_nhwc_s8, "Max Pooling (NHWC, S8)") @@ -111,13 +89,11 @@ XNN_ENUM_ITEM(xnn_operator_type_maximum, "Maximum (ND)") XNN_ENUM_ITEM(xnn_operator_type_mean_nd, "Mean (ND)") XNN_ENUM_ITEM(xnn_operator_type_minimum, "Minimum (ND)") XNN_ENUM_ITEM(xnn_operator_type_multiply, "Multiply (ND)") -XNN_ENUM_ITEM(xnn_operator_type_negate_nc_f16, "Negate (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_negate_nc_f32, "Negate (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_negate, "Negate (NC)") XNN_ENUM_ITEM(xnn_operator_type_pack_lh_x32, "Pack LH (X32)") XNN_ENUM_ITEM(xnn_operator_type_prelu_nc_f16, "PReLU (NC, F16)") XNN_ENUM_ITEM(xnn_operator_type_prelu_nc_f32, "PReLU (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_reciprocal_square_root_nc_f16, "Reciprocal Square Root (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_reciprocal_square_root_nc_f32, "Reciprocal Square Root (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_reciprocal_square_root, "Reciprocal Square Root (NC)") XNN_ENUM_ITEM(xnn_operator_type_resize_bilinear_nchw_f16, "Resize Bilinear (NCHW, F16)") XNN_ENUM_ITEM(xnn_operator_type_resize_bilinear_nchw_f32, "Resize Bilinear (NCHW, F32)") XNN_ENUM_ITEM(xnn_operator_type_resize_bilinear_nhwc_f16, "Resize Bilinear (NHWC, F16)") @@ -128,10 +104,7 @@ XNN_ENUM_ITEM(xnn_operator_type_rope_nthc_f16, "RoPE (NTHC, F16)") XNN_ENUM_ITEM(xnn_operator_type_rope_nthc_f32, "RoPE (NTHC, F32)") XNN_ENUM_ITEM(xnn_operator_type_scaled_dot_product_attention_nhtc_f16, "Scaled Dot-Product Attention (NHTC, F16)") XNN_ENUM_ITEM(xnn_operator_type_scaled_dot_product_attention_nhtc_f32, "Scaled Dot-Product Attention (NHTC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_sigmoid_nc_f16, "Sigmoid (NC, F16)") 
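With the per-datatype enumerators gone, a single xnn_operator_type_tanh (for example) identifies tanh over every supported datatype; the datatype now travels with the operator object rather than the enum. The operator lifecycle is correspondingly uniform. A sketch grounded in the create/reshape/setup calls from unary.c above, where the NULL quantization arguments mirror the convert path there:

    xnn_operator_t op = NULL;
    enum xnn_status status = xnn_create_unary_elementwise_nc(
        xnn_unary_convert, xnn_datatype_fp32, xnn_datatype_fp16,
        /*params=*/NULL, /*input_quantization=*/NULL, /*output_quantization=*/NULL,
        /*flags=*/0, &op);
    if (status == xnn_status_success) {
      status = xnn_reshape_unary_elementwise_nc(
          op, batch_size, channels,
          /*input_stride=*/channels, /*output_stride=*/channels, threadpool);
    }
    if (status == xnn_status_success) {
      status = xnn_setup_unary_elementwise_nc(op, input, output);
    }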
-XNN_ENUM_ITEM(xnn_operator_type_sigmoid_nc_f32, "Sigmoid (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_sigmoid_nc_qs8, "Sigmoid (NC, QS8)") -XNN_ENUM_ITEM(xnn_operator_type_sigmoid_nc_qu8, "Sigmoid (NC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_sigmoid, "Sigmoid (NC)") XNN_ENUM_ITEM(xnn_operator_type_slice_nd_x8, "Slice (ND, X8)") XNN_ENUM_ITEM(xnn_operator_type_slice_nd_x16, "Slice (ND, X16)") XNN_ENUM_ITEM(xnn_operator_type_slice_nd_x32, "Slice (ND, X32)") @@ -141,23 +114,17 @@ XNN_ENUM_ITEM(xnn_operator_type_softmax_nc_qu8, "Softmax (NC, QU8)") XNN_ENUM_ITEM(xnn_operator_type_space_to_depth_nhwc_x8, "Space To Depth (NHWC, X8)") XNN_ENUM_ITEM(xnn_operator_type_space_to_depth_nhwc_x16, "Space To Depth (NHWC, X16)") XNN_ENUM_ITEM(xnn_operator_type_space_to_depth_nhwc_x32, "Space To Depth (NHWC, X32)") -XNN_ENUM_ITEM(xnn_operator_type_square_nc_f16, "Square (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_square_nc_f32, "Square (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_square_root_nc_f16, "Square Root (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_square_root_nc_f32, "Square Root (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_square, "Square (NC)") +XNN_ENUM_ITEM(xnn_operator_type_square_root, "Square Root (NC)") XNN_ENUM_ITEM(xnn_operator_type_squared_difference, "Squared Difference (NC)") XNN_ENUM_ITEM(xnn_operator_type_subtract, "Subtract (ND)") XNN_ENUM_ITEM(xnn_operator_type_sum_nd, "Sum (ND)") -XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_f16, "Tanh (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_f32, "Tanh (NC, F32)") -XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_qs8, "Tanh (NC, QS8)") -XNN_ENUM_ITEM(xnn_operator_type_tanh_nc_qu8, "Tanh (NC, QU8)") +XNN_ENUM_ITEM(xnn_operator_type_tanh, "Tanh (NC)") XNN_ENUM_ITEM(xnn_operator_type_transpose_nd_x8, "Transpose (ND, X8)") XNN_ENUM_ITEM(xnn_operator_type_transpose_nd_x16, "Transpose (ND, X16)") XNN_ENUM_ITEM(xnn_operator_type_transpose_nd_x32, "Transpose (ND, X32)") XNN_ENUM_ITEM(xnn_operator_type_transpose_nd_x64, "Transpose (ND, X64)") -XNN_ENUM_ITEM(xnn_operator_type_truncation_nc_f16, "Truncation (NC, F16)") -XNN_ENUM_ITEM(xnn_operator_type_truncation_nc_f32, "Truncation (NC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_truncation, "Truncation (NC)") XNN_ENUM_ITEM(xnn_operator_type_unpooling_nhwc_x32, "Unpooling (NHWC, X32)") diff --git a/src/xnnpack/operator-utils.h b/src/xnnpack/operator-utils.h index fc4f75ad5527..aba5a74e49d5 100644 --- a/src/xnnpack/operator-utils.h +++ b/src/xnnpack/operator-utils.h @@ -65,6 +65,8 @@ XNN_INTERNAL uint32_t xnn_get_heuristic_mr_igemm( uint32_t nr, struct xnn_hmp_igemm_ukernel *igemm_cases); +XNN_INTERNAL enum xnn_status xnn_destroy_operator(xnn_operator_t op); + #ifdef __cplusplus } #endif diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index ecfcc8ca07e7..544cefa33939 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -23,6 +23,10 @@ // Maximum number of pthreadpool parallelization invocations per operator. 
#define XNN_MAX_COMPUTE_INVOCATIONS 3 +#ifdef __cplusplus +extern "C" { +#endif + struct xnn_ukernel_conv2d { union { xnn_conv_hwc2chw_ukernel_fn hwc2chw_fn; @@ -209,24 +213,14 @@ struct xnn_operator { uint32_t flags; uint32_t log2_elementwise_element_size; + uint32_t log2_elementwise_input_size; + uint32_t log2_elementwise_output_size; union { union xnn_binary_uparams binary; + union xnn_unary_uparams unary; struct xnn_f16_default_params f16_default; - struct xnn_f16_hswish_params f16_hswish; - struct xnn_f16_elu_params f16_elu; - struct xnn_f16_lrelu_params f16_lrelu; - struct xnn_f16_sigmoid_params f16_sigmoid; - union xnn_f16_tanh_params f16_tanh; struct xnn_f32_default_params f32_default; - struct xnn_f32_elu_params f32_elu; - struct xnn_f32_lrelu_params f32_lrelu; - struct xnn_f32_rnd_params f32_rnd; - struct xnn_f32_rsqrt_params f32_rsqrt; - struct xnn_f32_sigmoid_params f32_sigmoid; - struct xnn_f32_sqrt_params f32_sqrt; - union xnn_f32_tanh_params f32_tanh; - struct xnn_f32_hswish_params f32_hswish; union xnn_f16_minmax_params f16_minmax; struct xnn_f16_scaleminmax_params f16_scaleminmax; struct f16_f32acc_reduce_params reduce_params; @@ -241,15 +235,6 @@ struct xnn_operator { union xnn_f32_minmax_params f32_chw; struct xnn_f32_qb4w_minmax_params f32_qb4w_minmax; struct xnn_f32_qc4w_minmax_params f32_qc4w_minmax; - struct xnn_f32_qs8_cvt_params f32_qs8_cvt; - struct xnn_f32_qu8_cvt_params f32_qu8_cvt; - struct xnn_s32_f32_cvt_params s32_f32_cvt; - struct xnn_qs8_cvt_params qs8_cvt; - struct xnn_qs8_f16_cvt_params qs8_f16_cvt; - struct xnn_qs8_f32_cvt_params qs8_f32_cvt; - struct xnn_qs16_qs8_cvt_params qs16_qs8_cvt; - struct xnn_qu8_cvt_params qu8_cvt; - struct xnn_qu8_f32_cvt_params qu8_f32_cvt; union xnn_qs8_conv_minmax_params qs8_conv_minmax; union xnn_qs8_qc8w_conv_minmax_params qs8_qc8w_conv_minmax; struct { @@ -261,10 +246,6 @@ struct xnn_operator { struct { union xnn_qu8_avgpool_minmax_params qu8_avgpool; }; - union xnn_qs8_hswish_params qs8_hswish; - union xnn_qu8_hswish_params qu8_hswish; - struct xnn_qs8_lrelu_params qs8_lrelu; - struct xnn_qu8_lrelu_params qu8_lrelu; struct xnn_s8_minmax_params s8_minmax; struct xnn_s32_default_params s32_default; struct xnn_u8_minmax_params u8_minmax; @@ -289,8 +270,7 @@ struct xnn_operator { } params3; // Fourth set of params. Used by scaled dot attention operator. 
union { - union xnn_f16_tanh_params f16_tanh; - union xnn_f32_tanh_params f32_tanh; + union xnn_unary_uparams unary; } params4; enum xnn_operator_type type; struct xnn_ukernel ukernel; @@ -413,7 +393,11 @@ XNN_INTERNAL enum xnn_status xnn_run_operator_with_index( size_t operator_object_index, pthreadpool_t threadpool); -XNN_INTERNAL enum xnn_operator_type xnn_binary_operator_to_operator_type( - enum xnn_binary_operator op); - +XNN_INTERNAL enum xnn_operator_type xnn_unary_operator_to_operator_type(enum xnn_unary_operator op); +XNN_INTERNAL const char* xnn_unary_operator_to_string(enum xnn_unary_operator op); +XNN_INTERNAL enum xnn_operator_type xnn_binary_operator_to_operator_type(enum xnn_binary_operator op); XNN_INTERNAL enum xnn_operator_type xnn_reduce_operator_to_operator_type(enum xnn_reduce_operator op); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/src/xnnpack/subgraph.h b/src/xnnpack/subgraph.h index d8c6eb8a1883..6e7e099e8694 100644 --- a/src/xnnpack/subgraph.h +++ b/src/xnnpack/subgraph.h @@ -212,6 +212,7 @@ enum xnn_compute_type { xnn_compute_type_qp8_to_fp32, xnn_compute_type_qs8, xnn_compute_type_qu8, + xnn_compute_type_fp16_to_qs8, xnn_compute_type_fp16_to_qd8, xnn_compute_type_fp16_to_fp32, xnn_compute_type_fp32_to_fp16, @@ -298,12 +299,6 @@ struct xnn_node { uint32_t dilation_height; uint32_t dilation_width; } pooling_2d; - struct { - float alpha; - } elu; - struct { - float negative_slope; - } leaky_relu; struct { size_t pre_paddings[XNN_MAX_TENSOR_DIMS]; size_t post_paddings[XNN_MAX_TENSOR_DIMS]; @@ -339,6 +334,7 @@ struct xnn_node { enum xnn_attention_logits_cap_type cap_type; struct xnn_attention_logits_cap_tanh_params cap_tanh_params; } scaled_dot_product_attention; + union xnn_unary_params unary; } params; struct { float output_min; @@ -587,6 +583,8 @@ enum xnn_status resize_fully_connected_output_tensor( size_t old_workspace_size, pthreadpool_t threadpool); +XNN_INTERNAL enum xnn_node_type xnn_unary_operator_to_node_type(enum xnn_unary_operator type); +XNN_INTERNAL enum xnn_unary_operator xnn_node_type_to_unary_operator(enum xnn_node_type type); XNN_INTERNAL enum xnn_node_type xnn_binary_operator_to_node_type(enum xnn_binary_operator type); XNN_INTERNAL enum xnn_binary_operator xnn_node_type_to_binary_operator(enum xnn_node_type type); diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 3e0ae92279f7..a138feeed6f1 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -92,26 +92,18 @@ xnnpack_cxx_library( ) xnnpack_cxx_library( - name = "tanh_operator_tester", + name = "unary_ops", testonly = True, - hdrs = ["tanh-operator-tester.h"], - deps = OPERATOR_TEST_DEPS + xnnpack_test_deps_for_library(), -) - -xnnpack_cxx_library( - name = "unary_operator_tester", - testonly = True, - srcs = ["unary-operator-tester.cc"], - hdrs = ["unary-operator-tester.h"], - deps = OPERATOR_TEST_DEPS + xnnpack_test_deps_for_library(), + srcs = ["unary-ops.cc"], + hdrs = ["unary-ops.h"], + deps = MICROKERNEL_TEST_DEPS + xnnpack_test_deps_for_library(), ) xnnpack_cxx_library( name = "vunary_microkernel_tester", testonly = True, - srcs = ["vunary-microkernel-tester.cc"], hdrs = ["vunary-microkernel-tester.h"], - deps = MICROKERNEL_TEST_DEPS + xnnpack_test_deps_for_library(), + deps = MICROKERNEL_TEST_DEPS + xnnpack_test_deps_for_library() + [":unary_ops"], ) xnnpack_cxx_library( @@ -132,14 +124,6 @@ xnnpack_cxx_library( ], ) -xnnpack_cxx_library( - name = "vcvt_microkernel_tester", - testonly = True, - srcs = ["vcvt-microkernel-tester.cc"], - hdrs = 
["vcvt-microkernel-tester.h"], - deps = MICROKERNEL_TEST_DEPS + xnnpack_test_deps_for_library(), -) - xnnpack_cxx_library( name = "rdsum_microkernel_tester", testonly = True, @@ -310,6 +294,23 @@ sh_test( "f32_vtanh", "s8_vclamp", "u8_vclamp", + "qs8_vhswish", + "qs8_vlrelu", + "qu8_vhswish", + "qu8_vlrelu", + "f16_f32_vcvt", + "f16_qs8_vcvt", + "f32_f16_vcvt", + "f32_qs8_vcvt", + "f32_qu8_vcvt", + "s32_f32_vcvt", + "u32_f32_vcvt", + "qs16_qs8_vcvt", + "qs8_f16_vcvt", + "qs8_f32_vcvt", + "qs8_vcvt", + "qu8_vcvt", + "qu8_f32_vcvt", ]] [xnnpack_unit_test( @@ -375,28 +376,6 @@ sh_test( "s32_vmulc", ]] -[xnnpack_unit_test( - name = "%s_test" % kernel, - srcs = [ - "%s.cc" % kernel.replace("_", "-"), - ], - deps = MICROKERNEL_TEST_DEPS + [":vcvt_microkernel_tester"], -) for kernel in [ - "f16_f32_vcvt", - "f16_qs8_vcvt", - "f32_f16_vcvt", - "f32_qs8_vcvt", - "f32_qu8_vcvt", - "s32_f32_vcvt", - "u32_f32_vcvt", - "qs16_qs8_vcvt", - "qs8_f16_vcvt", - "qs8_f32_vcvt", - "qs8_vcvt", - "qu8_vcvt", - "qu8_f32_vcvt", -]] - [xnnpack_unit_test( name = "%s_test" % kernel, srcs = [ @@ -1019,24 +998,6 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS + [":rdsum_microkernel_tester"], ) -xnnpack_unit_test( - name = "qs8_vhswish_test", - srcs = [ - "qs8-vhswish.cc", - "vhswish-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "qs8_vlrelu_test", - srcs = [ - "qs8-vlrelu.cc", - "vlrelu-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "qu8_gemm_minmax_fp32_test", srcs = [ @@ -1081,24 +1042,6 @@ xnnpack_unit_test( ], ) -xnnpack_unit_test( - name = "qu8_vhswish_test", - srcs = [ - "qu8-vhswish.cc", - "vhswish-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "qu8_vlrelu_test", - srcs = [ - "qu8-vlrelu.cc", - "vlrelu-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "u8_lut32norm_test", srcs = [ @@ -1223,34 +1166,17 @@ xnnpack_binary( ########################### Unit tests for operators ########################## -[xnnpack_unit_test( - name = "%s_test" % operator, - srcs = [ - "%s.cc" % operator.replace("_", "-"), - ], +xnnpack_unit_test( + name = "unary_elementwise_nc_test", + timeout = "long", + srcs = ["unary-elementwise-nc.cc"], deps = OPERATOR_TEST_DEPS + [ - ":unary_operator_tester", + ":unary_ops", + "//:logging", + "//:operator_h", + "//:operator_utils", ], -) for operator in [ - "abs_nc", - "bankers_rounding_nc", - "ceiling_nc", - "clamp_nc", - "elu_nc", - "exp_nc", - "floor_nc", - "gelu_nc", - "hardswish_nc", - "leaky_relu_nc", - "log_nc", - "negate_nc", - "reciprocal_square_root_nc", - "sigmoid_nc", - "square_nc", - "square_root_nc", - "tanh_nc", - "truncation_nc", -]] +) xnnpack_unit_test( name = "binary_elementwise_nd_test", @@ -1320,21 +1246,10 @@ xnnpack_unit_test( xnnpack_unit_test( name = "convert_nc_test", - timeout = "moderate", srcs = [ "convert-nc.cc", "convert-operator-tester.h", ], - shard_count = 5, - deps = OPERATOR_TEST_DEPS + ["//:microkernels_h"], -) - -xnnpack_unit_test( - name = "convert_nc_eager_test", - srcs = [ - "convert-nc-eager.cc", - "convert-operator-tester.h", - ], deps = OPERATOR_TEST_DEPS + ["//:microkernels_h"], ) @@ -1623,6 +1538,23 @@ xnnpack_cxx_library( ], ) +xnnpack_unit_test( + name = "unary_test", + srcs = [ + "unary.cc", + ], + deps = [ + ":replicable_random_device", + ":unary_ops", + "//:XNNPACK", + "//:buffer", + "//:logging", + "//:math", + "//:operators", + "//:subgraph", + ], +) + 
[xnnpack_unit_test( name = "%s_test" % operator, srcs = [ @@ -1631,7 +1563,6 @@ xnnpack_cxx_library( deps = [ ":replicable_random_device", ":subgraph_unary_tester", - ":tanh_operator_tester", "//:XNNPACK", "//:math", "//:node_type", @@ -1640,32 +1571,14 @@ xnnpack_cxx_library( "//:subgraph", ], ) for operator in [ - "abs", - "bankers_rounding", - "ceiling", - "clamp", - "convert", "copy", - "elu", - "exp", - "floor", - "gelu", - "hardswish", - "leaky_relu", - "log", - "negate", - "reciprocal_square_root", - "sigmoid", "softmax", "space_to_depth_2d", - "square", - "square_root", "static_constant_pad", "static_expand_dims", "static_reshape", "static_slice", "static_transpose", - "tanh", ]] xnnpack_cxx_library( @@ -1700,18 +1613,6 @@ xnnpack_unit_test( ], ) -xnnpack_unit_test( - name = "abs_reshape_test", - srcs = [ - "abs-reshape.cc", - ], - deps = [ - "//:XNNPACK", - "//:node_type", - "//:subgraph", - ], -) - xnnpack_unit_test( name = "binary_test", srcs = ["binary.cc"], diff --git a/test/abs-nc.cc b/test/abs-nc.cc deleted file mode 100644 index c7cfad01e4b6..000000000000 --- a/test/abs-nc.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class AbsOperatorTester : public UnaryOperatorTester { - public: - AbsOperatorTester() : UnaryOperatorTester() { - range_f32_ = {-1.0f, 1.0f}; - range_f16_ = {-1.0f, 1.0f}; - } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { return std::abs(x); } - - CREATE_OP_OVERRIDES_F32(abs); - CREATE_OP_OVERRIDES_F16(abs); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, AbsOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, AbsOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, AbsOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -}; // namespace xnnpack diff --git a/test/abs-reshape.cc b/test/abs-reshape.cc deleted file mode 100644 index 06ceeb5bad21..000000000000 --- a/test/abs-reshape.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
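The AbsOperatorTester deleted above is typical of the per-operator testers this patch removes: each one contributed little more than a reference function and an input range. The shared unary_ops library presumably captures the same information in a test-info type; only the type name Abs is confirmed (by the bf16-vabs.cc hunk below), so the sketch here is hypothetical:

  #include <cmath>
  #include <utility>

  // Hypothetical shape of a test-info type in unary-ops.h; the member
  // names are assumptions modeled on the deleted tester.
  struct Abs {
    // Reference result for input x (was AbsOperatorTester::RefFunc).
    float ReferenceImpl(float x) const { return std::abs(x); }
    // Input domain for generated test data (was range_f32_ = {-1.0f, 1.0f}).
    std::pair<float, float> Domain() const { return {-1.0f, 1.0f}; }
  };
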
- -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/subgraph.h" - -TEST(AbsTestF32, Reshape) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - std::vector dims{2, 3, 4}; - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_abs(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_abs); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values, subgraph->num_values, /*threadpool=*/nullptr), xnn_status_success); - - dims[0] = 7; - ASSERT_EQ(xnn_status_success, xnn_reshape_external_value(runtime, 0, dims.size(), dims.data())); - - ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values, runtime->num_values, /*threadpool=*/nullptr), xnn_status_reallocation_required); - const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape; - const size_t num_input_elements = std::accumulate(dims.cbegin(), dims.cend(), size_t{1}, std::multiplies()); - ASSERT_EQ(output_shape->dim[0], dims[0]); - ASSERT_EQ(runtime->values[node->outputs[0]].size, num_input_elements * sizeof(float)); -} diff --git a/test/bankers-rounding-nc.cc b/test/bankers-rounding-nc.cc deleted file mode 100644 index 98e2c800eb3b..000000000000 --- a/test/bankers-rounding-nc.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class BankersRoundingOperatorTester : public UnaryOperatorTester { - public: - BankersRoundingOperatorTester() : UnaryOperatorTester() { - range_f32_ = {-5.0f, 5.0f}; - range_f16_ = {-5.0f, 5.0f}; - } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. 
- float RefFunc(float x) const override { return std::nearbyint(x); } - - CREATE_OP_OVERRIDES_F32(bankers_rounding); - CREATE_OP_OVERRIDES_F16(bankers_rounding); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, BankersRoundingOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, BankersRoundingOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, BankersRoundingOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -}; // namespace xnnpack diff --git a/test/bankers-rounding.cc b/test/bankers-rounding.cc deleted file mode 100644 index 77f508a8f449..000000000000 --- a/test/bankers-rounding.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using BankersRoundingTestF16 = UnaryTest; -using BankersRoundingTestF32 = UnaryTest; - -TEST_F(BankersRoundingTestF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_bankers_rounding(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_bankers_rounding); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(BankersRoundingTestF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_bankers_rounding(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = 
&subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_bankers_rounding); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(BankersRoundingTestF16, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-5.0f, 5.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_bankers_rounding_nc_f16(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_bankers_rounding_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_bankers_rounding_nc_f16(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_bankers_rounding(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(BankersRoundingTestF32, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-5.0f, 5.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. 
- xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_bankers_rounding_nc_f32(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_bankers_rounding_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_bankers_rounding_nc_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_bankers_rounding(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/batch-matrix-multiply.cc b/test/batch-matrix-multiply.cc index c3f1dec9ad3f..e741dab86dd9 100644 --- a/test/batch-matrix-multiply.cc +++ b/test/batch-matrix-multiply.cc @@ -692,7 +692,7 @@ TEST_F(BatchMatrixMultiplyTestQD8ToF32, matches_operator_api) { ASSERT_NE(output_id, XNN_INVALID_VALUE_ID); // Define the ops. 
- ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input1_f32_id, + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input1_f32_id, input1_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_define_batch_matrix_multiply(subgraph, input1_id, input2_id, diff --git a/test/bf16-vabs.cc b/test/bf16-vabs.cc index 9d925bb13320..2e03d490239d 100644 --- a/test/bf16-vabs.cc +++ b/test/bf16-vabs.cc @@ -24,13 +24,13 @@ #include "next_prime.h" #include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); +using TestInfo = Abs; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } #include "bf16-vabs/bf16-vabs.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/ceiling-nc.cc b/test/ceiling-nc.cc deleted file mode 100644 index 89bf8ff10c1a..000000000000 --- a/test/ceiling-nc.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class CeilingOperatorTester : public UnaryOperatorTester { - public: - CeilingOperatorTester() : UnaryOperatorTester() { - range_f32_ = {-1.0f, 1.0f}; - range_f16_ = {-5.0f, -0.0f}; - } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { return std::ceil(x); } - - CREATE_OP_OVERRIDES_F32(ceiling); - CREATE_OP_OVERRIDES_F16(ceiling); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, CeilingOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, CeilingOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, CeilingOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -}; // namespace xnnpack diff --git a/test/ceiling.cc b/test/ceiling.cc deleted file mode 100644 index 13682a681b2c..000000000000 --- a/test/ceiling.cc +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
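The bf16-vabs.cc hunk above illustrates the new microkernel test pattern: a file-scope `using TestInfo = ...;` alias plus one TEST per batch-size case, replacing the old XNN_TEST_UNARY_* macro bundle. Porting another kernel family follows the same shape. In this sketch only the helper names and the macro signature come from that hunk; the Neg test-info type is hypothetical:

  using TestInfo = Neg;  // hypothetical test-info type for negate kernels

  #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile,       \
                                  vector_tile, datatype, params_type,    \
                                  init_params)                           \
    TEST(ukernel, batch_eq) {                                            \
      TestBatchEq(arch_flags, batch_tile, ukernel, init_params);         \
    }                                                                    \
    TEST(ukernel, inplace) {                                             \
      TestInPlace(arch_flags, batch_tile, ukernel, init_params);         \
    }
  #include "f16-vneg/f16-vneg.h"
  #undef XNN_UKERNEL_WITH_PARAMS
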
- -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using CeilingTestF16 = UnaryTest; -using CeilingTestF32 = UnaryTest; - -TEST_F(CeilingTestF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_ceiling(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_ceiling); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(CeilingTestF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_ceiling(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_ceiling); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(CeilingTestF16, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-5.0f, -0.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. 
- xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_ceiling_nc_f16(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_ceiling_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_ceiling_nc_f16(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_ceiling(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(CeilingTestF32, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-5.0f, -0.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_ceiling_nc_f32(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_ceiling_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_ceiling_nc_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_ceiling(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/clamp-nc.cc b/test/clamp-nc.cc deleted file mode 100644 index 8d80289f7269..000000000000 --- a/test/clamp-nc.cc +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "unary-operator-tester.h" -#include "pthreadpool.h" - -namespace xnnpack { - -#define xnn_reshape_clamp_nc_qs8 xnn_reshape_clamp_nc_s8 -#define xnn_reshape_clamp_nc_qu8 xnn_reshape_clamp_nc_u8 -#define xnn_setup_clamp_nc_qs8 xnn_setup_clamp_nc_s8 -#define xnn_setup_clamp_nc_qu8 xnn_setup_clamp_nc_u8 - -class ClampOperatorTester : public UnaryOperatorTester { - public: - ClampOperatorTester() : UnaryOperatorTester() { - range_f32_ = {-10.0f, 10.0f}; - range_f16_ = {-10.0f, 10.0f}; - input_scale(1.0f); - input_zero_point(128); - output_scale(1.0f); - output_zero_point(128); - } - - ClampOperatorTester& relu_activation(bool relu_activation) { - relu_activation_ = relu_activation; - return *this; - } - ClampOperatorTester& clamp_low(bool clamp_low) { - clamp_low_ = clamp_low; - return *this; - } - ClampOperatorTester& clamp_high(bool clamp_high) { - clamp_high_ = clamp_high; - return *this; - } - - bool relu_activation() const { return relu_activation_; } - float clamp_low() const { return clamp_low_; } - float clamp_high() const { return clamp_high_; } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { - return relu_activation() ? std::max(x, 0.f) - : std::min(std::max(x, clamp_low()), clamp_high()); - } - - // Computes the absolute tolerance for a reference value `y_ref`. Tests will - // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. 
Note that for `fp16` - // tests, both `y` and `y_ref` will be converted to `float` for the tolerance - // evaluation. - float AbsTolF32(float) const override { return 5e-6f; } - float AbsTolF16(float y_ref) const override { - return std::max(1.0e-4f, std::abs(y_ref) * 5.0e-3f); - } - float AbsTolQS8(float) const override { return 0.6f; }; - float AbsTolQU8(float) const override { return 0.6f; }; - - xnn_status CreateOpF32(uint32_t flags, - xnn_operator_t* op_out) const override { - const float output_min = relu_activation() ? 0.0f : clamp_low(); - const float output_max = relu_activation() - ? std::numeric_limits::infinity() - : clamp_high(); - return xnn_create_clamp_nc_f32(output_min, output_max, 0, op_out); - } - - xnn_status RunOpF32(size_t channels, size_t input_stride, - size_t output_stride, size_t batch_size, - const float* input, float* output, uint32_t flags, - pthreadpool_t threadpool) const override { - const float output_min = relu_activation() ? 0.0f : clamp_low(); - const float output_max = relu_activation() - ? std::numeric_limits::infinity() - : clamp_high(); - - return xnn_run_clamp_nc_f32(channels, input_stride, output_stride, - batch_size, input, output, output_min, - output_max, flags, threadpool); - } - xnn_status CreateOpF16(uint32_t flags, - xnn_operator_t* op_out) const override { - const float output_min = relu_activation() ? 0.0f : clamp_low(); - const float output_max = relu_activation() - ? std::numeric_limits::infinity() - : clamp_high(); - return xnn_create_clamp_nc_f16(output_min, output_max, 0, op_out); - } - xnn_status CreateOpQS8(int8_t input_zero_point, float input_scale, - int8_t output_zero_point, float output_scale, - int8_t output_min, int8_t output_max, uint32_t flags, - xnn_operator_t* op_out) const override { - int8_t q_low = - static_cast(clamp_low() / output_scale + output_zero_point); - int8_t q_high = - static_cast(clamp_high() / output_scale + output_zero_point); - return xnn_create_clamp_nc_s8(q_low, q_high, 0, op_out); - } - xnn_status CreateOpQU8(uint8_t input_zero_point, float input_scale, - uint8_t output_zero_point, float output_scale, - uint8_t output_min, uint8_t output_max, uint32_t flags, - xnn_operator_t* op_out) const override { - uint8_t q_low = - static_cast(clamp_low() / output_scale + output_zero_point); - uint8_t q_high = - static_cast(clamp_high() / output_scale + output_zero_point); - return xnn_create_clamp_nc_u8(q_low, q_high, 0, op_out); - } - - CREATE_OP_RESHAPE_OVERRIDE_F32(clamp); - CREATE_OP_SETUP_OVERRIDE_F32(clamp); - - CREATE_OP_RESHAPE_OVERRIDE_F16(clamp); - CREATE_OP_SETUP_OVERRIDE_F16(clamp); - - CREATE_OP_RESHAPE_OVERRIDE_QS8(clamp); - CREATE_OP_SETUP_OVERRIDE_QS8(clamp); - - CREATE_OP_RESHAPE_OVERRIDE_QU8(clamp); - CREATE_OP_SETUP_OVERRIDE_QU8(clamp); - - private: - bool relu_activation_ = false; - float clamp_low_ = -5.0f; - float clamp_high_ = 5.0f; -}; - -CREATE_UNARY_FLOAT_TESTS(F32, ClampOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, ClampOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, ClampOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -CREATE_UNARY_TEST(QS8, ClampOperatorTester) -INSTANTIATE_TEST_SUITE_P( - datatype, ClampOperatorTesterQS8, - testing::ValuesIn({ - UnaryOpTestParams::UnitBatch(), - UnaryOpTestParams::SmallBatch(), - UnaryOpTestParams::SmallBatch().InputStride(129), - UnaryOpTestParams::SmallBatch().OutputStride(117), - }), - [](const testing::TestParamInfo& info) { - return info.param.ToString(); - }); - -CREATE_UNARY_TEST(QU8, 
ClampOperatorTester) -INSTANTIATE_TEST_SUITE_P( - datatype, ClampOperatorTesterQU8, - testing::ValuesIn({ - UnaryOpTestParams::UnitBatch(), - UnaryOpTestParams::SmallBatch(), - UnaryOpTestParams::SmallBatch().InputStride(129), - UnaryOpTestParams::SmallBatch().OutputStride(117), - }), - [](const testing::TestParamInfo& info) { - return info.param.ToString(); - }); - -#ifndef XNN_EXCLUDE_F16_TESTS -TEST(CLAMP_NC_F16, unit_batch_with_clamp_min) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (int32_t clamp_low = std::numeric_limits::min() + 16; - clamp_low < std::numeric_limits::max() - 16; - clamp_low += 257) { - ClampOperatorTester() - .clamp_low(clamp_low) - .batch_size(1) - .channels(channels) - .iterations(3) - .TestF16(); - } - } -} -TEST(CLAMP_NC_F16, unit_batch_with_clamp_max) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (int32_t clamp_high = std::numeric_limits::min() + 16; - clamp_high < std::numeric_limits::max() - 16; - clamp_high += 257) { - ClampOperatorTester() - .clamp_high(clamp_high) - .batch_size(1) - .channels(channels) - .iterations(3) - .TestF16(); - } - } -} -#endif // XNN_EXCLUDE_F16_TESTS - -TEST(CLAMP_NC_F32, unit_batch_with_clamp_low) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (int32_t clamp_low = std::numeric_limits::min() + 1; - clamp_low < std::numeric_limits::max(); clamp_low += 257) { - ClampOperatorTester() - .clamp_low(clamp_low) - .batch_size(1) - .channels(channels) - .iterations(3) - .TestF32(); - } - } -} -TEST(CLAMP_NC_F32, unit_batch_with_clamp_high) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (int32_t clamp_high = std::numeric_limits::min() + 1; - clamp_high < std::numeric_limits::max(); clamp_high += 257) { - ClampOperatorTester() - .clamp_high(clamp_high) - .batch_size(1) - .channels(channels) - .iterations(3) - .TestF32(); - } - } -} -TEST(CLAMP_NC_F32, unit_batch_with_relu) { - for (size_t channels = 1; channels < 100; channels++) { - ClampOperatorTester() - .relu_activation(true) - .batch_size(1) - .channels(channels) - .iterations(3) - .TestF32(); - } -} - -TEST(CLAMP_NC_S8, unit_batch_with_clamp_low) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (int32_t clamp_low = std::numeric_limits::min() + 1; - clamp_low < std::numeric_limits::max(); clamp_low++) { - ClampOperatorTester() - .clamp_low(clamp_low) - .batch_size(1) - .channels(channels) - .iterations(3) - .TestQS8(); - } - } -} -TEST(CLAMP_NC_S8, unit_batch_with_clamp_high) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (int32_t clamp_high = std::numeric_limits::min() + 1; - clamp_high < std::numeric_limits::max(); clamp_high++) { - ClampOperatorTester() - .clamp_high(clamp_high) - .batch_size(1) - .channels(channels) - .iterations(3) - .TestQS8(); - } - } -} - -TEST(CLAMP_NC_U8, unit_batch_with_clamp_low) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (int32_t clamp_low = std::numeric_limits::min() + 1; - clamp_low < std::numeric_limits::max(); clamp_low++) { - ClampOperatorTester() - .clamp_low(clamp_low) - .batch_size(1) - .channels(channels) - .iterations(3) - .TestQU8(); - } - } -} -TEST(CLAMP_NC_U8, unit_batch_with_clamp_high) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (uint8_t clamp_high = 1; clamp_high < 255; clamp_high++) { - ClampOperatorTester() - .clamp_high(clamp_high) - .batch_size(1) - .channels(channels) - .iterations(3) - .TestQU8(); - } - } -} - -}; // namespace xnnpack diff --git 
a/test/clamp.cc b/test/clamp.cc deleted file mode 100644 index aa03eda65a7f..000000000000 --- a/test/clamp.cc +++ /dev/null @@ -1,424 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using ClampTestQS8 = UnaryTest; -using ClampTestQU8 = UnaryTest; -using ClampTestF16 = UnaryTest; -using ClampTestF32 = UnaryTest; - -TEST_F(ClampTestQS8, define) -{ - const int32_t input_zero_point = i8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = input_zero_point; - const float output_scale = input_scale; - const int8_t quantized_output_min = std::uniform_int_distribution(-128, 0)(rng); - const int8_t quantized_output_max = std::uniform_int_distribution(1, 127)(rng); - const float output_min = (quantized_output_min - input_zero_point) * input_scale; - const float output_max = (quantized_output_max - input_zero_point) * input_scale; - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_clamp(subgraph, output_min, output_max, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_clamp); - ASSERT_EQ(node->compute_type, xnn_compute_type_qs8); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ClampTestQU8, define) -{ - const int32_t input_zero_point = u8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = input_zero_point; - const float output_scale = input_scale; - const uint8_t quantized_output_min = std::uniform_int_distribution(0, 127)(rng); - const uint8_t quantized_output_max = std::uniform_int_distribution(128, 255)(rng); - const float output_min = (quantized_output_min - input_zero_point) * input_scale; - const float output_max = (quantized_output_max - input_zero_point) * input_scale; - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - 
xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_clamp(subgraph, output_min, output_max, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_clamp); - ASSERT_EQ(node->compute_type, xnn_compute_type_qu8); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ClampTestF16, define) -{ - const float output_min = std::uniform_real_distribution(-128.0f, 0.0f)(rng); - const float output_max = std::uniform_real_distribution(1.0f, 127.0f)(rng); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_clamp(subgraph, output_min, output_max, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_clamp); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ClampTestF32, define) -{ - const float output_min = std::uniform_real_distribution(-128.0f, 0.0f)(rng); - const float output_max = std::uniform_real_distribution(1.0f, 127.0f)(rng); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - 
ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_clamp(subgraph, output_min, output_max, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_clamp); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ClampTestQS8, matches_operator_api) -{ - const int32_t input_zero_point = i8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = input_zero_point; - const float output_scale = input_scale; - const int8_t quantized_output_min = std::uniform_int_distribution(-128, 0)(rng); - const int8_t quantized_output_max = std::uniform_int_distribution(1, 127)(rng); - const float output_min = (quantized_output_min - input_zero_point) * input_scale; - const float output_max = (quantized_output_max - input_zero_point) * input_scale; - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_clamp_nc_s8(quantized_output_min, quantized_output_max, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_clamp_nc_s8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_clamp_nc_s8(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_clamp(subgraph, output_min, output_max, input_id, output_id, /*flags=*/0));
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(ClampTestQU8, matches_operator_api)
-{
-  const int32_t input_zero_point = u8dist(rng);
-  const float input_scale = scale_dist(rng);
-  const int32_t output_zero_point = input_zero_point;
-  const float output_scale = input_scale;
-  const uint8_t quantized_output_min = std::uniform_int_distribution<int32_t>(0, 127)(rng);
-  const uint8_t quantized_output_max = std::uniform_int_distribution<int32_t>(128, 255)(rng);
-  const float output_min = (quantized_output_min - input_zero_point) * input_scale;
-  const float output_max = (quantized_output_max - input_zero_point) * input_scale;
-  std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status =
-    xnn_create_clamp_nc_u8(quantized_output_min, quantized_output_max, /*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_clamp_nc_u8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_clamp_nc_u8(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_quint8, input_zero_point, input_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_quint8, output_zero_point, output_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_clamp(subgraph, output_min, output_max, input_id, output_id, /*flags=*/0));
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(ClampTestF16, matches_operator_api)
-{
-  const float output_min = std::uniform_real_distribution<float>(-128.0f, 0.0f)(rng);
-  const float output_max = std::uniform_real_distribution<float>(1.0f, 127.0f)(rng);
-  std::uniform_real_distribution<float> f32dist(-255.0f, 255.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status =
-    xnn_create_clamp_nc_f16(output_min, output_max, /*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_clamp_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_clamp_nc_f16(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_clamp(subgraph, output_min, output_max, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(ClampTestF32, matches_operator_api)
-{
-  const float output_min = std::uniform_real_distribution<float>(-128.0f, 0.0f)(rng);
-  const float output_max = std::uniform_real_distribution<float>(1.0f, 127.0f)(rng);
-  std::uniform_real_distribution<float> f32dist(-255.0f, 255.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status =
-    xnn_create_clamp_nc_f32(output_min, output_max, /*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_clamp_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_clamp_nc_f32(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_clamp(subgraph, output_min, output_max, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
diff --git a/test/convert-nc-eager.cc b/test/convert-nc-eager.cc
deleted file mode 100644
index af668afec9c4..000000000000
--- a/test/convert-nc-eager.cc
+++ /dev/null
@@ -1,535 +0,0 @@
-// Copyright 2021 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include
-#include
-#include
-#include
-
-#include <gtest/gtest.h>
-#include "convert-operator-tester.h"
-
-TEST(CONVERT_NC_F16_F32, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestRunF16toF32();
-  }
-}
-
-TEST(CONVERT_NC_F16_F32, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestRunF16toF32();
-  }
-}
-
-TEST(CONVERT_NC_F16_F32, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestRunF16toF32();
-  }
-}
-
-TEST(CONVERT_NC_F16_F32, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunF16toF32();
-  }
-}
-
-TEST(CONVERT_NC_F16_F32, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunF16toF32();
-  }
-}
-
-TEST(CONVERT_NC_F32_F16, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestRunF32toF16();
-  }
-}
-
-TEST(CONVERT_NC_F32_F16, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestRunF32toF16();
-  }
-}
-
-TEST(CONVERT_NC_F32_F16, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestRunF32toF16();
-  }
-}
-
-TEST(CONVERT_NC_F32_F16, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunF32toF16();
-  }
-}
-
-TEST(CONVERT_NC_F32_F16, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunF32toF16();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestRunF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestRunF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestRunF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, output_scale) {
-  for (float output_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .output_scale(output_scale)
-        .iterations(3)
-        .TestRunF32toQS8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, output_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<int8_t>::min();
-       zero_point <= std::numeric_limits<int8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestRunF32toQS8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestRunQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestRunQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestRunQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, input_scale) {
-  for (float input_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .input_scale(input_scale)
-        .iterations(3)
-        .TestRunQS8toF32();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, input_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<int8_t>::min();
-       zero_point <= std::numeric_limits<int8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestRunQS8toF32();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestRunQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestRunQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestRunQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, input_scale) {
-  for (float input_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .input_scale(input_scale)
-        .iterations(3)
-        .TestRunQS16toQS8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, output_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<int8_t>::min();
-       zero_point <= std::numeric_limits<int8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestRunQS16toQS8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestRunF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestRunF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestRunF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, output_scale) {
-  for (float output_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .output_scale(output_scale)
-        .iterations(3)
-        .TestRunF32toQU8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, output_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<uint8_t>::min();
-       zero_point <= std::numeric_limits<uint8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestRunF32toQU8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestRunQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestRunQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestRunQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestRunQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, input_scale) {
-  for (float input_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .input_scale(input_scale)
-        .iterations(3)
-        .TestRunQU8toF32();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, input_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<uint8_t>::min();
-       zero_point <= std::numeric_limits<uint8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestRunQU8toF32();
-    }
-  }
-}
diff --git a/test/convert-nc.cc b/test/convert-nc.cc
index 33eeee1a0ca8..b796c9f0fa71 100644
--- a/test/convert-nc.cc
+++ b/test/convert-nc.cc
@@ -3,121 +3,314 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+#include
+#include
+#include
 #include
 #include
+#include
 #include
+#include
+#include
 #include
 #include
-#include "convert-operator-tester.h"
-
-TEST(CONVERT_NC_F16_F32, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestF16toF32();
-  }
-}
+#include "xnnpack.h"
+#include "xnnpack/config-types.h"
+#include "xnnpack/config.h"
+#include "xnnpack/internal.h"
+#include "xnnpack/math.h"
+#include "xnnpack/packq.h"
+#include "xnnpack/buffer.h"
+#include "replicable_random_device.h"
+
+class ConvertOperatorTester {
+ public:
+  ConvertOperatorTester& channels(size_t channels) {
+    assert(channels != 0);
+    this->channels_ = channels;
+    return *this;
+  }
+
+  size_t channels() const {
+    return this->channels_;
+  }
+
+  ConvertOperatorTester& input_stride(size_t input_stride) {
+    assert(input_stride != 0);
+    this->input_stride_ = input_stride;
+    return *this;
+  }
+
+  size_t input_stride() const {
+    if (this->input_stride_ == 0) {
+      return this->channels_;
+    } else {
+      assert(this->input_stride_ >= this->channels_);
+      return this->input_stride_;
+    }
+  }
-
-TEST(CONVERT_NC_F16_F32, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestF16toF32();
-  }
-}
+  ConvertOperatorTester& output_stride(size_t output_stride) {
+    assert(output_stride != 0);
+    this->output_stride_ = output_stride;
+    return *this;
+  }
-
-TEST(CONVERT_NC_F16_F32, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestF16toF32();
-  }
-}
+  size_t output_stride() const {
+    if (this->output_stride_ == 0) {
+      return this->channels_;
+    } else {
+      assert(this->output_stride_ >= this->channels_);
+      return this->output_stride_;
+    }
+  }
-
-TEST(CONVERT_NC_F16_F32, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestF16toF32();
-  }
-}
+  ConvertOperatorTester& batch_size(size_t batch_size) {
+    assert(batch_size != 0);
+    this->batch_size_ = batch_size;
+    return *this;
+  }
-
-TEST(CONVERT_NC_F16_F32, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestF16toF32();
-  }
-}
+  size_t batch_size() const {
+    return this->batch_size_;
+  }
-
-TEST(CONVERT_NC_F32_F16, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestF32toF16();
-  }
-}
-
-TEST(CONVERT_NC_F32_F16, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestF32toF16();
-  }
-}
+  ConvertOperatorTester& input_scale(float input_scale) {
+    assert(input_scale >= 0.0f);
+    assert(std::isnormal(input_scale));
+    this->input_scale_ = input_scale;
+    return *this;
+  }
+
+  float input_scale() const {
+    return this->input_scale_;
+  }
+
+  ConvertOperatorTester& output_scale(float output_scale) {
+    assert(output_scale >= 0.0f);
+    assert(std::isnormal(output_scale));
+    this->output_scale_ = output_scale;
+    return *this;
+  }
+
+  float output_scale() const {
+    return this->output_scale_;
+  }
+
+  ConvertOperatorTester& zero_point(int16_t zero_point) {
+    this->zero_point_ = zero_point;
+    return *this;
+  }
+
+  int16_t zero_point() const {
+    return this->zero_point_;
+  }
+
+  ConvertOperatorTester& iterations(size_t iterations) {
+    this->iterations_ = iterations;
+    return *this;
+  }
+
+  size_t iterations() const {
+    return this->iterations_;
+  }
+
+  void TestF16toQD8() const {
+    xnnpack::ReplicableRandomDevice rng;
+
+    xnnpack::Buffer<float> input_float((batch_size() - 1) * input_stride() +
+                                       channels());
+    xnnpack::Buffer<xnn_float16> input(XNN_EXTRA_BYTES / sizeof(xnn_float16) +
+                                       (batch_size() - 1) * input_stride() +
+                                       channels());
+    xnnpack::Buffer<int8_t> output((batch_size() - 1) * output_stride() +
+                                   channels());
+    xnnpack::Buffer<xnn_dynamic_quantization_params> quantization_params(
+        batch_size() + XNN_EXTRA_QUANTIZATION_PARAMS);
+    std::uniform_real_distribution<float> range_dist(-10, 10);
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      const float min_val = std::min(range_dist(rng), range_dist(rng));
+      const float max_val = std::uniform_real_distribution<float>(
+          min_val *
+              (1.0f + std::numeric_limits<int8_t>::max() * 6.103515625e-5f),
+          10.0f)(rng);
+      std::uniform_real_distribution<float> f32dist(min_val, max_val);
+      std::generate(input_float.begin(), input_float.end(),
+                    [&]() { return f32dist(rng); });
+      std::copy(input_float.begin(), input_float.end(), input.begin());
+      std::copy(input.begin(), input.begin() + channels(),
+                input_float.begin());
+
+      // Create, setup, run, and destroy Convert operator.
+      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+      xnn_operator_t convert_op = nullptr;
+
+      xnn_status status = xnn_create_convert_nc_f16_qd8(0, &convert_op);
+      if (status == xnn_status_unsupported_hardware) {
+        GTEST_SKIP();
+      }
+      ASSERT_EQ(xnn_status_success, status);
+      ASSERT_NE(nullptr, convert_op);
+
+      // Smart pointer to automatically delete convert op.
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>
+          auto_convert_op(convert_op, xnn_delete_operator);
+
+      ASSERT_EQ(xnn_status_success,
+                xnn_reshape_convert_nc_f16_qd8(
+                    convert_op, batch_size(), channels(), input_stride(),
+                    output_stride(), /*threadpool=*/nullptr));
+      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f16_qd8(
+                                        convert_op, input.data(), output.data(),
+                                        quantization_params.data()));
+      ASSERT_EQ(xnn_status_success,
+                xnn_run_operator(convert_op, /*threadpool=*/nullptr));
+
+      // Verify results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        const float* input_ptr = &input_float[i * input_stride()];
+        const auto minmax =
+            std::minmax_element(input_ptr, input_ptr + channels());
+        const float rmin = math_min_f32(0.0f, *minmax.first);
+        const float rmax = math_max_f32(0.0f, *minmax.second);
+        const float max_acceptable_error =
+            0.8f * (rmax - rmin) / std::numeric_limits<int8_t>::max();
+        for (size_t c = 0; c < channels(); c++) {
+          float expected = input_float[i * input_stride() + c];
+          int8_t quantized_val = (int)output[i * output_stride() + c];
+          float dequantized_val =
+              static_cast<float>(quantized_val -
+                                 quantization_params[i].zero_point) *
+              quantization_params[i].scale;
+          ASSERT_NEAR(expected, dequantized_val, max_acceptable_error)
+              << "at batch " << i << " / " << batch_size() << ", channel " << c
+              << " / " << channels() << ", rmin=" << rmin << ", rmax=" << rmax
+              << ", quantization_params={zero_point="
+              << quantization_params[i].zero_point
+              << ", scale=" << quantization_params[i].scale << "}";
+        }
+      }
+    }
+  }
-
-TEST(CONVERT_NC_F32_F16, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestF32toF16();
-  }
-}
+  void TestF32toQD8() const {
+    xnnpack::ReplicableRandomDevice rng;
+
+    xnnpack::Buffer<float> input(XNN_EXTRA_BYTES / sizeof(float) +
+      (batch_size() - 1) * input_stride() + channels());
+    xnnpack::Buffer<int8_t> output((batch_size() - 1) * output_stride() + channels());
+    xnnpack::Buffer<xnn_dynamic_quantization_params> quantization_params(batch_size() + XNN_EXTRA_QUANTIZATION_PARAMS);
+    std::uniform_real_distribution<float> range_dist(-100000, 100000);
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      const float first_val = range_dist(rng);
+      const float second_val = range_dist(rng);
+      std::uniform_real_distribution<float> f32dist(std::min(first_val, second_val), std::max(first_val, second_val));
+      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
+
+      // Create, setup, run, and destroy Convert operator.
+      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+      xnn_operator_t convert_op = nullptr;
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_create_convert_nc_f32_qd8(
+          0, &convert_op));
+      ASSERT_NE(nullptr, convert_op);
+
+      // Smart pointer to automatically delete convert op.
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
+
+      ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f32_qd8(convert_op, batch_size(),
+        channels(), input_stride(), output_stride(), /*threadpool=*/nullptr));
+      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f32_qd8(convert_op, input.data(), output.data(), quantization_params.data()));
+      ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr));
+
+      // Verify results.
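+      // Each batch row is dynamically quantized with its own zero_point and
+      // scale, so the check below dequantizes every int8 output with that
+      // row's parameters and expects it to match the float input to within
+      // 0.5001 / 127 of the row's value range (rmin..rmax, widened to
+      // include 0), i.e. just over half a quantization step.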
+      for (size_t i = 0; i < batch_size(); i++) {
+        const float* input_ptr = &input[i * input_stride()];
+        const auto minmax = std::minmax_element(input_ptr, input_ptr + channels());
+        const float rmin = math_min_f32(0.0f, *minmax.first);
+        const float rmax = math_max_f32(0.0f, *minmax.second);
+        const float max_acceptable_error = 0.5001f * (rmax - rmin) / std::numeric_limits<int8_t>::max();
+        for (size_t c = 0; c < channels(); c++) {
+          float expected = input[i * input_stride() + c];
+          int8_t quantized_val = output[i * output_stride() + c];
+          float dequantized_val = float(quantized_val - quantization_params[i].zero_point) * quantization_params[i].scale;
+          EXPECT_NEAR(expected, dequantized_val, max_acceptable_error)
+            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
+        }
+      }
+    }
+  }
-
-TEST(CONVERT_NC_F32_F16, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestF32toF16();
-  }
-}
+  void TestF32toQP8() const {
+    xnnpack::ReplicableRandomDevice rng;
+
+    // The parameters of the GEMM config are used as packing parameters.
+    const struct xnn_gemm_config* gemm_config = xnn_init_f32_gemm_nr2_config();
+
+    xnnpack::Buffer<float> input(XNN_EXTRA_BYTES / sizeof(float) +
+                                 (batch_size() - 1) * input_stride() + channels());
+    xnnpack::Buffer<int8_t> output(xnn_x8_packq_f32qp8_packed_size(
+        batch_size(), channels(), gemm_config->mr, 1 << gemm_config->log2_kr,
+        1 << gemm_config->log2_sr));
+    std::uniform_real_distribution<float> range_dist(-100000, 100000);
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      const float first_val = range_dist(rng);
+      const float second_val = range_dist(rng);
+      std::uniform_real_distribution<float> f32dist(
+          std::min(first_val, second_val), std::max(first_val, second_val));
+      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
+
+      // Create, setup, run, and destroy Convert operator.
+      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+      xnn_operator_t convert_op = nullptr;
+
+      ASSERT_EQ(xnn_status_success,
+                xnn_create_convert_nc_f32_qp8(0, &convert_op));
+      ASSERT_NE(nullptr, convert_op);
+
+      // Smart pointer to automatically delete convert op.
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>
+          auto_convert_op(convert_op, xnn_delete_operator);
+
+      ASSERT_EQ(xnn_status_success,
+                xnn_reshape_convert_nc_f32_qp8(convert_op, batch_size(),
+                                               channels(), input_stride(),
+                                               /*threadpool=*/nullptr));
+      ASSERT_EQ(xnn_status_success,
+                xnn_setup_convert_nc_f32_qp8(convert_op, input.data(),
+                                             output.data()));
+      ASSERT_EQ(xnn_status_success,
+                xnn_run_operator(convert_op, /*threadpool=*/nullptr));
+
+      // Verify results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        // const float* input_ptr = &input[i * input_stride()];
+        // const auto minmax =
+        //     std::minmax_element(input_ptr, input_ptr + channels());
+        // const float rmin = math_min_f32(0.0f, *minmax.first);
+        // const float rmax = math_max_f32(0.0f, *minmax.second);
+        // const float max_acceptable_error =
+        //     0.5001f * (rmax - rmin) / std::numeric_limits<int8_t>::max();
+
+        // TODO(b/340399245) - Find a way to extract individual quantized values
+        // from the packing?
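+        // Until the packed QP8 layout can be unpacked, this loop asserts
+        // nothing per value; the test only exercises create, reshape,
+        // setup, and run end-to-end.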
+        ASSERT_TRUE(true);
+      }
+    }
+  }
-
-TEST(CONVERT_NC_F32_F16, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestF32toF16();
-  }
-}
+ private:
+  size_t batch_size_{1};
+  size_t channels_{1};
+  size_t input_stride_{0};
+  size_t output_stride_{0};
+  float input_scale_{150.0f};
+  float output_scale_{3.0f};
+  int16_t zero_point_{1};
+  size_t iterations_{15};
+};
 
 TEST(CONVERT_NC_F16_QD8, unit_batch) {
   for (size_t channels = 1; channels < 100; channels++) {
     ConvertOperatorTester()
@@ -227,504 +420,6 @@ TEST(CONVERT_NC_F32_QD8, small_batch_with_input_and_output_stride) {
   }
 }
 
-TEST(CONVERT_NC_F32_QS8, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestF32toQS8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, output_scale) {
-  for (float output_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .output_scale(output_scale)
-        .iterations(3)
-        .TestF32toQS8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_F32_QS8, output_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<int8_t>::min();
-       zero_point <= std::numeric_limits<int8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestF32toQS8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestF32toQU8();
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, output_scale) {
-  for (float output_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .output_scale(output_scale)
-        .iterations(3)
-        .TestF32toQU8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_F32_QU8, output_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<uint8_t>::min();
-       zero_point <= std::numeric_limits<uint8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestF32toQU8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS8_F16, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestQS8toF16();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F16, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestQS8toF16();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F16, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestQS8toF16();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F16, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestQS8toF16();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F16, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestQS8toF16();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F16, input_scale) {
-  for (float input_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .input_scale(input_scale)
-        .iterations(3)
-        .TestQS8toF16();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS8_F16, input_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<int8_t>::min();
-       zero_point <= std::numeric_limits<int8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestQS8toF16();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestQS8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, input_scale) {
-  for (float input_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .input_scale(input_scale)
-        .iterations(3)
-        .TestQS8toF32();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS8_F32, input_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<int8_t>::min();
-       zero_point <= std::numeric_limits<int8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestQS8toF32();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestQS16toQS8();
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, input_scale) {
-  for (float input_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .input_scale(input_scale)
-        .iterations(3)
-        .TestQS16toQS8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QS16_QS8, output_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<int8_t>::min();
-       zero_point <= std::numeric_limits<int8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestQS16toQS8();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, unit_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(1)
-      .channels(channels)
-      .iterations(3)
-      .TestQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, small_batch) {
-  for (size_t channels = 1; channels < 100; channels++) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .iterations(3)
-      .TestQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, small_batch_with_input_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .iterations(3)
-      .TestQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, small_batch_with_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .output_stride(117)
-      .iterations(3)
-      .TestQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, small_batch_with_input_and_output_stride) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    ConvertOperatorTester()
-      .batch_size(3)
-      .channels(channels)
-      .input_stride(129)
-      .output_stride(117)
-      .iterations(3)
-      .TestQU8toF32();
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, input_scale) {
-  for (float input_scale : {0.1f, 1.0f, 10.0f}) {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .input_scale(input_scale)
-        .iterations(3)
-        .TestQU8toF32();
-    }
-  }
-}
-
-TEST(CONVERT_NC_QU8_F32, input_zero_point) {
-  for (int16_t zero_point = std::numeric_limits<uint8_t>::min();
-       zero_point <= std::numeric_limits<uint8_t>::max();
-       zero_point += 51)
-  {
-    for (size_t channels = 1; channels < 100; channels++) {
-      ConvertOperatorTester()
-        .batch_size(3)
-        .channels(channels)
-        .zero_point(zero_point)
-        .iterations(3)
-        .TestQU8toF32();
-    }
-  }
-}
-
 TEST(CONVERT_NC_F32_QP8, unit_batch) {
   for (size_t channels = 1; channels < 100; channels++) {
     ConvertOperatorTester()
diff --git a/test/convert-operator-tester.h b/test/convert-operator-tester.h
index e88eea376bfc..4bbb9dc1c397 100644
--- a/test/convert-operator-tester.h
+++ b/test/convert-operator-tester.h
@@ -26,994 +26,3 @@
 #include "xnnpack/buffer.h"
 #include "replicable_random_device.h"
 
-class ConvertOperatorTester {
- public:
-  ConvertOperatorTester& channels(size_t channels) {
-    assert(channels != 0);
-    this->channels_ = channels;
-    return *this;
-  }
-
-  size_t channels() const {
-    return this->channels_;
-  }
-
-  ConvertOperatorTester& input_stride(size_t input_stride) {
-    assert(input_stride != 0);
-    this->input_stride_ = input_stride;
-    return *this;
-  }
-
-  size_t input_stride() const {
-    if (this->input_stride_ == 0) {
-      return this->channels_;
-    } else {
-      assert(this->input_stride_ >= this->channels_);
-      return this->input_stride_;
-    }
-  }
-
-  ConvertOperatorTester& output_stride(size_t output_stride) {
-    assert(output_stride != 0);
-    this->output_stride_ = output_stride;
-    return *this;
-  }
-
-  size_t output_stride() const {
-    if (this->output_stride_ == 0) {
-      return this->channels_;
-    } else {
-      assert(this->output_stride_ >= this->channels_);
-      return this->output_stride_;
-    }
-  }
-
-  ConvertOperatorTester& batch_size(size_t batch_size) {
-    assert(batch_size != 0);
-    this->batch_size_ = batch_size;
-    return *this;
-  }
-
-  size_t batch_size() const {
-    return this->batch_size_;
-  }
-
-  ConvertOperatorTester& input_scale(float input_scale) {
-    assert(input_scale >= 0.0f);
-    assert(std::isnormal(input_scale));
-    this->input_scale_ = input_scale;
-    return *this;
-  }
-
-  float input_scale() const {
-    return this->input_scale_;
-  }
-
-  ConvertOperatorTester& output_scale(float output_scale) {
-    assert(output_scale >= 0.0f);
-    assert(std::isnormal(output_scale));
-    this->output_scale_ = output_scale;
-    return *this;
-  }
-
-  float output_scale() const {
-    return this->output_scale_;
-  }
-
-  ConvertOperatorTester& zero_point(int16_t zero_point) {
-    this->zero_point_ = zero_point;
-    return *this;
-  }
-
-  int16_t zero_point() const {
-    return this->zero_point_;
-  }
-
-  ConvertOperatorTester& iterations(size_t iterations) {
-    this->iterations_ = iterations;
-    return *this;
-  }
-
-  size_t iterations() const {
-    return this->iterations_;
-  }
-
-  void TestF16toF32() const {
-    xnnpack::ReplicableRandomDevice rng;
-    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);
-
-    xnnpack::Buffer<xnn_float16> input(XNN_EXTRA_BYTES / sizeof(xnn_float16) +
-      (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<float> output((batch_size() - 1) * output_stride() + channels());
-    xnnpack::Buffer<float> output_ref(batch_size() * channels());
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-      // Compute reference results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          output_ref[i * channels() + c] = input[i * input_stride() + c];
-        }
-      }
-
-      // Create, setup, run, and destroy Convert operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t convert_op = nullptr;
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_create_convert_nc_f16_f32(
-          0, &convert_op));
-      ASSERT_NE(nullptr, convert_op);
-
-      // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f16_f32(convert_op, batch_size(),
-        channels(), input_stride(), output_stride(), /*threadpool=*/nullptr));
-      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f16_f32(convert_op, input.data(), output.data()));
-      ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          EXPECT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c])
-            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
-        }
-      }
-    }
-  }
-
-  void TestF32toF16() const {
-    xnnpack::ReplicableRandomDevice rng;
-    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);
-
-    xnnpack::Buffer<float> input(XNN_EXTRA_BYTES / sizeof(float) +
-      (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<xnn_float16> output((batch_size() - 1) * output_stride() + channels());
-    xnnpack::Buffer<xnn_float16> output_ref(batch_size() * channels());
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-      // Compute reference results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          output_ref[i * channels() + c] = input[i * input_stride() + c];
-        }
-      }
-
-      // Create, setup, run, and destroy Convert operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t convert_op = nullptr;
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_create_convert_nc_f32_f16(
-          0, &convert_op));
-      ASSERT_NE(nullptr, convert_op);
-
-      // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f32_f16(convert_op, batch_size(),
-        channels(), input_stride(), output_stride(), /*threadpool=*/nullptr));
-      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f32_f16(convert_op, input.data(), output.data()));
-      ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          EXPECT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c])
-            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
-        }
-      }
-    }
-  }
-
-  void TestF16toQD8() const {
-    xnnpack::ReplicableRandomDevice rng;
-
-    xnnpack::Buffer<float> input_float((batch_size() - 1) * input_stride() +
-                                       channels());
-    xnnpack::Buffer<xnn_float16> input(XNN_EXTRA_BYTES / sizeof(xnn_float16) +
-                                       (batch_size() - 1) * input_stride() +
-                                       channels());
-    xnnpack::Buffer<int8_t> output((batch_size() - 1) * output_stride() +
-                                   channels());
-    xnnpack::Buffer<xnn_dynamic_quantization_params> quantization_params(
-        batch_size() + XNN_EXTRA_QUANTIZATION_PARAMS);
-    std::uniform_real_distribution<float> range_dist(-10, 10);
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      const float min_val = std::min(range_dist(rng), range_dist(rng));
-      const float max_val = std::uniform_real_distribution<float>(
-          min_val *
-              (1.0f + std::numeric_limits<int8_t>::max() * 6.103515625e-5f),
-          10.0f)(rng);
-      std::uniform_real_distribution<float> f32dist(min_val, max_val);
-      std::generate(input_float.begin(), input_float.end(),
-                    [&]() { return f32dist(rng); });
-      std::copy(input_float.begin(), input_float.end(), input.begin());
-      std::copy(input.begin(), input.begin() + channels(),
-                input_float.begin());
-
-      // Create, setup, run, and destroy Convert operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t convert_op = nullptr;
-
-      xnn_status status = xnn_create_convert_nc_f16_qd8(0, &convert_op);
-      if (status == xnn_status_unsupported_hardware) {
-        GTEST_SKIP();
-      }
-      ASSERT_EQ(xnn_status_success, status);
-      ASSERT_NE(nullptr, convert_op);
-
-      // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>
-          auto_convert_op(convert_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success,
-                xnn_reshape_convert_nc_f16_qd8(
-                    convert_op, batch_size(), channels(), input_stride(),
-                    output_stride(), /*threadpool=*/nullptr));
-      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f16_qd8(
-                                        convert_op, input.data(), output.data(),
-                                        quantization_params.data()));
-      ASSERT_EQ(xnn_status_success,
-                xnn_run_operator(convert_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        const float* input_ptr = &input_float[i * input_stride()];
-        const auto minmax =
-            std::minmax_element(input_ptr, input_ptr + channels());
-        const float rmin = math_min_f32(0.0f, *minmax.first);
-        const float rmax = math_max_f32(0.0f, *minmax.second);
-        const float max_acceptable_error =
-            0.8f * (rmax - rmin) / std::numeric_limits<int8_t>::max();
-        for (size_t c = 0; c < channels(); c++) {
-          float expected = input_float[i * input_stride() + c];
-          int8_t quantized_val = (int)output[i * output_stride() + c];
-          float dequantized_val =
-              static_cast<float>(quantized_val -
-                                 quantization_params[i].zero_point) *
-              quantization_params[i].scale;
-          ASSERT_NEAR(expected, dequantized_val, max_acceptable_error)
-              << "at batch " << i << " / " << batch_size() << ", channel " << c
-              << " / " << channels() << ", rmin=" << rmin << ", rmax=" << rmax
-              << ", quantization_params={zero_point="
-              << quantization_params[i].zero_point
-              << ", scale=" << quantization_params[i].scale << "}";
-        }
-      }
-    }
-  }
-
-  void TestF32toQD8() const {
-    xnnpack::ReplicableRandomDevice rng;
-
-    xnnpack::Buffer<float> input(XNN_EXTRA_BYTES / sizeof(float) +
-      (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<int8_t> output((batch_size() - 1) * output_stride() + channels());
-    xnnpack::Buffer<xnn_dynamic_quantization_params> quantization_params(batch_size() + XNN_EXTRA_QUANTIZATION_PARAMS);
-    std::uniform_real_distribution<float> range_dist(-100000, 100000);
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      const float first_val = range_dist(rng);
-      const float second_val = range_dist(rng);
-      std::uniform_real_distribution<float> f32dist(std::min(first_val, second_val), std::max(first_val, second_val));
-      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-      // Create, setup, run, and destroy Convert operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t convert_op = nullptr;
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_create_convert_nc_f32_qd8(
-          0, &convert_op));
-      ASSERT_NE(nullptr, convert_op);
-
-      // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f32_qd8(convert_op, batch_size(),
-        channels(), input_stride(), output_stride(), /*threadpool=*/nullptr));
-      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f32_qd8(convert_op, input.data(), output.data(), quantization_params.data()));
-      ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        const float* input_ptr = &input[i * input_stride()];
-        const auto minmax = std::minmax_element(input_ptr, input_ptr + channels());
-        const float rmin = math_min_f32(0.0f, *minmax.first);
-        const float rmax = math_max_f32(0.0f, *minmax.second);
-        const float max_acceptable_error = 0.5001f * (rmax - rmin) / std::numeric_limits<int8_t>::max();
-        for (size_t c = 0; c < channels(); c++) {
-          float expected = input[i * input_stride() + c];
-          int8_t quantized_val = output[i * output_stride() + c];
-          float dequantized_val = float(quantized_val - quantization_params[i].zero_point) * quantization_params[i].scale;
-          EXPECT_NEAR(expected, dequantized_val, max_acceptable_error)
-            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
-        }
-      }
-    }
-  }
-
-  void TestF32toQP8() const {
-    xnnpack::ReplicableRandomDevice rng;
-
-    // The parameters of the GEMM config are used as packing parameters.
-    const struct xnn_gemm_config* gemm_config = xnn_init_f32_gemm_nr2_config();
-
-    xnnpack::Buffer<float> input(XNN_EXTRA_BYTES / sizeof(float) +
-      (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<int8_t> output(xnn_x8_packq_f32qp8_packed_size(
-        batch_size(), channels(), gemm_config->mr, 1 << gemm_config->log2_kr,
-        1 << gemm_config->log2_sr));
-    std::uniform_real_distribution<float> range_dist(-100000, 100000);
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      const float first_val = range_dist(rng);
-      const float second_val = range_dist(rng);
-      std::uniform_real_distribution<float> f32dist(
-          std::min(first_val, second_val), std::max(first_val, second_val));
-      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-      // Create, setup, run, and destroy Convert operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t convert_op = nullptr;
-
-      ASSERT_EQ(xnn_status_success,
-                xnn_create_convert_nc_f32_qp8(0, &convert_op));
-      ASSERT_NE(nullptr, convert_op);
-
-      // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>
-          auto_convert_op(convert_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success,
-                xnn_reshape_convert_nc_f32_qp8(convert_op, batch_size(),
-                                               channels(), input_stride(),
-                                               /*threadpool=*/nullptr));
-      ASSERT_EQ(xnn_status_success,
-                xnn_setup_convert_nc_f32_qp8(convert_op, input.data(),
-                                             output.data()));
-      ASSERT_EQ(xnn_status_success,
-                xnn_run_operator(convert_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        // const float* input_ptr = &input[i * input_stride()];
-        // const auto minmax =
-        //     std::minmax_element(input_ptr, input_ptr + channels());
-        // const float rmin = math_min_f32(0.0f, *minmax.first);
-        // const float rmax = math_max_f32(0.0f, *minmax.second);
-        // const float max_acceptable_error =
-        //     0.5001f * (rmax - rmin) / std::numeric_limits<int8_t>::max();
-
-        // TODO(b/340399245) - Find a way to extract individual quantized values
-        // from the packing?
-        ASSERT_TRUE(true);
-      }
-    }
-  }
-
-  void TestF32toQS8() const {
-    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
-    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
-
-    xnnpack::ReplicableRandomDevice rng;
-    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);
-
-    xnnpack::Buffer<float> input(XNN_EXTRA_BYTES / sizeof(float) +
-      (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<int8_t> output((batch_size() - 1) * output_stride() + channels());
-    xnnpack::Buffer<int8_t> output_ref(batch_size() * channels());
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-      // Compute reference results.
-      const float inv_scale = 1.0f / output_scale();
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          float scaled_input = input[i * input_stride() + c] * inv_scale;
-          scaled_input = std::min(scaled_input, float(std::numeric_limits<int8_t>::max() - zero_point()));
-          scaled_input = std::max(scaled_input, float(std::numeric_limits<int8_t>::min() - zero_point()));
-          output_ref[i * channels() + c] = int8_t(std::lrintf(scaled_input) + long(zero_point()));
-        }
-      }
-
-      // Create, setup, run, and destroy Convert operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t convert_op = nullptr;
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_create_convert_nc_f32_qs8(
-          output_scale(), int8_t(zero_point()),
-          0, &convert_op));
-      ASSERT_NE(nullptr, convert_op);
-
-      // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f32_qs8(convert_op, batch_size(),
-        channels(), input_stride(), output_stride(), /*threadpool=*/nullptr));
-      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f32_qs8(convert_op, input.data(), output.data()));
-      ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          EXPECT_EQ(int32_t(output_ref[i * channels() + c]), int32_t(output[i * output_stride() + c]))
-            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
-        }
-      }
-    }
-  }
-
-  void TestF32toQU8() const {
-    ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
-    ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
-
-    xnnpack::ReplicableRandomDevice rng;
-    std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);
-
-    xnnpack::Buffer<float> input(XNN_EXTRA_BYTES / sizeof(float) +
-      (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<uint8_t> output((batch_size() - 1) * output_stride() + channels());
-    xnnpack::Buffer<uint8_t> output_ref(batch_size() * channels());
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-      // Compute reference results.
-      const float inv_scale = 1.0f / output_scale();
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          float scaled_input = input[i * input_stride() + c] * inv_scale;
-          scaled_input = std::min(scaled_input, float(std::numeric_limits<uint8_t>::max() - zero_point()));
-          scaled_input = std::max(scaled_input, float(std::numeric_limits<uint8_t>::min() - zero_point()));
-          output_ref[i * channels() + c] = uint8_t(std::lrintf(scaled_input) + long(zero_point()));
-        }
-      }
-
-      // Create, setup, run, and destroy Convert operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t convert_op = nullptr;
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_create_convert_nc_f32_qu8(
-          output_scale(), uint8_t(zero_point()),
-          0, &convert_op));
-      ASSERT_NE(nullptr, convert_op);
-
-      // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f32_qu8(convert_op, batch_size(),
-        channels(), input_stride(), output_stride(), /*threadpool=*/nullptr));
-      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f32_qu8(convert_op, input.data(), output.data()));
-      ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          EXPECT_EQ(uint32_t(output_ref[i * channels() + c]), uint32_t(output[i * output_stride() + c]))
-            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
-        }
-      }
-    }
-  }
-
-  void TestQS8toF16() const {
-    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
-    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
-
-    xnnpack::ReplicableRandomDevice rng;
-    std::uniform_int_distribution<int32_t> i8dist(
-      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
-
-    xnnpack::Buffer<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
-      (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<xnn_float16> output((batch_size() - 1) * output_stride() + channels());
-    xnnpack::Buffer<xnn_float16> output_ref(batch_size() * channels());
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
-
-      const float fp16_scale = xnn_float16(input_scale());
-      // Compute reference results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          output_ref[i * channels() + c] = xnn_float16(float(input[i * input_stride() + c] - zero_point()) * fp16_scale);
-        }
-      }
-
-      // Create, setup, run, and destroy Convert operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t convert_op = nullptr;
-
-      xnn_status status = xnn_create_convert_nc_qs8_f16(
-        input_scale(), int8_t(zero_point()),
-        0, &convert_op);
-      if (status == xnn_status_unsupported_hardware) {
-        GTEST_SKIP();
-      }
-      ASSERT_EQ(xnn_status_success, status);
-      ASSERT_NE(nullptr, convert_op);
-
-      // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_qs8_f16(convert_op, batch_size(),
-        channels(), input_stride(), output_stride(), /*threadpool=*/nullptr));
-      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_qs8_f16(convert_op, input.data(), output.data()));
-      ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          const float tolerance = std::max(output_ref[i * channels() + c] * 1e-2, 1e-4);
-          EXPECT_NEAR(output_ref[i * channels() + c], output[i * output_stride() + c], tolerance)
-            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
-        }
-      }
-    }
-  }
-
-  void TestQS8toF32() const {
-    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
-    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
-
-    xnnpack::ReplicableRandomDevice rng;
-    std::uniform_int_distribution<int32_t> i8dist(
-      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
-
-    xnnpack::Buffer<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
-      (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<float> output((batch_size() - 1) * output_stride() + channels());
-    xnnpack::Buffer<float> output_ref(batch_size() * channels());
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
-
-      // Compute reference results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          output_ref[i * channels() + c] = float(input[i * input_stride() + c] - zero_point()) * input_scale();
-        }
-      }
-
-      // Create, setup, run, and destroy Convert operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t convert_op = nullptr;
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_create_convert_nc_qs8_f32(
-          input_scale(), int8_t(zero_point()),
-          0, &convert_op));
-      ASSERT_NE(nullptr, convert_op);
-
-      // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_qs8_f32(convert_op, batch_size(),
-        channels(), input_stride(), output_stride(), /*threadpool=*/nullptr));
-      ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_qs8_f32(convert_op, input.data(), output.data()));
-      ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t c = 0; c < channels(); c++) {
-          EXPECT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c])
-            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
-        }
-      }
-    }
-  }
-
-  void TestQS16toQS8() const {
-    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
-    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
-
-    xnnpack::ReplicableRandomDevice rng;
-    std::uniform_int_distribution<int16_t> qs16dist;
-
-    xnnpack::Buffer<int16_t> input(XNN_EXTRA_BYTES / sizeof(int16_t) +
-      (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<int8_t> output((batch_size() - 1) * output_stride() + channels());
-    xnnpack::Buffer<int8_t> output_ref(batch_size() * channels());
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(input.begin(), input.end(), [&]() { return qs16dist(rng); });
-
-      // Compute reference results.
- const int64_t multiplier = static_cast (std::llrintf(32768.0f * input_scale())); - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const int64_t input_value = input[i * input_stride() + c]; - int32_t output_value = static_cast(static_cast(input_value * multiplier + UINT64_C(0x4000)) >> 15) + zero_point(); - output_value = std::min(output_value, std::numeric_limits::max()); - output_value = std::max(output_value, std::numeric_limits::min()); - output_ref[i * channels() + c] = static_cast(output_value); - } - } - - // Create, setup, run, and destroy Convert operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t convert_op = nullptr; - - ASSERT_EQ(xnn_status_success, - xnn_create_convert_nc_qs16_qs8( - input_scale(), 1.0f, int8_t(zero_point()), - 0, &convert_op)); - ASSERT_NE(nullptr, convert_op); - - // Smart pointer to automatically delete convert op. - std::unique_ptr auto_convert_op(convert_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_qs16_qs8(convert_op, batch_size(), - channels(), input_stride(), output_stride(), /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_qs16_qs8(convert_op, input.data(), output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(int32_t(output_ref[i * channels() + c]), int32_t(output[i * output_stride() + c])) - << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels(); - } - } - } - } - - void TestQU8toF32() const { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(uint8_t) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - output_ref[i * channels() + c] = float(input[i * input_stride() + c] - zero_point()) * input_scale(); - } - } - - // Create, setup, run, and destroy Convert operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t convert_op = nullptr; - - ASSERT_EQ(xnn_status_success, - xnn_create_convert_nc_qu8_f32( - input_scale(), uint8_t(zero_point()), - 0, &convert_op)); - ASSERT_NE(nullptr, convert_op); - - // Smart pointer to automatically delete convert op. - std::unique_ptr auto_convert_op(convert_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_qu8_f32(convert_op, batch_size(), - channels(), input_stride(), output_stride(), /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_qu8_f32(convert_op, input.data(), output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(convert_op, /*threadpool=*/nullptr)); - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c]) - << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels(); - } - } - } - } - - void TestRunF16toF32() const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(xnn_float16) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - output_ref[i * channels() + c] = input[i * input_stride() + c]; - } - } - - // Create, setup, run, and destroy Convert operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - - ASSERT_EQ(xnn_status_success, - xnn_run_convert_nc_f16_f32( - channels(), - input_stride(), output_stride(), - batch_size(), - input.data(), output.data(), - 0, - /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c]) - << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels(); - } - } - } - } - - void TestRunF32toF16() const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(float) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - output_ref[i * channels() + c] = input[i * input_stride() + c]; - } - } - - // Create, setup, run, and destroy Convert operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - - ASSERT_EQ(xnn_status_success, - xnn_run_convert_nc_f32_f16( - channels(), - input_stride(), output_stride(), - batch_size(), - input.data(), output.data(), - 0, - /*threadpool=*/nullptr)); - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c]) - << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels(); - } - } - } - } - - void TestRunF32toQS8() const { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(float) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results. - const float inv_scale = 1.0f / output_scale(); - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); - scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); - output_ref[i * channels() + c] = int8_t(std::lrintf(scaled_input) + long(zero_point())); - } - } - - // Create, setup, run, and destroy Convert operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - ASSERT_EQ(xnn_status_success, - xnn_run_convert_nc_f32_qs8( - channels(), input_stride(), output_stride(),batch_size(), - input.data(), output.data(), - output_scale(), int8_t(zero_point()), - 0, - /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(int32_t(output_ref[i * channels() + c]), int32_t(output[i * output_stride() + c])) - << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels(); - } - } - } - } - - void TestRunQS8toF32() const { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(int8_t) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - output_ref[i * channels() + c] = float(input[i * input_stride() + c] - zero_point()) * input_scale(); - } - } - - // Create, setup, run, and destroy Convert operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - - ASSERT_EQ(xnn_status_success, - xnn_run_convert_nc_qs8_f32( - channels(), input_stride(), output_stride(), batch_size(), - input.data(), output.data(), - input_scale(), int8_t(zero_point()), - 0, /*threadpool=*/nullptr)); - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c]) - << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels(); - } - } - } - } - - void TestRunQS16toQS8() const { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution qs16dist; - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(int16_t) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return qs16dist(rng); }); - - // Compute reference results. - const int64_t multiplier = static_cast (std::llrintf(32768.0f * input_scale())); - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const int64_t input_value = input[i * input_stride() + c]; - int32_t output_value = static_cast(static_cast(input_value * multiplier + UINT64_C(0x4000)) >> 15) + zero_point(); - output_value = std::min(output_value, std::numeric_limits::max()); - output_value = std::max(output_value, std::numeric_limits::min()); - output_ref[i * channels() + c] = static_cast(output_value); - } - } - - // Create, setup, run, and destroy Convert operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - - ASSERT_EQ(xnn_status_success, - xnn_run_convert_nc_qs16_qs8( - channels(), input_stride(), output_stride(), batch_size(), - input.data(), output.data(), - input_scale(), 1.0f, int8_t(zero_point()), - 0, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(int32_t(output_ref[i * channels() + c]), int32_t(output[i * output_stride() + c])) - << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels(); - } - } - } - } - - void TestRunF32toQU8() const { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(float) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results. - const float inv_scale = 1.0f / output_scale(); - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - float scaled_input = input[i * input_stride() + c] * inv_scale; - scaled_input = std::min(scaled_input, float(std::numeric_limits::max() - zero_point())); - scaled_input = std::max(scaled_input, float(std::numeric_limits::min() - zero_point())); - output_ref[i * channels() + c] = uint8_t(std::lrintf(scaled_input) + long(zero_point())); - } - } - - // Create, setup, run, and destroy Convert operator. 
- ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - - ASSERT_EQ(xnn_status_success, - xnn_run_convert_nc_f32_qu8( - channels(), input_stride(), output_stride(), - batch_size(), input.data(), output.data(), - output_scale(), uint8_t(zero_point()), - 0, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(uint32_t(output_ref[i * channels() + c]), uint32_t(output[i * output_stride() + c])) - << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels(); - } - } - } - } - - void TestRunQU8toF32() const { - ASSERT_GE(zero_point(), std::numeric_limits::min()); - ASSERT_LE(zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(uint8_t) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - output_ref[i * channels() + c] = float(input[i * input_stride() + c] - zero_point()) * input_scale(); - } - } - - // Create, setup, run, and destroy Convert operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - ASSERT_EQ(xnn_status_success, - xnn_run_convert_nc_qu8_f32( - channels(), input_stride(), output_stride(), - batch_size(), input.data(), output.data(), - input_scale(), uint8_t(zero_point()), - 0, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c]) - << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels(); - } - } - } - } - - private: - size_t batch_size_{1}; - size_t channels_{1}; - size_t input_stride_{0}; - size_t output_stride_{0}; - float input_scale_{150.0f}; - float output_scale_{3.0f}; - int16_t zero_point_{1}; - size_t iterations_{15}; -}; diff --git a/test/convert.cc b/test/convert.cc deleted file mode 100644 index 931f18d5f51b..000000000000 --- a/test/convert.cc +++ /dev/null @@ -1,945 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using ConvertTestF32ToF16 = UnaryTest; -using ConvertTestF32ToQD8 = UnaryTest; -using ConvertTestF32ToQS8 = UnaryTest; -using ConvertTestF32ToQU8 = UnaryTest; - -using ConvertTestQS8ToQS8 = UnaryTest; -using ConvertTestQS8ToF16 = UnaryTest; -using ConvertTestQS8ToF32 = UnaryTest; - -using ConvertTestQU8ToQU8 = UnaryTest; -using ConvertTestQU8ToF32 = UnaryTest; - -using ConvertTestF16ToF32 = UnaryTest; -using ConvertTestF16ToQD8 = UnaryTest; - -TEST_F(ConvertTestF16ToF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_convert); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16_to_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ConvertTestF32ToF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_convert); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32_to_fp16); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ConvertTestF32ToQS8, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - 
ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, signed_zero_point, scale, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_convert); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32_to_qs8); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ConvertTestF32ToQU8, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, unsigned_zero_point, scale, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_convert); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32_to_qu8); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ConvertTestQS8ToF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, signed_zero_point, scale, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, 
xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_convert); - ASSERT_EQ(node->compute_type, xnn_compute_type_qs8_to_fp16); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ConvertTestQS8ToF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, signed_zero_point, scale, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_convert); - ASSERT_EQ(node->compute_type, xnn_compute_type_qs8_to_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ConvertTestQS8ToQS8, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - const int32_t input_zero_point = i8dist(rng); - const int32_t output_zero_point = i8dist(rng); - // Scale distributions chosen to guarantee 2**-8 <= input_scale / output_scale <= 2**7 - const float input_scale = std::uniform_real_distribution(0.0883883f, 11.3137f)(rng); - const float output_scale = std::uniform_real_distribution(0.0883883f, 11.3137f)(rng); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_convert); - ASSERT_EQ(node->compute_type, xnn_compute_type_qs8); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - 
ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ConvertTestQU8ToF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, unsigned_zero_point, scale, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_convert); - ASSERT_EQ(node->compute_type, xnn_compute_type_qu8_to_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ConvertTestQU8ToQU8, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - const int32_t input_zero_point = u8dist(rng); - const int32_t output_zero_point = u8dist(rng); - // Scale distributions chosen to guarantee 2**-8 <= input_scale / output_scale <= 2**7 - const float input_scale = std::uniform_real_distribution(0.0883883f, 11.3137f)(rng); - const float output_scale = std::uniform_real_distribution(0.0883883f, 11.3137f)(rng); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input_zero_point, input_scale, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, output_zero_point, output_scale, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_convert); - ASSERT_EQ(node->compute_type, xnn_compute_type_qu8); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(ConvertTestF16ToF32, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. 
- xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_convert_nc_f16_f32(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f16_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f16_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(ConvertTestF32ToF16, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_convert_nc_f32_f16(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f32_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f32_f16(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(ConvertTestF32ToQS8, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_convert_nc_f32_qs8( - scale, signed_zero_point, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f32_qs8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f32_qs8(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, signed_zero_point, scale, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(ConvertTestQS8ToF16, matches_operator_api) -{ - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_convert_nc_qs8_f16(scale, signed_zero_point, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_qs8_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_qs8_f16(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, signed_zero_point, scale, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(ConvertTestF32ToQU8, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_convert_nc_f32_qu8( - scale, unsigned_zero_point, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_f32_qu8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_f32_qu8(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, unsigned_zero_point, scale, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(ConvertTestQS8ToF32, matches_operator_api) -{ - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_convert_nc_qs8_f32(scale, signed_zero_point, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_qs8_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_qs8_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, signed_zero_point, scale, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(ConvertTestQS8ToQS8, matches_operator_api) -{ - const int8_t input_zero_point = i8dist(rng); - const int8_t output_zero_point = i8dist(rng); - const float input_scale = std::uniform_real_distribution(0.25f, 4.0f)(rng); - const float output_scale = std::uniform_real_distribution(0.25f, 4.0f)(rng); - - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_convert_nc_qs8(input_scale, input_zero_point, output_scale, output_zero_point, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_qs8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_qs8(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input_zero_point, input_scale, - dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, - dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - const std::array external = { - xnn_external_value{input_id, input.data()}, - xnn_external_value{output_id, subgraph_output.data()} - }; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(ConvertTestQU8ToF32, matches_operator_api) -{ - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_convert_nc_qu8_f32(scale, unsigned_zero_point, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_qu8_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_qu8_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success,
-    xnn_define_quantized_tensor_value(
-      subgraph, xnn_datatype_quint8, unsigned_zero_point, scale, dims.size(), dims.data(), nullptr, 0,
-      /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(ConvertTestQU8ToQU8, matches_operator_api)
-{
-  const uint8_t input_zero_point = u8dist(rng);
-  const uint8_t output_zero_point = u8dist(rng);
-  const float input_scale = std::uniform_real_distribution<float>(0.25f, 4.0f)(rng);
-  const float output_scale = std::uniform_real_distribution<float>(0.25f, 4.0f)(rng);
-
-  std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status =
-    xnn_create_convert_nc_qu8(input_scale, input_zero_point, output_scale, output_zero_point, /*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_convert_nc_qu8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_convert_nc_qu8(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_quint8, input_zero_point, input_scale,
-                          dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_quint8, output_zero_point, output_scale,
-                          dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  const std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()},
-    xnn_external_value{output_id, subgraph_output.data()}
-  };
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(ConvertTestF16ToQD8, define)
-{
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_dynamically_quantized_tensor_value(
-                          subgraph, xnn_datatype_qdint8, dims.size(), /*num_nonbatch_dims=*/1, dims.data(),
-                          /*external_id=*/1, /*flags=*/0, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_convert);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp16_to_qd8);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(ConvertTestF32ToQD8, define)
-{
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success,
-    xnn_define_dynamically_quantized_tensor_value(
-      subgraph, xnn_datatype_qdint8, dims.size(), /*num_nonbatch_dims=*/1, dims.data(),
-      /*external_id=*/1, /*flags=*/0, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_convert);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp32_to_qd8);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
diff --git a/test/convolution-2d.cc b/test/convolution-2d.cc
index c0d525315807..439d887088df 100644
--- a/test/convolution-2d.cc
+++ b/test/convolution-2d.cc
@@ -377,7 +377,7 @@ TEST_F(ConvolutionTestQD8F16QC8W, internally_allocated_dynamic_quantization_para
   ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
 
   xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0));
+  ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0));
   ASSERT_EQ(xnn_status_success, xnn_define_convolution_2d(
                                   subgraph, input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, kernel_height,
@@ -549,7 +549,7 @@ TEST_F(ConvolutionTestQD8F32QC8W, internally_allocated_dynamic_quantization_para
   ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
 
   xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0));
+  ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0));
   ASSERT_EQ(xnn_status_success, xnn_define_convolution_2d(
                                   subgraph, input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, kernel_height,
diff --git a/test/deconvolution-2d.cc b/test/deconvolution-2d.cc
index 01c80f733bd0..07ac07ab25ad 100644
--- a/test/deconvolution-2d.cc
+++ b/test/deconvolution-2d.cc
@@ -989,7 +989,7 @@ TEST_F(DeconvolutionTestQD8F32QC8W, internally_allocated_dynamic_quantization_pa
       subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(), nullptr, /*external_id=*/3, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
   ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-  ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0));
+  ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0));
   ASSERT_EQ(
     xnn_status_success, xnn_define_deconvolution_2d(
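The hunks above are the template for the whole migration: each dedicated define function collapses into `xnn_define_unary`, which takes an operator enumerator (here `xnn_unary_convert`) and an optional parameter struct. A minimal before/after sketch, using only the signature visible in the hunks above (surrounding subgraph setup elided):

    // Before: one define function per operator.
    xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0);

    // After: a single entry point parameterized by operator type; convert
    // takes no extra parameters, so the params pointer may be null.
    xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr,
                     input_id, dq_quantized_id, /*flags=*/0);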
diff --git a/test/elu-nc.cc b/test/elu-nc.cc
deleted file mode 100644
index 092b131706ce..000000000000
--- a/test/elu-nc.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include "xnnpack.h"
-#include "unary-operator-tester.h"
-#include "pthreadpool.h"
-
-namespace xnnpack {
-
-class LeakyReLUOperatorTester : public UnaryOperatorTester {
- public:
-  LeakyReLUOperatorTester() : UnaryOperatorTester() {
-    range_f32_ = {-20.0f, 20.0f};
-    range_f16_ = {-25.0f, 25.0f};
-  }
-
-  inline LeakyReLUOperatorTester& alpha(float alpha) {
-    assert(alpha > 0.0f);
-    assert(alpha < 1.0f);
-    this->alpha_ = alpha;
-    return *this;
-  }
-
-  inline float alpha() const { return this->alpha_; }
-
- protected:
-  // Computes the expected result for some input `x`. Subclasses should override
-  // this function with their own reference function.
-  float RefFunc(float x) const override {
-    return std::signbit(x) ? std::expm1(x) * alpha() : x;
-  }
-
-  // Computes the absolute tolerance for a reference value `y_ref`. Tests will
-  // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. Note that for `fp16`
-  // tests, both `y` and `y_ref` will be converted to `float` for the tolerance
-  // evaluation.
-  float AbsTolF32(float) const override { return 5e-6f; }
-  float AbsTolF16(float y_ref) const override {
-    return std::max(1.0e-4f, std::abs(y_ref) * 5.0e-3f);
-  }
-  float AbsTolQS8(float) const override { return 0.6f; };
-  float AbsTolQU8(float) const override { return 0.6f; };
-
-  xnn_status CreateOpF32(uint32_t flags,
-                         xnn_operator_t* op_out) const override {
-    return xnn_create_elu_nc_f32(alpha(), flags, op_out);
-  }
-  CREATE_OP_RESHAPE_OVERRIDE_F32(elu);
-  CREATE_OP_SETUP_OVERRIDE_F32(elu);
-  xnn_status RunOpF32(size_t channels, size_t input_stride,
-                      size_t output_stride, size_t batch_size,
-                      const float* input, float* output, uint32_t flags,
-                      pthreadpool_t threadpool) const override {
-    return xnn_run_elu_nc_f32(channels, input_stride, output_stride, batch_size,
-                              input, output, alpha(), flags, threadpool);
-  }
-
-  xnn_status CreateOpF16(uint32_t flags,
-                         xnn_operator_t* op_out) const override {
-    return xnn_create_elu_nc_f16(alpha(), flags, op_out);
-  }
-  CREATE_OP_RESHAPE_OVERRIDE_F16(elu);
-  CREATE_OP_SETUP_OVERRIDE_F16(elu);
-
-  xnn_status CreateOpQS8(int8_t input_zero_point, float input_scale,
-                         int8_t output_zero_point, float output_scale,
-                         int8_t output_min, int8_t output_max, uint32_t flags,
-                         xnn_operator_t* op_out) const override {
-    return xnn_create_elu_nc_qs8(alpha(), input_zero_point, input_scale,
-                                 output_zero_point, output_scale, output_min,
-                                 output_max, flags, op_out);
-  }
-  CREATE_OP_RESHAPE_OVERRIDE_QS8(elu);
-  CREATE_OP_SETUP_OVERRIDE_QS8(elu);
-
- private:
-  float alpha_ = 0.5f;
-};
-
-CREATE_UNARY_FLOAT_TESTS(F32, LeakyReLUOperatorTester);
-CREATE_UNARY_FLOAT_TESTS(RunF32, LeakyReLUOperatorTester);
-#ifndef XNN_EXCLUDE_F16_TESTS
-CREATE_UNARY_FLOAT_TESTS(F16, LeakyReLUOperatorTester);
-#endif  // XNN_EXCLUDE_F16_TESTS
-
-CREATE_UNARY_QUANTIZED_TESTS(QS8, LeakyReLUOperatorTester);
-
-#ifndef XNN_EXCLUDE_F16_TESTS
-TEST(ELU_NC_F16, small_batch_with_alpha) {
-  for (size_t batch_size = 1; batch_size <= 3; batch_size += 2) {
-    for (size_t channels = 1; channels < 100; channels += 15) {
-      for (float alpha = 1.0e-4f; alpha < 1.0f; alpha *= 3.14159265f) {
-        LeakyReLUOperatorTester()
-            .alpha(alpha)
-            .batch_size(3)
-            .channels(channels)
-            .iterations(1)
-            .TestF16();
-      }
-    }
-  }
-}
-#endif  // XNN_EXCLUDE_F16_TESTS
-
-TEST(ELU_NC_F32, small_batch_with_alpha) {
-  for (size_t batch_size = 1; batch_size <= 3; batch_size += 2) {
-    for (size_t channels = 1; channels < 100; channels += 15) {
-      for (float alpha = 1.0e-4f; alpha < 1.0f; alpha *= 3.14159265f) {
-        LeakyReLUOperatorTester()
-            .alpha(alpha)
-            .batch_size(3)
-            .channels(channels)
-            .iterations(1)
-            .TestF32();
-      }
-    }
-  }
-}
-
-TEST(ELU_NC_QS8, small_batch_with_alpha) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    for (float alpha = 1.0e-4f; alpha < 1.0f; alpha *= 3.14159265f) {
-      LeakyReLUOperatorTester()
-          .alpha(alpha)
-          .batch_size(3)
-          .channels(channels)
-          .iterations(1)
-          .TestQS8();
-    }
-  }
-}
-
-TEST(ELU_NC_QS8, strided_batch_with_alpha) {
-  for (size_t channels = 1; channels < 100; channels += 15) {
-    for (float alpha = 1.0e-4f; alpha < 1.0f; alpha *= 3.14159265f) {
-      LeakyReLUOperatorTester()
-          .alpha(alpha)
-          .batch_size(3)
-          .channels(channels)
-          .input_stride(129)
-          .output_stride(117)
-          .iterations(1)
-          .TestQS8();
-    }
-  }
-}
-};  // namespace xnnpack
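The deleted tester's `RefFunc` was the only place this file stated the expected ELU math. For the record, it is equivalent to this standalone sketch (function name is illustrative, not from the patch):

    #include <cmath>

    // ELU: alpha * (e^x - 1) for negative x, identity otherwise,
    // matching the deleted RefFunc above.
    float elu_ref(float x, float alpha) {
      return std::signbit(x) ? std::expm1(x) * alpha : x;
    }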
diff --git a/test/elu.cc b/test/elu.cc
deleted file mode 100644
index 39866bfa7da0..000000000000
--- a/test/elu.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-// Copyright 2022 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include "xnnpack.h"
-#include "xnnpack/math.h"
-#include "xnnpack/node-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/subgraph.h"
-#include "subgraph-unary-tester.h"
-
-using EluTestQS8 = UnaryTest<int8_t>;
-using EluTestF16 = UnaryTest<xnn_float16>;
-using EluTestF32 = UnaryTest<float>;
-
-TEST_F(EluTestQS8, define)
-{
-  const int32_t input_zero_point = i8dist(rng);
-  const float input_scale = scale_dist(rng);
-  const int32_t output_zero_point = input_zero_point;
-  const float output_scale = input_scale;
-  const float alpha = std::uniform_real_distribution<float>(1.0e-4f, 1.0f)(rng);
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(),
-                          nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(),
-                          nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_elu(subgraph, alpha, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_elu);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_qs8);
-  ASSERT_EQ(node->params.elu.alpha, alpha);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(EluTestF16, define)
-{
-  const float alpha = std::uniform_real_distribution<float>(1.0e-4f, 1.0f)(rng);
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_elu(subgraph, alpha, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_elu);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp16);
-  ASSERT_EQ(node->params.elu.alpha, alpha);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(EluTestF32, define)
-{
-  const float alpha = std::uniform_real_distribution<float>(1.0e-4f, 1.0f)(rng);
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_elu(subgraph, alpha, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_elu);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp32);
-  ASSERT_EQ(node->params.elu.alpha, alpha);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(EluTestQS8, matches_operator_api)
-{
-  const int32_t input_zero_point = i8dist(rng);
-  const float input_scale = scale_dist(rng);
-  const int32_t output_zero_point = input_zero_point;
-  const float output_scale = input_scale;
-  const float alpha = std::uniform_real_distribution<float>(1.0e-4f, 1.0f)(rng);
-  std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_elu_nc_qs8(
-    alpha, input_zero_point, input_scale, output_zero_point, output_scale, INT8_MIN,
-    INT8_MAX, /*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_elu_nc_qs8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_elu_nc_qs8(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_elu(subgraph, alpha, input_id, output_id, /*flags=*/0));
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(EluTestF16, matches_operator_api)
-{
-  const float alpha = std::uniform_real_distribution<float>(1.0e-4f, 1.0f)(rng);
-  std::uniform_real_distribution<float> f32dist(-255.0f, 255.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_elu_nc_f16(alpha, /*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_elu_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_elu_nc_f16(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_elu(subgraph, alpha, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(EluTestF32, matches_operator_api)
-{
-  const float alpha = std::uniform_real_distribution<float>(1.0e-4f, 1.0f)(rng);
-  std::uniform_real_distribution<float> f32dist(-255.0f, 255.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_elu_nc_f32(alpha, /*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_elu_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_elu_nc_f32(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_elu(subgraph, alpha, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
diff --git a/test/exp-nc.cc b/test/exp-nc.cc
deleted file mode 100644
index 44d87e07fc95..000000000000
--- a/test/exp-nc.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2024 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include
-#include
-
-#include "unary-operator-tester.h"
-
-namespace xnnpack {
-
-class ExpOperatorTester : public UnaryOperatorTester {
- public:
-  ExpOperatorTester() : UnaryOperatorTester() {
-    range_f32_ = {-10.f, 10.0f};
-    range_f16_ = {-10.f, 10.0f};
-  }
-
- protected:
-  // Computes the expected result for some input `x`. Subclasses should override
-  // this function with their own reference function.
-  float RefFunc(float x) const override { return std::exp(x); }
-
-  CREATE_STANDARD_OP_OVERRIDES_F32(exp);
-};
-
-CREATE_UNARY_FLOAT_TESTS(F32, ExpOperatorTester);
-
-};  // namespace xnnpack
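Under the parameterized API, per-operator subgraph tests like the deleted ELU and Exp ones fold into a single unary path. A hedged sketch of how an ELU node would be defined (the `params.elu.alpha` field appears in the f16-velu changes below; `xnn_unary_elu` is an assumed enumerator, named by analogy with `xnn_unary_convert` above):

    xnn_unary_params params;
    params.elu.alpha = 1.0f;  // field name as used in the velu tests below
    // xnn_unary_elu is assumed here, by analogy with xnn_unary_convert.
    xnn_define_unary(subgraph, xnn_unary_elu, &params,
                     input_id, output_id, /*flags=*/0);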
diff --git a/test/exp.cc b/test/exp.cc
deleted file mode 100644
index 65ed3e58f583..000000000000
--- a/test/exp.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2024 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include "xnnpack.h"
-#include "xnnpack/node-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/subgraph.h"
-#include "subgraph-unary-tester.h"
-
-using ExpTestF32 = UnaryTest<float>;
-
-TEST_F(ExpTestF32, define)
-{
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_exp(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_exp);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp32);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(ExpTestF32, matches_operator_api)
-{
-  std::uniform_real_distribution<float> f32dist(0.f, 255.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_exp_nc_f32(/*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_exp_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_exp_nc_f32(op, input.data(), operator_output.data()));
-
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_exp(subgraph, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
diff --git a/test/f16-f32-vcvt.cc b/test/f16-f32-vcvt.cc
index d723077bf52e..42a15c911405 100644
--- a/test/f16-f32-vcvt.cc
+++ b/test/f16-f32-vcvt.cc
@@ -2,21 +2,17 @@
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
-//
-// Auto-generated file. Do not edit!
-//   Microkernel: f16-f32-vcvt
-//   Generator: tools/generate-vcvt-test.py
 
 #include "xnnpack/microparams-init.h"
 #include "xnnpack/vcvt.h"
-#include "vcvt-microkernel-tester.h"
+#include "vunary-microkernel-tester.h"
 
-#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
-                                    datatype_in, datatype_out, params_type, init_params) \
-XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\
-XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);
+#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
+                                    datatype_in, datatype_out, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_lt) { TestBatchLT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-f32-vcvt/f16-f32-vcvt.h"
 #undef XNN_CVT_UKERNEL_WITH_PARAMS
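The output_scale/output_zero_point/output_saturation/output_overflow cases added to the float-to-quantized converters below all probe the same reference math. As a reminder, this is standard affine quantization (a sketch, not quoted from this patch; the rounding mode defaults to round-to-nearest-even):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // f32 -> qs8 reference: scale, shift by the zero point, saturate to
    // the int8 range, then round in the current rounding mode.
    int8_t quantize_qs8(float x, float inv_scale, int32_t zero_point) {
      const float scaled = x * inv_scale + static_cast<float>(zero_point);
      const float clamped = std::min(std::max(scaled, -128.0f), 127.0f);
      return static_cast<int8_t>(std::lrintf(clamped));
    }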
diff --git a/test/f16-qs8-vcvt.cc b/test/f16-qs8-vcvt.cc
index 89e963b8d479..5853699388af 100644
--- a/test/f16-qs8-vcvt.cc
+++ b/test/f16-qs8-vcvt.cc
@@ -2,26 +2,21 @@
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
-//
-// Auto-generated file. Do not edit!
-//   Microkernel: f16-qs8-vcvt
-//   Generator: tools/generate-vcvt-test.py
 
 #include "xnnpack/microparams-init.h"
 #include "xnnpack/vcvt.h"
-#include "vcvt-microkernel-tester.h"
+#include "vunary-microkernel-tester.h"
 
-#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
-                                    datatype_in, datatype_out, params_type, init_params) \
-XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
- \
-XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
- \
- \
-XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);
+#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
+                                    datatype_in, datatype_out, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_lt) { TestBatchLT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_scale) { TestOutputScale<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_zero_point) { TestOutputZeroPoint<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_saturation) { TestOutputSaturation<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_overflow) { TestOutputOverflow<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-qs8-vcvt/f16-qs8-vcvt.h"
 #undef XNN_CVT_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vabs.cc b/test/f16-vabs.cc
index bd9953a00ddd..0a1bb3185e70 100644
--- a/test/f16-vabs.cc
+++ b/test/f16-vabs.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs());
+using TestInfo = Abs;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vabs/f16-vabs.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vclamp.cc b/test/f16-vclamp.cc
index db5fd14048aa..d170aa432f03 100644
--- a/test/f16-vclamp.cc
+++ b/test/f16-vclamp.cc
@@ -24,15 +24,50 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params);
+using TestInfo = Clamp;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, clamp_min) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  const size_t batch_scale = get_batch_scale<datatype>(); \
+  const size_t batch_end = batch_tile * batch_scale; \
+  const size_t batch_step = \
+      batch_scale == 1 ? std::max<size_t>(1, batch_tile - 1) : batch_end - 1; \
+  for (size_t min = 1; min < 255; min = xnnpack::NextPrime(min)) { \
+    for (size_t batch_size = 1; batch_size <= 5 * batch_end; \
+         batch_size += batch_step) { \
+      xnn_unary_params params; \
+      params.clamp.min = min; \
+      params.clamp.max = 255; \
+      VUnaryMicrokernelTester() \
+          .batch_size(batch_size) \
+          .Test(ukernel, init_params, params); \
+    } \
+  } \
+} \
+ \
+TEST(ukernel, clamp_max) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  const size_t batch_scale = get_batch_scale<datatype>(); \
+  const size_t batch_end = batch_tile * batch_scale; \
+  const size_t batch_step = \
+      batch_scale == 1 ? std::max<size_t>(1, batch_tile - 1) : batch_end - 1; \
+  for (size_t max = 1; max < 255; max = xnnpack::NextPrime(max)) { \
+    for (size_t batch_size = 1; batch_size <= 5 * batch_end; \
+         batch_size += batch_step) { \
+      xnn_unary_params params; \
+      params.clamp.min = 0; \
+      params.clamp.max = max; \
+      VUnaryMicrokernelTester() \
+          .batch_size(batch_size) \
+          .Test(ukernel, init_params, params); \
+    } \
+  } \
+}
 
 #include "f16-vclamp/f16-vclamp.h"
 #undef XNN_UKERNEL_WITH_PARAMS
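Outside the macro above, driving a clamp kernel through the new parameter struct follows the same pattern as the clamp_min/clamp_max tests (a sketch; `batch_size`, `ukernel`, and `init_params` are stand-ins for the macro arguments):

    xnn_unary_params params;
    params.clamp.min = 0;    // fields exactly as set in the tests above
    params.clamp.max = 200;
    VUnaryMicrokernelTester()
        .batch_size(batch_size)
        .Test(ukernel, init_params, params);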
diff --git a/test/f16-velu.cc b/test/f16-velu.cc
index d1ddb4f7ce10..21eacb6db6a1 100644
--- a/test/f16-velu.cc
+++ b/test/f16-velu.cc
@@ -24,57 +24,28 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-TEST(ukernel, prescale) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  const size_t batch_scale = get_batch_scale<datatype>(); \
-  const size_t batch_end = batch_tile * batch_scale; \
-  const size_t batch_step = std::max<size_t>(1, batch_tile - 1); \
-  for (float prescale : std::array<float, 2>({0.1f, 10.0f})) { \
-    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
-      VUnaryMicrokernelTester() \
-        .batch_size(batch_size) \
-        .prescale(prescale) \
-        .Test(ukernel, init_params); \
-    } \
-  } \
-} \
- \
-TEST(ukernel, alpha) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  const size_t batch_scale = get_batch_scale<datatype>(); \
-  const size_t batch_end = batch_tile * batch_scale; \
-  const size_t batch_step = std::max<size_t>(1, batch_tile - 1); \
-  for (float alpha : std::array<float, 2>({0.3f, 3.0f})) { \
-    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
-      VUnaryMicrokernelTester() \
-        .batch_size(batch_size) \
-        .alpha(alpha) \
-        .Test(ukernel, init_params); \
-    } \
-  } \
-} \
- \
-TEST(ukernel, beta) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  const size_t batch_scale = get_batch_scale<datatype>(); \
-  const size_t batch_end = batch_tile * batch_scale; \
-  const size_t batch_step = std::max<size_t>(1, batch_tile - 1); \
-  for (float beta : std::array<float, 2>({0.3f, 3.0f})) { \
-    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
-      VUnaryMicrokernelTester() \
-        .batch_size(batch_size) \
-        .beta(beta) \
-        .Test(ukernel, init_params); \
-    } \
-  } \
-}
+using TestInfo = ELU;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, alpha) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  const size_t batch_scale = get_batch_scale<datatype>(); \
+  const size_t batch_end = batch_tile * batch_scale; \
+  const size_t batch_step = std::max<size_t>(1, batch_tile - 1); \
+  for (float alpha : std::array<float, 2>({0.3f, 3.0f})) { \
+    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
+      xnn_unary_params params; \
+      params.elu.alpha = alpha; \
+      VUnaryMicrokernelTester() \
+          .batch_size(batch_size) \
+          .Test(ukernel, init_params, params); \
+    } \
+  } \
+}
 
 #include "f16-velu/f16-velu.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vhswish.cc b/test/f16-vhswish.cc
index f8b4a3b5433b..c2d35fed0665 100644
--- a/test/f16-vhswish.cc
+++ b/test/f16-vhswish.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params);
+using TestInfo = HardSwish;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vhswish/f16-vhswish.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vlrelu.cc b/test/f16-vlrelu.cc
index eea1518a1e62..0260b75c1910 100644
--- a/test/f16-vlrelu.cc
+++ b/test/f16-vlrelu.cc
@@ -24,27 +24,28 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-TEST(ukernel, slope) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  const size_t batch_scale = get_batch_scale<datatype>(); \
-  const size_t batch_end = batch_tile * batch_scale; \
-  const size_t batch_step = std::max<size_t>(1, batch_tile - 1); \
-  for (float slope : std::array<float, 3>({-0.7f, 0.3f, 1.3f})) { \
-    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
-      VUnaryMicrokernelTester() \
-        .batch_size(batch_size) \
-        .slope(slope) \
-        .Test(ukernel, init_params); \
-    } \
-  } \
-}
+using TestInfo = LeakyReLU;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, negative_slope) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  const size_t batch_scale = get_batch_scale<datatype>(); \
+  const size_t batch_end = batch_tile * batch_scale; \
+  const size_t batch_step = std::max<size_t>(1, batch_tile - 1); \
+  for (float negative_slope : std::array<float, 3>({0.01f, 0.3f, 1.3f})) { \
+    xnn_unary_params params; \
+    params.leaky_relu.negative_slope = negative_slope; \
+    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
+      VUnaryMicrokernelTester() \
+          .batch_size(batch_size) \
+          .Test(ukernel, init_params, params); \
+    } \
+  } \
+}
 
 #include "f16-vlrelu/f16-vlrelu.h"
 #undef XNN_UKERNEL_WITH_PARAMS
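Note the rename from `slope` to `negative_slope`, and that the swept values change from {-0.7f, 0.3f, 1.3f} to {0.01f, 0.3f, 1.3f}: the old setter-based tester exercised a negative slope, while the new sweep sticks to positive values. The per-call pattern mirrors the clamp and ELU cases (stand-in names as before):

    xnn_unary_params params;
    params.leaky_relu.negative_slope = 0.01f;  // field as set in the tests above
    VUnaryMicrokernelTester()
        .batch_size(batch_size)
        .Test(ukernel, init_params, params);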
diff --git a/test/f16-vneg.cc b/test/f16-vneg.cc
index 5cf6e86eda87..aecb281d2dce 100644
--- a/test/f16-vneg.cc
+++ b/test/f16-vneg.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg());
+using TestInfo = Negate;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vneg/f16-vneg.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vrndd.cc b/test/f16-vrndd.cc
index 8690fffb9121..5a2f73ca1ad3 100644
--- a/test/f16-vrndd.cc
+++ b/test/f16-vrndd.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params);\
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params);
+using TestInfo = RoundDown;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vrnd/f16-vrndd.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vrndne.cc b/test/f16-vrndne.cc
index d11342514cba..9426e9ec3d21 100644
--- a/test/f16-vrndne.cc
+++ b/test/f16-vrndne.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params);\
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params);
+using TestInfo = RoundToNearestEven;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vrnd/f16-vrndne.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vrndu.cc b/test/f16-vrndu.cc
index 44f229fff034..16aa270e71c8 100644
--- a/test/f16-vrndu.cc
+++ b/test/f16-vrndu.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params);\
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params);
+using TestInfo = RoundUp;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vrnd/f16-vrndu.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vrndz.cc b/test/f16-vrndz.cc
index 5cc018105775..ae0250aca019 100644
--- a/test/f16-vrndz.cc
+++ b/test/f16-vrndz.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params);\
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params);
+using TestInfo = RoundTowardsZero;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vrnd/f16-vrndz.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vrsqrt.cc b/test/f16-vrsqrt.cc
index ffe76b20cff7..4cfc63dd5865 100644
--- a/test/f16-vrsqrt.cc
+++ b/test/f16-vrsqrt.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params);
+using TestInfo = ReciprocalSquareRoot;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vrsqrt/f16-vrsqrt.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vsigmoid.cc b/test/f16-vsigmoid.cc
index 661f486e0333..b411ece8fcf7 100644
--- a/test/f16-vsigmoid.cc
+++ b/test/f16-vsigmoid.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params);
+using TestInfo = Sigmoid;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vsigmoid/f16-vsigmoid.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vsqr.cc b/test/f16-vsqr.cc
index 97dedacf9450..28d2714c9ece 100644
--- a/test/f16-vsqr.cc
+++ b/test/f16-vsqr.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr());
+using TestInfo = Square;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vsqr/f16-vsqr.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vsqrt.cc b/test/f16-vsqrt.cc
index 66d396ad7fe8..f051e690fcb6 100644
--- a/test/f16-vsqrt.cc
+++ b/test/f16-vsqrt.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params);
+using TestInfo = SquareRoot;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vsqrt/f16-vsqrt.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f16-vtanh.cc b/test/f16-vtanh.cc
index 924acb8d415a..6dae854a1852 100644
--- a/test/f16-vtanh.cc
+++ b/test/f16-vtanh.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params);
+using TestInfo = TanH;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace<TestInfo, datatype>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f16-vtanh/f16-vtanh.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-f16-vcvt.cc b/test/f32-f16-vcvt.cc
index 6ca121fd4527..bcb7cf28161c 100644
--- a/test/f32-f16-vcvt.cc
+++ b/test/f32-f16-vcvt.cc
@@ -2,21 +2,17 @@
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
-//
-// Auto-generated file. Do not edit!
-//   Microkernel: f32-f16-vcvt
-//   Generator: tools/generate-vcvt-test.py
 
 #include "xnnpack/microparams-init.h"
 #include "xnnpack/vcvt.h"
-#include "vcvt-microkernel-tester.h"
+#include "vunary-microkernel-tester.h"
 
-#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
-                                    datatype_in, datatype_out, params_type, init_params) \
-XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\
-XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);
+#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
+                                    datatype_in, datatype_out, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_lt) { TestBatchLT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f32-f16-vcvt/f32-f16-vcvt.h"
 #undef XNN_CVT_UKERNEL_WITH_PARAMS
diff --git a/test/f32-qs8-vcvt.cc b/test/f32-qs8-vcvt.cc
index ca31e0fa0f46..73e4a566e7aa 100644
--- a/test/f32-qs8-vcvt.cc
+++ b/test/f32-qs8-vcvt.cc
@@ -6,21 +6,17 @@
 
 #include "xnnpack/microparams-init.h"
 #include "xnnpack/vcvt.h"
-#include "vcvt-microkernel-tester.h"
+#include "vunary-microkernel-tester.h"
 
-#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
-                                    datatype_in, datatype_out, params_type, init_params) \
-XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
- \
-XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
- \
- \
-XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\
- \
-XNN_TEST_CVT_SATURATION(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);
+#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
+                                    datatype_in, datatype_out, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_lt) { TestBatchLT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_scale) { TestOutputScale<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_zero_point) { TestOutputZeroPoint<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_saturation) { TestOutputSaturation<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_overflow) { TestOutputOverflow<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f32-qs8-vcvt/f32-qs8-vcvt.h"
 #undef XNN_CVT_UKERNEL_WITH_PARAMS
diff --git a/test/f32-qu8-vcvt.cc b/test/f32-qu8-vcvt.cc
index 0189a95982b7..799894aa806a 100644
--- a/test/f32-qu8-vcvt.cc
+++ b/test/f32-qu8-vcvt.cc
@@ -6,21 +6,17 @@
 
 #include "xnnpack/microparams-init.h"
 #include "xnnpack/vcvt.h"
-#include "vcvt-microkernel-tester.h"
+#include "vunary-microkernel-tester.h"
 
-#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
-                                    datatype_in, datatype_out, params_type, init_params) \
-XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
- \
-XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
- \
- \
-XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\
- \
-XNN_TEST_CVT_SATURATION(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \
-XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);
+#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \
+                                    datatype_in, datatype_out, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_lt) { TestBatchLT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_scale) { TestOutputScale<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_zero_point) { TestOutputZeroPoint<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_saturation) { TestOutputSaturation<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, output_overflow) { TestOutputOverflow<datatype_in, datatype_out>(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f32-qu8-vcvt/f32-qu8-vcvt.h"
 #undef XNN_CVT_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vabs.cc b/test/f32-vabs.cc
index 11b95f45cc75..1fdf1307e839 100644
--- a/test/f32-vabs.cc
+++ b/test/f32-vabs.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs()); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Abs());
+using TestInfo = Abs;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq<TestInfo, datatype>(arch_flags, batch_tile,
ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f32-vabs/f32-vabs.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vclamp.cc b/test/f32-vclamp.cc
index ed0f97a218d5..076755232b10 100644
--- a/test/f32-vclamp.cc
+++ b/test/f32-vclamp.cc
@@ -24,15 +24,50 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params);
+using TestInfo = Clamp;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, clamp_min) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  const size_t batch_scale = get_batch_scale<datatype>(); \
+  const size_t batch_end = batch_tile * batch_scale; \
+  const size_t batch_step = \
+      batch_scale == 1 ? std::max(1, batch_tile - 1) : batch_end - 1; \
+  for (size_t min = 1; min < 255; min = xnnpack::NextPrime(min)) { \
+    for (size_t batch_size = 1; batch_size <= 5 * batch_end; \
+         batch_size += batch_step) { \
+      xnn_unary_params params; \
+      params.clamp.min = min; \
+      params.clamp.max = 255; \
+      VUnaryMicrokernelTester() \
+          .batch_size(batch_size) \
+          .Test(ukernel, init_params, params); \
+    } \
+  } \
+} \
+ \
+TEST(ukernel, clamp_max) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  const size_t batch_scale = get_batch_scale<datatype>(); \
+  const size_t batch_end = batch_tile * batch_scale; \
+  const size_t batch_step = \
+      batch_scale == 1 ? std::max(1, batch_tile - 1) : batch_end - 1; \
+  for (size_t max = 1; max < 255; max = xnnpack::NextPrime(max)) { \
+    for (size_t batch_size = 1; batch_size <= 5 * batch_end; \
+         batch_size += batch_step) { \
+      xnn_unary_params params; \
+      params.clamp.min = 0; \
+      params.clamp.max = max; \
+      VUnaryMicrokernelTester() \
+          .batch_size(batch_size) \
+          .Test(ukernel, init_params, params); \
+    } \
+  } \
+}
 
 #include "f32-vclamp/f32-vclamp.h"
 #undef XNN_UKERNEL_WITH_PARAMS
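The clamp hunk above is the template for the rest of this patch: operator-specific knobs that used to live in dedicated tester setters (and in the removed XNN_TEST_UNARY_QMIN/QMAX macros) now travel in a single xnn_unary_params union handed to Test(). A minimal sketch of the same call outside the macro machinery; the kernel and init-function names here are illustrative assumptions, not names taken from this patch:

// Hedged sketch: drives one clamp microkernel through the new parameterized
// tester interface. The ukernel/init symbols below are assumed.
TEST(f32_vclamp_example, parameterized_clamp) {
  xnn_unary_params params;
  params.clamp.min = 0;  // same fields the clamp_min/clamp_max tests set above
  params.clamp.max = 6;
  VUnaryMicrokernelTester()
      .batch_size(16)
      .Test(xnn_f32_vclamp_ukernel__scalar_u1,  // assumed kernel name
            xnn_init_f32_clamp_params,          // assumed init function
            params);
}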
diff --git a/test/f32-velu.cc b/test/f32-velu.cc
index 4db508d03de3..937bd6fd2a1b 100644
--- a/test/f32-velu.cc
+++ b/test/f32-velu.cc
@@ -24,57 +24,28 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-TEST(ukernel, prescale) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  const size_t batch_scale = get_batch_scale<datatype>(); \
-  const size_t batch_end = batch_tile * batch_scale; \
-  const size_t batch_step = std::max(1, batch_tile - 1); \
-  for (float prescale : std::array<float, 2>({0.1f, 10.0f})) { \
-    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
-      VUnaryMicrokernelTester() \
-          .batch_size(batch_size) \
-          .prescale(prescale) \
-          .Test(ukernel, init_params); \
-    } \
-  } \
-} \
- \
-TEST(ukernel, alpha) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  const size_t batch_scale = get_batch_scale<datatype>(); \
-  const size_t batch_end = batch_tile * batch_scale; \
-  const size_t batch_step = std::max(1, batch_tile - 1); \
-  for (float alpha : std::array<float, 2>({0.3f, 3.0f})) { \
-    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
-      VUnaryMicrokernelTester() \
-          .batch_size(batch_size) \
-          .alpha(alpha) \
-          .Test(ukernel, init_params); \
-    } \
-  } \
-} \
- \
-TEST(ukernel, beta) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  const size_t batch_scale = get_batch_scale<datatype>(); \
-  const size_t batch_end = batch_tile * batch_scale; \
-  const size_t batch_step = std::max(1, batch_tile - 1); \
-  for (float beta : std::array<float, 2>({0.3f, 3.0f})) { \
-    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
-      VUnaryMicrokernelTester() \
-          .batch_size(batch_size) \
-          .beta(beta) \
-          .Test(ukernel, init_params); \
-    } \
-  } \
+using TestInfo = ELU;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, alpha) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  const size_t batch_scale = get_batch_scale<datatype>(); \
+  const size_t batch_end = batch_tile * batch_scale; \
+  const size_t batch_step = std::max(1, batch_tile - 1); \
+  for (float alpha : std::array<float, 2>({0.3f, 3.0f})) { \
+    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
+      xnn_unary_params params; \
+      params.elu.alpha = alpha; \
+      VUnaryMicrokernelTester() \
+          .batch_size(batch_size) \
+          .Test(ukernel, init_params, params); \
+    } \
+  } \
 }
 
 #include "f32-velu/f32-velu.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vexp.cc b/test/f32-vexp.cc
index 577a50951a17..c195428b9e0d 100644
--- a/test/f32-vexp.cc
+++ b/test/f32-vexp.cc
@@ -24,44 +24,20 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Exp()); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Exp()); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Exp()); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Exp()); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Exp()); \
-TEST(ukernel, special_values) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  constexpr size_t num_elements = 3; \
-  constexpr size_t buffered_size = \
-      num_elements + XNN_EXTRA_BYTES / sizeof(float); \
-  std::array<float, buffered_size> inputs = \
-      {0.0f, -1e3f, 1e3f}; \
-  std::array<float, buffered_size> expected = \
-      {1.0f, 0.0f, INFINITY}; \
-  std::array<float, buffered_size> outputs; \
-  struct xnn_f32_default_params params; \
-  if (init_params) { \
-    init_params(&params); \
-  } \
-  ukernel( \
-      num_elements * sizeof(float), inputs.data(), outputs.data(), &params); \
-  for (int i = 0; i < num_elements; i++) { \
-    if (std::isfinite(expected[i])) { \
-      EXPECT_NEAR( \
-          expected[i], outputs[i], \
-          1 * std::abs(expected[i]) * std::numeric_limits<float>::epsilon()) \
-          << "for input " << inputs[i]; \
-    } else { \
-      EXPECT_EQ(std::fpclassify(expected[i]), std::fpclassify(outputs[i])) \
-          << "for input " << inputs[i] << " and output " << outputs[i] \
-          << " (FP_INFINITE=" << FP_INFINITE << ", FP_NAN=" << FP_NAN \
-          << ", FP_NORMAL=" << FP_NORMAL << ", FP_SUBNORMAL=" << FP_SUBNORMAL \
-          << ", FP_ZERO=" << FP_ZERO << ")"; \
-    } \
-  } \
+using TestInfo = Exp;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, special_values) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  VUnaryMicrokernelTester().Test(ukernel, init_params, \
+                                 /*inputs=*/{0.0f, -1e3f, 1e3f}, \
+                                 /*outputs=*/{1.0f, 0.0f, INFINITY}, \
+                                 /*tolerance_ulp=*/1); \
 }
 
 #include "f32-vexp/f32-vexp.h"
 #undef XNN_UKERNEL_WITH_PARAMS
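The vexp hunk above (and the vgelu, vlog, vsqrt, and vtanh hunks that follow) replaces each file's hand-rolled special-values loop with a single Test(ukernel, init_params, inputs, outputs, tolerance_ulp) call. That overload lives in vunary-microkernel-tester.h, which this part of the patch does not show; judging from the loops it deletes, its comparison step presumably looks like the following sketch: a relative tolerance scaled by ULPs for finite values, and FP-class matching for infinities and NaNs.

// Sketch reconstructed from the deleted per-file loops; not the actual
// tester implementation.
#include <cmath>
#include <limits>
#include <vector>
#include <gtest/gtest.h>

void CheckSpecialValues(const std::vector<float>& inputs,
                        const std::vector<float>& expected,
                        const std::vector<float>& outputs, int tolerance_ulp) {
  for (size_t i = 0; i < inputs.size(); i++) {
    if (std::isfinite(expected[i])) {
      // tolerance_ulp generalizes the literal 1 (exp/gelu/log/sqrt) and
      // 3 (tanh) factors from the deleted EXPECT_NEAR calls.
      EXPECT_NEAR(expected[i], outputs[i],
                  tolerance_ulp * std::abs(expected[i]) *
                      std::numeric_limits<float>::epsilon())
          << "for input " << inputs[i];
    } else {
      // INFINITY and NAN expectations are matched by floating-point class.
      EXPECT_EQ(std::fpclassify(expected[i]), std::fpclassify(outputs[i]))
          << "for input " << inputs[i] << " and output " << outputs[i];
    }
  }
}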
diff --git a/test/f32-vgelu.cc b/test/f32-vgelu.cc
index 720a986c13fd..0bc9ccb5b6eb 100644
--- a/test/f32-vgelu.cc
+++ b/test/f32-vgelu.cc
@@ -24,44 +24,20 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Gelu()); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Gelu()); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Gelu()); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Gelu()); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Gelu()); \
-TEST(ukernel, special_values) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  constexpr size_t num_elements = 3; \
-  constexpr size_t buffered_size = \
-      num_elements + XNN_EXTRA_BYTES / sizeof(float); \
-  std::array<float, buffered_size> inputs = \
-      {-6.0f, 6.0f, 0.0f}; \
-  std::array<float, buffered_size> expected = \
-      {0.0f, 6.0f, 0.0f}; \
-  std::array<float, buffered_size> outputs; \
-  struct xnn_f32_default_params params; \
-  if (init_params) { \
-    init_params(&params); \
-  } \
-  ukernel( \
-      num_elements * sizeof(float), inputs.data(), outputs.data(), &params); \
-  for (int i = 0; i < num_elements; i++) { \
-    if (std::isfinite(expected[i])) { \
-      EXPECT_NEAR( \
-          expected[i], outputs[i], \
-          1 * std::abs(expected[i]) * std::numeric_limits<float>::epsilon()) \
-          << "for input " << inputs[i]; \
-    } else { \
-      EXPECT_EQ(std::fpclassify(expected[i]), std::fpclassify(outputs[i])) \
-          << "for input " << inputs[i] << " and output " << outputs[i] \
-          << " (FP_INFINITE=" << FP_INFINITE << ", FP_NAN=" << FP_NAN \
-          << ", FP_NORMAL=" << FP_NORMAL << ", FP_SUBNORMAL=" << FP_SUBNORMAL \
-          << ", FP_ZERO=" << FP_ZERO << ")"; \
-    } \
-  } \
+using TestInfo = GELU;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, special_values) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  VUnaryMicrokernelTester().Test(ukernel, init_params, \
+                                 /*inputs=*/{-6.0f, 6.0f, 0.0f}, \
+                                 /*outputs=*/{0.0f, 6.0f, 0.0f}, \
+                                 /*tolerance_ulp=*/1); \
 }
 
 #include "f32-vgelu/f32-vgelu.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vhswish.cc b/test/f32-vhswish.cc
index 813441dc1976..c82ed7beeab3 100644
--- a/test/f32-vhswish.cc
+++ b/test/f32-vhswish.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params);
+using TestInfo = HardSwish;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f32-vhswish/f32-vhswish.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vlog.cc b/test/f32-vlog.cc
index 917c8d36b2ce..3961b4231fdc 100644
--- a/test/f32-vlog.cc
+++ b/test/f32-vlog.cc
@@ -24,44 +24,20 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Log()); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Log()); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Log()); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Log()); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Log()); \
-TEST(ukernel, special_values) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  constexpr size_t num_elements = 4; \
-  constexpr size_t buffered_size = \
-      num_elements + XNN_EXTRA_BYTES / sizeof(float); \
-  std::array<float, buffered_size> inputs = \
-      {1.0f, -1.0f, 0.0f, -0.0f}; \
-  std::array<float, buffered_size> expected = \
-      {0.0f, NAN, -INFINITY, -INFINITY}; \
-  std::array<float, buffered_size> outputs; \
-  struct xnn_f32_default_params params; \
-  if (init_params) { \
-    init_params(&params); \
-  } \
-  ukernel( \
-      num_elements * sizeof(float), inputs.data(), outputs.data(), &params); \
-  for (int i = 0; i < num_elements; i++) { \
-    if (std::isfinite(expected[i])) { \
-      EXPECT_NEAR( \
-          expected[i], outputs[i], \
-          1 * std::abs(expected[i]) * std::numeric_limits<float>::epsilon()) \
-          << "for input " << inputs[i]; \
-    } else { \
-      EXPECT_EQ(std::fpclassify(expected[i]), std::fpclassify(outputs[i])) \
-          << "for input " << inputs[i] << " and output " << outputs[i] \
-          << " (FP_INFINITE=" << FP_INFINITE << ", FP_NAN=" << FP_NAN \
-          << ", FP_NORMAL=" << FP_NORMAL << ", FP_SUBNORMAL=" << FP_SUBNORMAL \
-          << ", FP_ZERO=" << FP_ZERO << ")"; \
-    } \
-  } \
+using TestInfo = Log;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, special_values) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  VUnaryMicrokernelTester().Test(ukernel, init_params, \
+                                 /*inputs=*/{1.0f, -1.0f, 0.0f, -0.0f}, \
+                                 /*outputs=*/{0.0f, NAN, -INFINITY, -INFINITY}, \
+                                 /*tolerance_ulp=*/1); \
 }
 
 #include "f32-vlog/f32-vlog.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vlrelu.cc b/test/f32-vlrelu.cc
index d7478e85dab1..02d3c9233783 100644
--- a/test/f32-vlrelu.cc
+++ b/test/f32-vlrelu.cc
@@ -24,27 +24,28 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-TEST(ukernel, slope) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  const size_t batch_scale = get_batch_scale<datatype>(); \
-  const size_t batch_end = batch_tile * batch_scale; \
-  const size_t batch_step = std::max(1, batch_tile - 1); \
-  for (float slope : std::array<float, 3>({-0.7f, 0.3f, 1.3f})) { \
-    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
-      VUnaryMicrokernelTester() \
-          .batch_size(batch_size) \
-          .slope(slope) \
-          .Test(ukernel, init_params); \
-    } \
-  } \
+using TestInfo = LeakyReLU;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, negative_slope) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  const size_t batch_scale = get_batch_scale<datatype>(); \
+  const size_t batch_end = batch_tile * batch_scale; \
+  const size_t batch_step = std::max(1, batch_tile - 1); \
+  for (float negative_slope : std::array<float, 3>({0.01f, 0.3f, 1.3f})) { \
+    xnn_unary_params params; \
+    params.leaky_relu.negative_slope = negative_slope; \
+    for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \
+      VUnaryMicrokernelTester() \
+          .batch_size(batch_size) \
+          .Test(ukernel, init_params, params); \
+    } \
+  } \
 }
 
 #include "f32-vlrelu/f32-vlrelu.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vneg.cc b/test/f32-vneg.cc
index bbbe8e0660af..e1102d049003 100644
--- a/test/f32-vneg.cc
+++ b/test/f32-vneg.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg()); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Neg());
+using TestInfo = Negate;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) {
TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } #include "f32-vneg/f32-vneg.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrelu.cc b/test/f32-vrelu.cc index 0d6d7c4e3386..f3f7abfa7971 100644 --- a/test/f32-vrelu.cc +++ b/test/f32-vrelu.cc @@ -24,13 +24,13 @@ #include "next_prime.h" #include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); +using TestInfo = ReLU; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } #include "f32-vrelu/f32-vrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndd.cc b/test/f32-vrndd.cc index cc824ea91d9f..9fd849ed8e76 100644 --- a/test/f32-vrndd.cc +++ b/test/f32-vrndd.cc @@ -24,13 +24,13 @@ #include "next_prime.h" #include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params);\ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundDown, init_params); +using TestInfo = RoundDown; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } #include "f32-vrnd/f32-vrndd.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndne.cc b/test/f32-vrndne.cc index c1106131b6bf..d80436efb623 
100644 --- a/test/f32-vrndne.cc +++ b/test/f32-vrndne.cc @@ -24,13 +24,13 @@ #include "next_prime.h" #include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params);\ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundToNearestEven, init_params); +using TestInfo = RoundToNearestEven; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } #include "f32-vrnd/f32-vrndne.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndu.cc b/test/f32-vrndu.cc index 247b2cb1cedf..78a1c0f60ffe 100644 --- a/test/f32-vrndu.cc +++ b/test/f32-vrndu.cc @@ -24,13 +24,13 @@ #include "next_prime.h" #include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params);\ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundUp, init_params); +using TestInfo = RoundUp; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } #include "f32-vrnd/f32-vrndu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrndz.cc b/test/f32-vrndz.cc index 0808e0481730..db25937c4b81 100644 --- a/test/f32-vrndz.cc +++ b/test/f32-vrndz.cc @@ -24,13 +24,13 @@ #include "next_prime.h" #include 
"vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params);\ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, VUnaryMicrokernelTester::OpType::RoundTowardsZero, init_params); +using TestInfo = RoundTowardsZero; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } #include "f32-vrnd/f32-vrndz.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vrsqrt.cc b/test/f32-vrsqrt.cc index 06b1eaccd642..b2e7260c4ffd 100644 --- a/test/f32-vrsqrt.cc +++ b/test/f32-vrsqrt.cc @@ -24,13 +24,13 @@ #include "next_prime.h" #include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); +using TestInfo = ReciprocalSquareRoot; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } #include "f32-vrsqrt/f32-vrsqrt.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/f32-vsigmoid.cc b/test/f32-vsigmoid.cc index e60273dc896b..149d5ba83749 100644 --- a/test/f32-vsigmoid.cc +++ b/test/f32-vsigmoid.cc @@ -24,13 +24,13 @@ #include "next_prime.h" #include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, 
ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params);
+using TestInfo = Sigmoid;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f32-vsigmoid/f32-vsigmoid.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vsqr.cc b/test/f32-vsqr.cc
index a2f4987c3bc3..1016a850ac07 100644
--- a/test/f32-vsqr.cc
+++ b/test/f32-vsqr.cc
@@ -24,13 +24,13 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr()); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params, Sqr());
+using TestInfo = Square;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); }
 
 #include "f32-vsqr/f32-vsqr.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vsqrt.cc b/test/f32-vsqrt.cc
index ef629a2987fd..5fcdbf628eea 100644
--- a/test/f32-vsqrt.cc
+++ b/test/f32-vsqrt.cc
@@ -24,44 +24,20 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-TEST(ukernel, special_values) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  constexpr size_t num_elements = 4; \
-  constexpr size_t buffered_size = \
-      num_elements + XNN_EXTRA_BYTES / sizeof(float); \
-  std::array<float, buffered_size> inputs = \
-      {0.0f, -0.0f, 1.0f, -1.0f}; \
-  std::array<float, buffered_size> expected = \
-      {0.0f, -0.0f, 1.0f, NAN}; \
-  std::array<float, buffered_size> outputs; \
-  struct xnn_f32_sqrt_params params; \
-  if (init_params) { \
-    init_params(&params); \
-  } \
-  ukernel( \
-      num_elements * sizeof(float), inputs.data(), outputs.data(), &params); \
-  for (int i = 0; i < num_elements; i++) { \
-    if (std::isfinite(expected[i])) { \
-      EXPECT_NEAR( \
-          expected[i], outputs[i], \
-          1 * std::abs(expected[i]) * std::numeric_limits<float>::epsilon()) \
-          << "for input " << inputs[i]; \
-    } else { \
-      EXPECT_EQ(std::fpclassify(expected[i]), std::fpclassify(outputs[i])) \
-          << "for input " << inputs[i] << " and output " << outputs[i] \
-          << " (FP_INFINITE=" << FP_INFINITE << ", FP_NAN=" << FP_NAN \
-          << ", FP_NORMAL=" << FP_NORMAL << ", FP_SUBNORMAL=" << FP_SUBNORMAL \
-          << ", FP_ZERO=" << FP_ZERO << ")"; \
-    } \
-  } \
+using TestInfo = SquareRoot;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, special_values) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  VUnaryMicrokernelTester().Test(ukernel, init_params, \
+                                 /*inputs=*/{0.0f, -0.0f, 1.0f, -1.0f}, \
+                                 /*outputs=*/{0.0f, -0.0f, 1.0f, NAN}, \
+                                 /*tolerance_ulp=*/1); \
 }
 
 #include "f32-vsqrt/f32-vsqrt.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/f32-vtanh.cc b/test/f32-vtanh.cc
index 5d2c6723393d..439a5f481850 100644
--- a/test/f32-vtanh.cc
+++ b/test/f32-vtanh.cc
@@ -24,44 +24,20 @@
 #include "next_prime.h"
 #include "vunary-microkernel-tester.h"
 
-#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\
- \
-XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
- \
-XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \
-TEST(ukernel, special_values) { \
-  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
-  constexpr size_t num_elements = 7; \
-  constexpr size_t buffered_size = \
-      num_elements + XNN_EXTRA_BYTES / sizeof(float); \
-  std::array<float, buffered_size> inputs = \
-      {0.0f, -0.0f, 10.0f, -10.0f, INFINITY, -INFINITY, NAN}; \
-  std::array<float, buffered_size> expected = \
-      {0.0f, -0.0f, 1.0f, -1.0f, 1.0f, -1.0f, NAN}; \
-  std::array<float, buffered_size> outputs; \
-  union xnn_f32_tanh_params params; \
-  if (init_params) { \
-    init_params(&params); \
-  } \
-  ukernel( \
-      num_elements * sizeof(float), inputs.data(), outputs.data(), &params); \
-  for (int i = 0; i < num_elements; i++) { \
-    if (std::isfinite(expected[i])) { \
-      EXPECT_NEAR( \
-          expected[i], outputs[i], \
-          3 * std::abs(expected[i]) * std::numeric_limits<float>::epsilon()) \
-          << "for input " << inputs[i]; \
-    } else { \
-      EXPECT_EQ(std::fpclassify(expected[i]), std::fpclassify(outputs[i])) \
-          << "for input " << inputs[i] << " and output " << outputs[i] \
-          << " (FP_INFINITE=" << FP_INFINITE << ", FP_NAN=" << FP_NAN \
-          << ", FP_NORMAL=" << FP_NORMAL << ", FP_SUBNORMAL=" << FP_SUBNORMAL \
-          << ", FP_ZERO=" << FP_ZERO << ")"; \
-    } \
-  } \
+using TestInfo = TanH;
+
+#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \
+  TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\
+  TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \
+  TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \
+TEST(ukernel, special_values) { \
+  TEST_REQUIRES_ARCH_FLAGS(arch_flags); \
+  VUnaryMicrokernelTester().Test(ukernel, init_params, \
+      /*inputs=*/{0.0f, -0.0f, 10.0f, -10.0f, INFINITY, -INFINITY, NAN}, \
+      /*outputs=*/{0.0f, -0.0f, 1.0f, -1.0f, 1.0f, -1.0f, NAN}, \
+      /*tolerance_ulp=*/3); \
 }
 
 #include "f32-vtanh/f32-vtanh.h"
 #undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/floor-nc.cc b/test/floor-nc.cc
deleted file mode 100644
index 642745832434..000000000000
--- a/test/floor-nc.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-
-#include <cmath>
-
-#include "unary-operator-tester.h"
-
-namespace xnnpack {
-
-class FloorOperatorTester : public UnaryOperatorTester {
- public:
-  FloorOperatorTester() : UnaryOperatorTester() {
-    range_f32_ = {0.0f, 5.0f};
-    range_f16_ = {0.0f, 5.0f};
-  }
-
- protected:
-  // Computes the expected result for some input `x`. Subclasses should override
-  // this function with their own reference function.
-  float RefFunc(float x) const override { return std::floor(x); }
-
-  CREATE_OP_OVERRIDES_F32(floor);
-  CREATE_OP_OVERRIDES_F16(floor);
-};
-
-CREATE_UNARY_FLOAT_TESTS(F32, FloorOperatorTester);
-CREATE_UNARY_FLOAT_TESTS(RunF32, FloorOperatorTester);
-#ifndef XNN_EXCLUDE_F16_TESTS
-CREATE_UNARY_FLOAT_TESTS(F16, FloorOperatorTester);
-#endif  // XNN_EXCLUDE_F16_TESTS
-
-};  // namespace xnnpack
diff --git a/test/floor.cc b/test/floor.cc
deleted file mode 100644
index bea533cabaf6..000000000000
--- a/test/floor.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-// Copyright 2022 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <random>
-
-#include <gtest/gtest.h>
-#include "xnnpack.h"
-#include "xnnpack/math.h"
-#include "xnnpack/node-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/subgraph.h"
-#include "subgraph-unary-tester.h"
-
-using FloorTestF16 = UnaryTest<xnn_float16>;
-using FloorTestF32 = UnaryTest<float>;
-
-TEST_F(FloorTestF16, define)
-{
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_floor(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_floor);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp16);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(FloorTestF32, define)
-{
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_floor(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_floor);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp32);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(FloorTestF16, matches_operator_api)
-{
-  std::uniform_real_distribution<float> f32dist(0.0f, 5.00f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_floor_nc_f16(/*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_floor_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_floor_nc_f16(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_floor(subgraph, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(FloorTestF32, matches_operator_api)
-{
-  std::uniform_real_distribution<float> f32dist(0.0f, 5.00f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_floor_nc_f32(/*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_floor_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_floor_nc_f32(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_floor(subgraph, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
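The deleted floor.cc above is representative of every per-operator subgraph test this patch removes (abs, bankers-rounding, ceiling, ..., tanh): the node is now defined through the generic unary entry point, exactly as the fully-connected hunks below do for convert. The floor definition would presumably migrate as sketched here; xnn_unary_floor is an assumed member of the new operator-type enum, since only xnn_unary_convert is visible in this section of the patch:

// Before (API removed by this patch):
//   ASSERT_EQ(xnn_status_success,
//             xnn_define_floor(subgraph, input_id, output_id, /*flags=*/0));
// After (xnn_unary_floor assumed; floor takes no parameters, so params may be
// nullptr, matching the convert calls below):
ASSERT_EQ(xnn_status_success,
          xnn_define_unary(subgraph, xnn_unary_floor, /*params=*/nullptr,
                           input_id, output_id, /*flags=*/0));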
diff --git a/test/fully-connected.cc b/test/fully-connected.cc
index f5948f2c89fd..6a4d831d6834 100644
--- a/test/fully-connected.cc
+++ b/test/fully-connected.cc
@@ -464,7 +464,7 @@ TEST_F(FullyConnectedTestQP8F32QC4W, matches_operator_api) {
   ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
 
   ASSERT_EQ(xnn_status_success,
-            xnn_define_convert(subgraph, input_id, dq_quantized_id,
+            xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id,
                                /*flags=*/XNN_FLAG_MAYBE_PACK_FOR_GEMM));
   ASSERT_EQ(xnn_status_success,
             xnn_define_fully_connected(subgraph, output_min, output_max,
@@ -630,7 +630,7 @@ TEST_F(FullyConnectedTestQP8F32QC4W, matches_operator_api_with_reshape) {
   xnn_runtime_t runtime = nullptr;
 
   ASSERT_EQ(xnn_status_success,
-            xnn_define_convert(subgraph, input_id, dq_quantized_id,
+            xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id,
                                /*flags=*/XNN_FLAG_MAYBE_PACK_FOR_GEMM));
   ASSERT_EQ(xnn_status_success,
             xnn_define_fully_connected(
@@ -812,7 +812,7 @@ TEST_F(FullyConnectedTestQP8F32QC4W, matches_operator_api_transposed_weights) {
   xnn_runtime_t runtime = nullptr;
 
   ASSERT_EQ(xnn_status_success,
-            xnn_define_convert(subgraph, input_id, dq_quantized_id,
+            xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id,
                                /*flags=*/XNN_FLAG_MAYBE_PACK_FOR_GEMM));
   ASSERT_EQ(xnn_status_success,
             xnn_define_fully_connected(subgraph, output_min, output_max,
@@ -2926,7 +2926,7 @@ TEST_F(FullyConnectedTestQD8F16QC4W, internally_allocated_dynamic_quantization_p
   ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
 
   xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0));
+  ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0));
   ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, bias_id, output_id, /*flags=*/0));
   ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr,
/*flags=*/0, &runtime)); @@ -3108,7 +3108,7 @@ TEST_F(FullyConnectedTestQD8F16QB4W, internally_allocated_dynamic_quantization_p ASSERT_NE(output_id, XNN_INVALID_NODE_ID); xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0)); + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, bias_id, output_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); @@ -3266,7 +3266,7 @@ TEST_F(FullyConnectedTestQD8F16QC8W, internally_allocated_dynamic_quantization_p ASSERT_NE(output_id, XNN_INVALID_NODE_ID); xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0)); + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, bias_id, output_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); @@ -3427,7 +3427,7 @@ TEST_F(FullyConnectedTestQD8F32QC8W, internally_allocated_dynamic_quantization_p ASSERT_NE(output_id, XNN_INVALID_NODE_ID); xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0)); + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, bias_id, output_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); @@ -3599,7 +3599,7 @@ TEST_F(FullyConnectedTestQD8F32QC4W, internally_allocated_dynamic_quantization_p ASSERT_NE(output_id, XNN_INVALID_NODE_ID); xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0)); + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, bias_id, output_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); @@ -3716,7 +3716,7 @@ TEST_F(FullyConnectedTestQD8F32QC4W, internally_allocated_dynamic_quantization_p ASSERT_NE(output_id, XNN_INVALID_NODE_ID); xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0)); + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, XNN_INVALID_NODE_ID, output_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); @@ -3868,7 +3868,7 @@ TEST_F(FullyConnectedTestQD8F32QC4W, internally_allocated_dynamic_quantization_p 
ASSERT_NE(output_id, XNN_INVALID_NODE_ID); xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0)); + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, bias_id, output_id, XNN_FLAG_TRANSPOSE_WEIGHTS)); ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); @@ -4050,7 +4050,7 @@ TEST_F(FullyConnectedTestQD8F32QB4W, internally_allocated_dynamic_quantization_p ASSERT_NE(output_id, XNN_INVALID_NODE_ID); xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0)); + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, bias_id, output_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); @@ -4343,7 +4343,7 @@ TEST_F(FullyConnectedTestQP8F32QB4W, matches_operator_api) ASSERT_NE(output_id, XNN_INVALID_NODE_ID); xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM)); + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM)); ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, bias_id, output_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); diff --git a/test/gelu-nc.cc b/test/gelu-nc.cc deleted file mode 100644 index ff8f3580fe0b..000000000000 --- a/test/gelu-nc.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class GELUOperatorTester : public UnaryOperatorTester { - public: - GELUOperatorTester() : UnaryOperatorTester() { range_f32_ = {-20.0f, 20.0f}; } - - protected: - float AbsTolF32(float y_ref) const override { - return std::max(std::abs(y_ref) * 5.0e-6f, 5.0e-6f); - }; - - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { - return x * 0.5f * (1.0f + std::erf(x / std::sqrt(2.0f))); - } - - CREATE_OP_OVERRIDES_F32(gelu); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, GELUOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, GELUOperatorTester); - -}; // namespace xnnpack diff --git a/test/gelu.cc b/test/gelu.cc deleted file mode 100644 index 54351506f2d9..000000000000 --- a/test/gelu.cc +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using GeluTestF32 = UnaryTest; - -TEST_F(GeluTestF32, define) { - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, - /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph( - subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ(xnn_status_success, - xnn_define_tensor_value(subgraph, xnn_datatype_fp32, dims.size(), - dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, - &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ(xnn_status_success, - xnn_define_tensor_value(subgraph, xnn_datatype_fp32, dims.size(), - dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, - &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, - xnn_define_gelu(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_gelu); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(GeluTestF32, matches_operator_api) { - std::uniform_real_distribution f32dist(-255.0f, 255.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_gelu_nc_f32(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op( - op, xnn_delete_operator); - - ASSERT_EQ( - xnn_status_success, - xnn_reshape_gelu_nc_f32(op, batch_size, channels, channels, channels, - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, - xnn_setup_gelu_nc_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, - /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph( - subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ(xnn_status_success, - xnn_define_tensor_value(subgraph, xnn_datatype_fp32, dims.size(), - dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, - &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ(xnn_status_success, - xnn_define_tensor_value(subgraph, xnn_datatype_fp32, dims.size(), - dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, - &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, - xnn_define_gelu(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ( - xnn_status_success, - xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime( - runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, - xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, - xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/hardswish-nc.cc b/test/hardswish-nc.cc deleted file mode 100644 index a578d0dfea4f..000000000000 --- a/test/hardswish-nc.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class HardSwishOperatorTester : public UnaryOperatorTester { - public: - HardSwishOperatorTester() : UnaryOperatorTester() { - range_f32_ = {-1.0f, 1.0f}; - range_f16_ = {-1.0f, 1.0f}; - } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { - return x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f; - ; - } - - // Computes the absolute tolerance for a reference value `y_ref`. Tests will - // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. Note that for `fp16` - // tests, both `y` and `y_ref` will be converted to `float` for the tolerance - // evaluation. - float AbsTolF32(float y_ref) const override { - return std::max(1.0e-7f, std::abs(y_ref) * 1.0e-6f); - }; - float AbsTolF16(float y_ref) const override { - return std::max(1.0e-3f, std::abs(y_ref) * 1.0e-2f); - }; - - CREATE_OP_OVERRIDES_F32(hardswish); - CREATE_OP_OVERRIDES_F16(hardswish); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, HardSwishOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, HardSwishOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, HardSwishOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -}; // namespace xnnpack diff --git a/test/hardswish.cc b/test/hardswish.cc deleted file mode 100644 index d026b2d7fa82..000000000000 --- a/test/hardswish.cc +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include // For std::generate. -#include // For std::array. 
-#include -#include // For size_t. -#include -#include // For std::unique_ptr. -#include // For std::uniform_real_distribution. -#include // For std::vector. - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "replicable_random_device.h" -#include "subgraph-unary-tester.h" - -using HardSwishTestF16 = UnaryTest; -using HardSwishTestF32 = UnaryTest; - -TEST_F(HardSwishTestF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(0, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - const std::array dims = {{1, 3, 5}}; - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_hardswish(subgraph, input_id, output_id, 0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_hardswish); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(HardSwishTestF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(0, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - const std::array dims = {{1, 3, 5}}; - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, - xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_hardswish(subgraph, input_id, output_id, 0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_hardswish); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(HardSwishTestF16, matches_operator_api) -{ - xnnpack::Buffer input(num_output_elements + XNN_EXTRA_BYTES / sizeof(xnn_float16), std::nanf("")); - std::uniform_real_distribution f32dist(-4.0f, 4.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - xnnpack::Buffer subgraph_output(num_output_elements, std::nanf("")); - - ASSERT_EQ(xnn_status_success, 
xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_hardswish_nc_f16(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - xnnpack::Buffer operator_output(num_output_elements, std::nanf("")); - ASSERT_EQ(xnn_status_success, xnn_reshape_hardswish_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_hardswish_nc_f16(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - ASSERT_NE(nullptr, subgraph); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0, - XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1, - XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - ASSERT_EQ(xnn_status_success, xnn_define_hardswish(subgraph, input_id, output_id, /*flags=*/0)); - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, - xnn_external_value{output_id, subgraph_output.data()} - }; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - // Check outputs match. - for (size_t i = 0; i < num_output_elements; i++) { - ASSERT_EQ(subgraph_output[i], operator_output[i]); - } -} - -TEST_F(HardSwishTestF32, matches_operator_api) -{ - xnnpack::Buffer input(num_output_elements + XNN_EXTRA_BYTES / sizeof(float), std::nanf("")); - std::uniform_real_distribution f32dist(-4.0f, 4.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - xnnpack::Buffer subgraph_output(num_output_elements, std::nanf("")); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - xnn_status status = xnn_create_hardswish_nc_f32(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - xnnpack::Buffer operator_output(num_output_elements, std::nanf("")); - ASSERT_EQ(xnn_status_success, xnn_reshape_hardswish_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_hardswish_nc_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - ASSERT_NE(nullptr, subgraph); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - uint32_t input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - uint32_t output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - ASSERT_EQ(xnn_status_success, xnn_define_hardswish(subgraph, input_id, output_id, /*flags=*/0)); - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, - xnn_external_value{output_id, subgraph_output.data()} - }; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - // Check outputs match. - for (size_t i = 0; i < num_output_elements; i++) { - ASSERT_EQ(subgraph_output[i], operator_output[i]); - } -} diff --git a/test/leaky-relu-nc.cc b/test/leaky-relu-nc.cc deleted file mode 100644 index c1468975b5cd..000000000000 --- a/test/leaky-relu-nc.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "unary-operator-tester.h" -#include "pthreadpool.h" - -namespace xnnpack { - -class LeakyReLUOperatorTester : public UnaryOperatorTester { - public: - LeakyReLUOperatorTester() : UnaryOperatorTester() { - range_f32_ = {-20.0f, 20.0f}; - range_f16_ = {-25.0f, 25.0f}; - input_scale(1.25f); - input_zero_point(41); - output_scale(0.75f); - output_zero_point(53); - } - - LeakyReLUOperatorTester& negative_slope(float negative_slope) { - assert(std::isnormal(negative_slope)); - this->negative_slope_ = negative_slope; - return *this; - } - - inline float negative_slope() const { return this->negative_slope_; } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { - return std::signbit(x) ? x * negative_slope() : x; - } - - // Computes the absolute tolerance for a reference value `y_ref`. Tests will - // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. Note that for `fp16` - // tests, both `y` and `y_ref` will be converted to `float` for the tolerance - // evaluation. 
- float AbsTolF32(float) const override { return 5e-6f; } - float AbsTolF16(float y_ref) const override { - return std::max(1.0e-4f, std::abs(y_ref) * 5.0e-3f); - } - float AbsTolQS8(float) const override { return 0.9f; }; - float AbsTolQU8(float) const override { return 0.9f; }; - - xnn_status CreateOpF32(uint32_t flags, - xnn_operator_t* op_out) const override { - return xnn_create_leaky_relu_nc_f32(negative_slope(), flags, op_out); - } - CREATE_OP_RESHAPE_OVERRIDE_F32(leaky_relu); - CREATE_OP_SETUP_OVERRIDE_F32(leaky_relu); - xnn_status RunOpF32(size_t channels, size_t input_stride, - size_t output_stride, size_t batch_size, - const float* input, float* output, uint32_t flags, - pthreadpool_t threadpool) const override { - return xnn_run_leaky_relu_nc_f32(channels, input_stride, output_stride, - batch_size, input, output, - negative_slope(), flags, threadpool); - } - - xnn_status CreateOpF16(uint32_t flags, - xnn_operator_t* op_out) const override { - return xnn_create_leaky_relu_nc_f16(negative_slope(), flags, op_out); - } - CREATE_OP_RESHAPE_OVERRIDE_F16(leaky_relu); - CREATE_OP_SETUP_OVERRIDE_F16(leaky_relu); - - xnn_status CreateOpQS8(int8_t input_zero_point, float input_scale, - int8_t output_zero_point, float output_scale, - int8_t output_min, int8_t output_max, uint32_t flags, - xnn_operator_t* op_out) const override { - return xnn_create_leaky_relu_nc_qs8(negative_slope(), input_zero_point, - input_scale, output_zero_point, - output_scale, flags, op_out); - } - CREATE_OP_RESHAPE_OVERRIDE_QS8(leaky_relu); - CREATE_OP_SETUP_OVERRIDE_QS8(leaky_relu); - - xnn_status CreateOpQU8(uint8_t input_zero_point, float input_scale, - uint8_t output_zero_point, float output_scale, - uint8_t output_min, uint8_t output_max, uint32_t flags, - xnn_operator_t* op_out) const override { - return xnn_create_leaky_relu_nc_qu8(negative_slope(), input_zero_point, - input_scale, output_zero_point, - output_scale, flags, op_out); - } - CREATE_OP_RESHAPE_OVERRIDE_QU8(leaky_relu); - CREATE_OP_SETUP_OVERRIDE_QU8(leaky_relu); - - private: - float negative_slope_ = 0.3f; -}; - -CREATE_UNARY_FLOAT_TESTS(F32, LeakyReLUOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, LeakyReLUOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, LeakyReLUOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -CREATE_UNARY_QUANTIZED_TESTS_NO_QMIN(QS8, LeakyReLUOperatorTester); -CREATE_UNARY_QUANTIZED_TESTS_NO_QMIN(QU8, LeakyReLUOperatorTester); - -#ifndef XNN_EXCLUDE_F16_TESTS -TEST(LEAKY_RELU_NC_F16, small_batch_with_negative_slope) { - for (size_t batch_size = 1; batch_size <= 3; batch_size += 2) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (float negative_slope : {-10.0f, -1.0f, -0.1f, 0.1f, 10.0f}) { - LeakyReLUOperatorTester() - .negative_slope(negative_slope) - .batch_size(3) - .channels(channels) - .iterations(1) - .TestF16(); - } - } - } -} -#endif // XNN_EXCLUDE_F16_TESTS - -TEST(LEAKY_RELU_NC_F32, small_batch_with_negative_slope) { - for (size_t batch_size = 1; batch_size <= 3; batch_size += 2) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (float negative_slope : {-10.0f, -1.0f, -0.1f, 0.1f, 10.0f}) { - LeakyReLUOperatorTester() - .negative_slope(negative_slope) - .batch_size(3) - .channels(channels) - .iterations(1) - .TestF32(); - } - } - } -} - -TEST(LEAKY_RELU_NC_QS8, unit_batch_with_negative_slope) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (float negative_slope : {-10.0f, -1.0f, -0.1f, 0.1f, 10.0f}) { - 
LeakyReLUOperatorTester() - .negative_slope(negative_slope) - .batch_size(1) - .channels(channels) - .iterations(1) - .TestQS8(); - } - } -} - -TEST(LEAKY_RELU_NC_QS8, unit_batch_with_output_scale) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (float output_scale = 1.0e-2f; output_scale < 1.0e+2f; - output_scale *= 3.14159265f) { - LeakyReLUOperatorTester() - .output_scale(output_scale) - .batch_size(1) - .channels(channels) - .iterations(1) - .TestQS8(); - } - } -} - -TEST(LEAKY_RELU_NC_QS8, unit_batch_with_output_zero_point) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (int16_t output_zero_point = 0; output_zero_point <= 255; - output_zero_point += 51) { - LeakyReLUOperatorTester() - .output_zero_point(output_zero_point) - .batch_size(1) - .channels(channels) - .iterations(1) - .TestQS8(); - } - } -} - -TEST(LEAKY_RELU_NC_QU8, unit_batch_with_negative_slope) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (float negative_slope : {-10.0f, -1.0f, -0.1f, 0.1f, 10.0f}) { - LeakyReLUOperatorTester() - .negative_slope(negative_slope) - .batch_size(1) - .channels(channels) - .iterations(1) - .TestQU8(); - } - } -} - -TEST(LEAKY_RELU_NC_QU8, unit_batch_with_output_scale) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (float output_scale = 1.0e-2f; output_scale < 1.0e+2f; - output_scale *= 3.14159265f) { - LeakyReLUOperatorTester() - .output_scale(output_scale) - .batch_size(1) - .channels(channels) - .iterations(1) - .TestQU8(); - } - } -} - -TEST(LEAKY_RELU_NC_QU8, unit_batch_with_output_zero_point) { - for (size_t channels = 1; channels < 100; channels += 15) { - for (int16_t output_zero_point = 0; output_zero_point <= 255; - output_zero_point += 51) { - LeakyReLUOperatorTester() - .output_zero_point(output_zero_point) - .batch_size(1) - .channels(channels) - .iterations(1) - .TestQU8(); - } - } -} - -}; // namespace xnnpack diff --git a/test/leaky-relu.cc b/test/leaky-relu.cc deleted file mode 100644 index 9ebdc0de3e93..000000000000 --- a/test/leaky-relu.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using LeakyReLUTestF16 = UnaryTest; -using LeakyReLUTestF32 = UnaryTest; -using LeakyReLUTestQS8 = UnaryTest; -using LeakyReLUTestQU8 = UnaryTest; - -TEST_F(LeakyReLUTestF16, define) -{ - const float negative_slope = std::uniform_real_distribution(0.5f, 1.2f)(rng); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_leaky_relu(subgraph, negative_slope, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_leaky_relu); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->params.leaky_relu.negative_slope, negative_slope); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(LeakyReLUTestF32, define) -{ - const float negative_slope = std::uniform_real_distribution(0.5f, 1.2f)(rng); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_leaky_relu(subgraph, negative_slope, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_leaky_relu); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->params.leaky_relu.negative_slope, negative_slope); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(LeakyReLUTestQS8, define) -{ - const float negative_slope = std::uniform_real_distribution(0.5f, 1.2f)(rng); - - const int32_t 
input_zero_point = i8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = i8dist(rng); - const float output_scale = scale_dist(rng); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_leaky_relu(subgraph, negative_slope, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_leaky_relu); - ASSERT_EQ(node->compute_type, xnn_compute_type_qs8); - ASSERT_EQ(node->params.leaky_relu.negative_slope, negative_slope); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(LeakyReLUTestQU8, define) -{ - const int32_t input_zero_point = u8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = u8dist(rng); - const float output_scale = scale_dist(rng); - const float negative_slope = std::uniform_real_distribution(0.5f, 1.2f)(rng); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_leaky_relu(subgraph, negative_slope, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_leaky_relu); - ASSERT_EQ(node->compute_type, xnn_compute_type_qu8); - ASSERT_EQ(node->params.leaky_relu.negative_slope, negative_slope); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(LeakyReLUTestF16, matches_operator_api) -{ - const float negative_slope = std::uniform_real_distribution(0.1f, 10.0f)(rng); - 
std::uniform_real_distribution f32dist(-255.0f, 255.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_leaky_relu_nc_f16(negative_slope, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_leaky_relu_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_leaky_relu_nc_f16(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_leaky_relu(subgraph, negative_slope, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(LeakyReLUTestF32, matches_operator_api) -{ - const float negative_slope = std::uniform_real_distribution(0.1f, 10.0f)(rng); - std::uniform_real_distribution f32dist(-255.0f, 255.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_leaky_relu_nc_f32(negative_slope, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_leaky_relu_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_leaky_relu_nc_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_leaky_relu(subgraph, negative_slope, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(LeakyReLUTestQS8, matches_operator_api) -{ - const float negative_slope = std::uniform_real_distribution(0.5f, 1.0f)(rng); - - const int32_t input_zero_point = i8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = i8dist(rng); - const float output_scale = scale_dist(rng); - - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_leaky_relu_nc_qs8( - negative_slope, - input_zero_point, input_scale, output_zero_point, output_scale, - /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_leaky_relu_nc_qs8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_leaky_relu_nc_qs8(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_leaky_relu(subgraph, negative_slope, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(LeakyReLUTestQU8, matches_operator_api) -{ - const float negative_slope = std::uniform_real_distribution(0.5f, 1.0f)(rng); - - const int32_t input_zero_point = u8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = u8dist(rng); - const float output_scale = scale_dist(rng); - - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_leaky_relu_nc_qu8( - negative_slope, - input_zero_point, input_scale, output_zero_point, output_scale, - /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_leaky_relu_nc_qu8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_leaky_relu_nc_qu8(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_leaky_relu(subgraph, negative_slope, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/log-nc.cc b/test/log-nc.cc deleted file mode 100644 index b0616e799867..000000000000 --- a/test/log-nc.cc +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class LogOperatorTester : public UnaryOperatorTester { - public: - LogOperatorTester() : UnaryOperatorTester() { - range_f32_ = {0.f, 10.0f}; - range_f16_ = {0.f, 10.0f}; - } - - protected: - float AbsTolF32(float y_ref) const override { - return std::max( - 2 * std::numeric_limits::epsilon(), - std::abs(y_ref) * 6 * std::numeric_limits::epsilon()); - }; - - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { return std::log(x); } - - CREATE_STANDARD_OP_OVERRIDES_F32(log); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, LogOperatorTester); - -}; // namespace xnnpack diff --git a/test/log.cc b/test/log.cc deleted file mode 100644 index f2b77745c805..000000000000 --- a/test/log.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using LogTestF32 = UnaryTest; - -TEST_F(LogTestF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_log(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_log); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(LogTestF32, matches_operator_api) -{ - std::uniform_real_distribution f32dist(0.f, 255.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_log_nc_f32(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_log_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_log_nc_f32(op, input.data(), operator_output.data())); - - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_log(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/negate-nc.cc b/test/negate-nc.cc deleted file mode 100644 index a7d7d768e5c0..000000000000 --- a/test/negate-nc.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class NegateOperatorTester : public UnaryOperatorTester { - public: - NegateOperatorTester() : UnaryOperatorTester() { - range_f32_ = {-1.0f, 1.0f}; - range_f16_ = {-1.0f, 1.0f}; - } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { return -x; } - - CREATE_OP_OVERRIDES_F32(negate); - CREATE_OP_OVERRIDES_F16(negate); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, NegateOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, NegateOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, NegateOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -}; // namespace xnnpack diff --git a/test/negate.cc b/test/negate.cc deleted file mode 100644 index f31ff9ac25e3..000000000000 --- a/test/negate.cc +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <random>
-
-#include <gtest/gtest.h>
-#include "xnnpack.h"
-#include "xnnpack/math.h"
-#include "xnnpack/node-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/subgraph.h"
-#include "subgraph-unary-tester.h"
-
-using NegateTestF16 = UnaryTest<xnn_float16>;
-using NegateTestF32 = UnaryTest<float>;
-
-TEST_F(NegateTestF16, define)
-{
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_negate(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_negate);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp16);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(NegateTestF32, define)
-{
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_negate(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_negate);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp32);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(NegateTestF16, matches_operator_api)
-{
-  std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_negate_nc_f16(/*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_negate_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_negate_nc_f16(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_negate(subgraph, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(NegateTestF32, matches_operator_api)
-{
-  std::uniform_real_distribution<float> f32dist(-1.0f, 1.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_negate_nc_f32(/*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_negate_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_negate_nc_f32(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_negate(subgraph, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
diff --git a/test/qs16-qs8-vcvt.cc b/test/qs16-qs8-vcvt.cc
index f947e0c9628e..ec4e87d9c32d 100644
--- a/test/qs16-qs8-vcvt.cc
+++ b/test/qs16-qs8-vcvt.cc
@@ -2,26 +2,20 @@
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
-//
-// Auto-generated file. Do not edit!
-// Microkernel: qs16-qs8-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" #include "xnnpack/vcvt.h" -#include "vcvt-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params) \ -XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ - \ -XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype_in, datatype_out, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, output_scale) { TestOutputScale(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, output_zero_point) { TestOutputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } +// TODO: Should there be testing of input scale/zero point? Or rename qs16 -> s16 in this kernel? #include "qs16-qs8-vcvt/qs16-qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-f16-vcvt.cc b/test/qs8-f16-vcvt.cc index f410d3027bb1..7d4a0354e4e6 100644 --- a/test/qs8-f16-vcvt.cc +++ b/test/qs8-f16-vcvt.cc @@ -2,25 +2,19 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: qs8-f16-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" #include "xnnpack/vcvt.h" -#include "vcvt-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params) \ -XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype_in, datatype_out, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "qs8-f16-vcvt/qs8-f16-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-f32-vcvt.cc b/test/qs8-f32-vcvt.cc index 9415e7df8a10..bf4de77051af 100644 --- a/test/qs8-f32-vcvt.cc +++ b/test/qs8-f32-vcvt.cc @@ -2,25 +2,19 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: qs8-f32-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" #include "xnnpack/vcvt.h" -#include "vcvt-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params) \ -XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype_in, datatype_out, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "qs8-f32-vcvt/qs8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vcvt.cc b/test/qs8-vcvt.cc index 4716a82d19c7..ce17e5f94803 100644 --- a/test/qs8-vcvt.cc +++ b/test/qs8-vcvt.cc @@ -2,25 +2,21 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
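Each of these convert test files is now a thin shim: the per-kernel list lives in the corresponding generated header, and the local XNN_CVT_UKERNEL_WITH_PARAMS macro decides which test cases every kernel gets. As a hedged sketch of the expansion, assuming one illustrative entry from qs8-f32-vcvt/qs8-f32-vcvt.h (the real entries and parameter types are defined there, not here), the first of the six generated cases would look like:

    // Hypothetical header entry:
    //   XNN_CVT_UKERNEL_WITH_PARAMS(0, xnn_qs8_f32_vcvt_ukernel__scalar_u1, 1, 1,
    //                               int8_t, float, struct xnn_qs8_f32_cvt_params,
    //                               xnn_init_qs8_f32_cvt_scalar_params)
    // The macro turns it into batch_eq/batch_div/batch_lt/batch_gt/input_scale/
    // input_zero_point tests; the batch_eq case expands to roughly:
    TEST(xnn_qs8_f32_vcvt_ukernel__scalar_u1, batch_eq) {
      TestBatchEq(/*arch_flags=*/0, /*batch_tile=*/1,
                  xnn_qs8_f32_vcvt_ukernel__scalar_u1,
                  xnn_init_qs8_f32_cvt_scalar_params);
    }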
-// Microkernel: qs8-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" #include "xnnpack/vcvt.h" -#include "vcvt-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params) \ -XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype_in, datatype_out, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, output_scale) { TestOutputScale(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, output_zero_point) { TestOutputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "qs8-vcvt/qs8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vhswish.cc b/test/qs8-vhswish.cc index 093c95ab5d61..8f25b0ea1f75 100644 --- a/test/qs8-vhswish.cc +++ b/test/qs8-vhswish.cc @@ -22,62 +22,19 @@ #include "xnnpack/microparams.h" #include "xnnpack/vunary.h" #include "next_prime.h" -#include "vhswish-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -TEST(ukernel, input_scale) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ - for (float input_scale : {4.0f, 16.0f, 64.0f}) { \ - VHSwishMicrokernelTester() \ - .batch_size(batch_size) \ - .input_scale(input_scale) \ - .Test(ukernel, init_params); \ - } \ - } \ -} \ - \ -TEST(ukernel, output_scale) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ - for (float output_scale : {4.0f, 16.0f, 64.0f}) { \ - VHSwishMicrokernelTester() \ - .batch_size(batch_size) \ - .output_scale(output_scale) \ - 
.Test(ukernel, init_params); \ - } \ - } \ -} \ - \ -TEST(ukernel, input_zero_point) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { \ - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ - VHSwishMicrokernelTester() \ - .batch_size(batch_size) \ - .input_zero_point(input_zero_point) \ - .Test(ukernel, init_params); \ - } \ - } \ -} \ - \ -TEST(ukernel, output_zero_point) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { \ - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ - VHSwishMicrokernelTester() \ - .batch_size(batch_size) \ - .output_zero_point(output_zero_point) \ - .Test(ukernel, init_params); \ - } \ - } \ -} +using TestInfo = HardSwish; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, output_scale) { TestOutputScale(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, output_zero_point) { TestOutputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "qs8-vhswish/qs8-vhswish.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qs8-vlrelu.cc b/test/qs8-vlrelu.cc index 650b91404c04..39843cf28150 100644 --- a/test/qs8-vlrelu.cc +++ b/test/qs8-vlrelu.cc @@ -22,38 +22,34 @@ #include "xnnpack/microparams.h" #include "xnnpack/vunary.h" #include "next_prime.h" -#include "vlrelu-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -TEST(ukernel, positive_scale) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (size_t batch_size = 1; batch_size <= batch_tile * 5; batch_size += std::max(1, batch_tile - 1)) { \ - for (float positive_scale : {1.0f / 256.0f, 0.3f, 1.3f, 128.0f}) { \ - VLReLUMicrokernelTester() \ - .batch_size(batch_size) \ - .positive_scale(positive_scale) \ - .Test(ukernel, init_params); \ - } \ - } \ -} \ - \ -TEST(ukernel, negative_scale) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (size_t batch_size = 1; batch_size <= batch_tile * 5; batch_size += std::max(1, batch_tile - 1)) { \ - for (float negative_scale : {-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1 / 256.0f, 0.3f, 1.3f, 128.0f}) {\ - VLReLUMicrokernelTester() \ - 
.batch_size(batch_size) \ - .negative_scale(negative_scale) \ - .Test(ukernel, init_params); \ - } \ - } \ -} +using TestInfo = LeakyReLU; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, negative_slope) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_end = batch_tile * batch_scale; \ + const size_t batch_step = std::max(1, batch_tile - 1); \ + for (float negative_slope : std::array({0.01f, 0.3f, 1.3f})) { \ + xnn_unary_params params; \ + params.leaky_relu.negative_slope = negative_slope; \ + for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \ + VUnaryMicrokernelTester() \ + .batch_size(batch_size) \ + .Test(ukernel, init_params, params); \ + } \ + } \ +} \ +TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, output_scale) { TestOutputScale(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, output_zero_point) { TestOutputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "qs8-vlrelu/qs8-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-f32-vcvt.cc b/test/qu8-f32-vcvt.cc index 909563c8e756..fbc441eb58f5 100644 --- a/test/qu8-f32-vcvt.cc +++ b/test/qu8-f32-vcvt.cc @@ -2,25 +2,19 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
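The negative_slope case above is the one leaky-relu test that still sweeps an operator parameter by hand, and it funnels into the same VUnaryMicrokernelTester entry point as the shared Test* helpers. A minimal standalone sketch of a single probe, with an illustrative kernel/init pair (the real pairs come from the XNN_UKERNEL_WITH_PARAMS entries in qs8-vlrelu/qs8-vlrelu.h):

    // Probe one negative_slope value at one batch size (names illustrative):
    xnn_unary_params params;
    params.leaky_relu.negative_slope = 0.3f;  // the parameter under test
    VUnaryMicrokernelTester()
        .batch_size(37)  // deliberately not a multiple of the batch tile
        .Test(xnn_qs8_vlrelu_ukernel__scalar_select_u4,
              xnn_init_qs8_lrelu_scalar_params, params);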
-// Microkernel: qu8-f32-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" #include "xnnpack/vcvt.h" -#include "vcvt-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params) \ -XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype_in, datatype_out, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "qu8-f32-vcvt/qu8-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vcvt.cc b/test/qu8-vcvt.cc index b0e061a70b48..cb974d7930cf 100644 --- a/test/qu8-vcvt.cc +++ b/test/qu8-vcvt.cc @@ -2,25 +2,21 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Microkernel: qu8-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" #include "xnnpack/vcvt.h" -#include "vcvt-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params) \ -XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ - \ -XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype_in, datatype_out, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, output_scale) { TestOutputScale(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, output_zero_point) { TestOutputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "qu8-vcvt/qu8-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vhswish.cc b/test/qu8-vhswish.cc index e6e76ed7dccc..f408237650ae 100644 --- a/test/qu8-vhswish.cc +++ b/test/qu8-vhswish.cc @@ -22,68 +22,19 @@ #include "xnnpack/microparams.h" #include "xnnpack/vunary.h" #include "next_prime.h" -#include "vhswish-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -TEST(ukernel, input_scale) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ - for (float input_scale : {4.0f, 16.0f, 64.0f}) { \ - VHSwishMicrokernelTester() \ - .batch_size(batch_size) \ - .input_scale(input_scale) \ - .input_zero_point(150) \ - .output_zero_point(100) \ - .Test(ukernel, init_params); \ - } \ - } \ -} \ - \ -TEST(ukernel, output_scale) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ - for (float output_scale : {4.0f, 16.0f, 64.0f}) { \ - VHSwishMicrokernelTester() \ - 
.batch_size(batch_size) \ - .output_scale(output_scale) \ - .input_zero_point(150) \ - .output_zero_point(100) \ - .Test(ukernel, init_params); \ - } \ - } \ -} \ - \ -TEST(ukernel, input_zero_point) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { \ - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ - VHSwishMicrokernelTester() \ - .batch_size(batch_size) \ - .input_zero_point(input_zero_point) \ - .output_zero_point(100) \ - .Test(ukernel, init_params); \ - } \ - } \ -} \ - \ -TEST(ukernel, output_zero_point) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { \ - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { \ - VHSwishMicrokernelTester() \ - .batch_size(batch_size) \ - .input_zero_point(150) \ - .output_zero_point(output_zero_point) \ - .Test(ukernel, init_params); \ - } \ - } \ -} +using TestInfo = HardSwish; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, output_scale) { TestOutputScale(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, output_zero_point) { TestOutputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "qu8-vhswish/qu8-vhswish.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/qu8-vlrelu.cc b/test/qu8-vlrelu.cc index 39b9947ff4c5..50ef80a1f29d 100644 --- a/test/qu8-vlrelu.cc +++ b/test/qu8-vlrelu.cc @@ -22,38 +22,34 @@ #include "xnnpack/microparams.h" #include "xnnpack/vunary.h" #include "next_prime.h" -#include "vlrelu-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -TEST(ukernel, positive_scale) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (size_t batch_size = 1; batch_size <= batch_tile * 5; batch_size += std::max(1, batch_tile - 1)) { \ - for (float positive_scale : {1.0f / 256.0f, 0.3f, 1.3f, 128.0f}) { \ - VLReLUMicrokernelTester() \ - .batch_size(batch_size) \ - .positive_scale(positive_scale) \ - .Test(ukernel, init_params); \ - } \ - } \ -} \ - \ -TEST(ukernel, negative_scale) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - for (size_t batch_size = 1; batch_size <= batch_tile * 5; batch_size += 
std::max(1, batch_tile - 1)) { \ - for (float negative_scale : {-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1 / 256.0f, 0.3f, 1.3f, 128.0f}) {\ - VLReLUMicrokernelTester() \ - .batch_size(batch_size) \ - .negative_scale(negative_scale) \ - .Test(ukernel, init_params); \ - } \ - } \ -} +using TestInfo = LeakyReLU; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, negative_slope) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_end = batch_tile * batch_scale; \ + const size_t batch_step = std::max(1, batch_tile - 1); \ + for (float negative_slope : std::array({0.01f, 0.3f, 1.3f})) { \ + xnn_unary_params params; \ + params.leaky_relu.negative_slope = negative_slope; \ + for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { \ + VUnaryMicrokernelTester() \ + .batch_size(batch_size) \ + .Test(ukernel, init_params, params); \ + } \ + } \ +} \ +TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, output_scale) { TestOutputScale(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, output_zero_point) { TestOutputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "qu8-vlrelu/qu8-vlrelu.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/reciprocal-square-root-nc.cc b/test/reciprocal-square-root-nc.cc deleted file mode 100644 index ebbcfad67119..000000000000 --- a/test/reciprocal-square-root-nc.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include -#include -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class ReciprocalSquareRootOperatorTester : public UnaryOperatorTester { - public: - ReciprocalSquareRootOperatorTester() : UnaryOperatorTester() { - range_f32_ = {0.001f, 5.0f}; - range_f16_ = {0.001f, 5.0f}; - } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { return 1.0f / std::sqrt(x); } - - // Computes the absolute tolerance for a reference value `y_ref`. Tests will - // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. Note that for `fp16` - // tests, both `y` and `y_ref` will be converted to `float` for the tolerance - // evaluation. 
-  float AbsTolF32(float y_ref) const override {
-    return y_ref * std::numeric_limits<float>::epsilon() * 2;
-  }
-  float AbsTolF16(float y_ref) const override {
-    return std::abs(y_ref) * 5.0e-3f;
-  }
-
-  CREATE_OP_OVERRIDES_F32(reciprocal_square_root);
-};
-
-CREATE_UNARY_FLOAT_TESTS(F32, ReciprocalSquareRootOperatorTester);
-CREATE_UNARY_FLOAT_TESTS(RunF32, ReciprocalSquareRootOperatorTester);
-
-};  // namespace xnnpack
diff --git a/test/reciprocal-square-root.cc b/test/reciprocal-square-root.cc
deleted file mode 100644
index bebfdd1c85ab..000000000000
--- a/test/reciprocal-square-root.cc
+++ /dev/null
@@ -1,239 +0,0 @@
-// Copyright 2024 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <algorithm>
-#include <array>
-#include <cstdint>
-#include <memory>
-#include <random>
-
-#include <gtest/gtest.h>
-#include "xnnpack.h"
-#include "xnnpack/math.h"
-#include "xnnpack/node-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/subgraph.h"
-#include "subgraph-unary-tester.h"
-
-using ReciprocalSquareRootTestF16 = UnaryTest<xnn_float16>;
-using ReciprocalSquareRootTestF32 = UnaryTest<float>;
-
-TEST_F(ReciprocalSquareRootTestF16, define) {
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2,
-                                                    /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
-      subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_tensor_value(subgraph, xnn_datatype_fp16, dims.size(),
-                                    dims.data(), nullptr, 0,
-                                    /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
-                                    &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_tensor_value(subgraph, xnn_datatype_fp16, dims.size(),
-                                    dims.data(), nullptr, 1,
-                                    /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
-                                    &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_reciprocal_square_root(subgraph, input_id, output_id,
-                                              /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_reciprocal_square_root);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp16);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(ReciprocalSquareRootTestF32, define) {
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2,
-                                                    /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
-      subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_tensor_value(subgraph, xnn_datatype_fp32, dims.size(),
-                                    dims.data(), nullptr, 0,
-                                    /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
-                                    &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_tensor_value(subgraph, xnn_datatype_fp32, dims.size(),
-                                    dims.data(), nullptr, 1,
-                                    /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
-                                    &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_reciprocal_square_root(subgraph, input_id, output_id,
-                                              /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_reciprocal_square_root);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp32);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(ReciprocalSquareRootTestF16, matches_operator_api) {
-  std::uniform_real_distribution<float> f32dist(0.1f, 5.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status =
-      xnn_create_reciprocal_square_root_nc_f16(/*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(
-      op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_reciprocal_square_root_nc_f16(
-                                    op, batch_size, channels, channels,
-                                    channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_reciprocal_square_root_nc_f16(
-                                    op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2,
                                                    /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
-      subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_tensor_value(subgraph, xnn_datatype_fp16, dims.size(),
-                                    dims.data(), nullptr, /*external_id=*/0,
-                                    /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
-                                    &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_tensor_value(subgraph, xnn_datatype_fp16, dims.size(),
-                                    dims.data(), nullptr, /*external_id=*/1,
-                                    /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
-                                    &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_reciprocal_square_root(subgraph, input_id, output_id,
-                                              /*flags=*/0));
-  ASSERT_EQ(
-      xnn_status_success,
-      xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
-      runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-      xnn_external_value{input_id, input.data()},
-      xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success,
-            xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(ReciprocalSquareRootTestF32, matches_operator_api) {
-  std::uniform_real_distribution<float> f32dist(0.1f, 5.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status =
-      xnn_create_reciprocal_square_root_nc_f32(/*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(
-      op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_reciprocal_square_root_nc_f32(
-                                    op, batch_size, channels, channels,
-                                    channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_reciprocal_square_root_nc_f32(
-                                    op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2,
-                                                    /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
-      subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_tensor_value(subgraph, xnn_datatype_fp32, dims.size(),
-                                    dims.data(), nullptr, /*external_id=*/0,
-                                    /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
-                                    &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_tensor_value(subgraph, xnn_datatype_fp32, dims.size(),
-                                    dims.data(), nullptr, /*external_id=*/1,
-                                    /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
-                                    &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success,
-            xnn_define_reciprocal_square_root(subgraph, input_id, output_id,
-                                              /*flags=*/0));
-  ASSERT_EQ(
-      xnn_status_success,
-      xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
-      runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-      xnn_external_value{input_id, input.data()},
-      xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success,
-            xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
diff --git a/test/s32-f32-vcvt.cc b/test/s32-f32-vcvt.cc
index 26f3916b37e3..14a75f37c4bc 100644
--- a/test/s32-f32-vcvt.cc
+++ b/test/s32-f32-vcvt.cc
@@ -2,22 +2,18 @@
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
-//
-// Auto-generated file. Do not edit!
-// Microkernel: s32-f32-vcvt -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" #include "xnnpack/vcvt.h" -#include "vcvt-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params) \ -XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ -XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype_in, datatype_out, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "s32-f32-vcvt/s32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/s8-vclamp.cc b/test/s8-vclamp.cc index ee0c42e78e3b..9d7c7ea3b107 100644 --- a/test/s8-vclamp.cc +++ b/test/s8-vclamp.cc @@ -24,15 +24,50 @@ #include "next_prime.h" #include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); +using TestInfo = Clamp; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, clamp_min) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_end = batch_tile * batch_scale; \ + const size_t batch_step = \ + batch_scale == 1 ? 
      std::max<size_t>(1, batch_tile - 1) : batch_end - 1;                    \
  for (size_t min = 1; min < 255; min = xnnpack::NextPrime(min)) {            \
    for (size_t batch_size = 1; batch_size <= 5 * batch_end;                  \
         batch_size += batch_step) {                                          \
      xnn_unary_params params;                                                \
      params.clamp.min = min;                                                 \
      params.clamp.max = 255;                                                 \
      VUnaryMicrokernelTester()                                               \
          .batch_size(batch_size)                                             \
          .Test(ukernel, init_params, params);                                \
    }                                                                         \
  }                                                                           \
}                                                                             \
                                                                              \
TEST(ukernel, clamp_max) {                                                    \
  TEST_REQUIRES_ARCH_FLAGS(arch_flags);                                       \
  const size_t batch_scale = get_batch_scale();                               \
  const size_t batch_end = batch_tile * batch_scale;                          \
  const size_t batch_step =                                                   \
      batch_scale == 1 ? std::max<size_t>(1, batch_tile - 1) : batch_end - 1; \
  for (size_t max = 1; max < 255; max = xnnpack::NextPrime(max)) {            \
    for (size_t batch_size = 1; batch_size <= 5 * batch_end;                  \
         batch_size += batch_step) {                                          \
      xnn_unary_params params;                                                \
      params.clamp.min = 0;                                                   \
      params.clamp.max = max;                                                 \
      VUnaryMicrokernelTester()                                               \
          .batch_size(batch_size)                                             \
          .Test(ukernel, init_params, params);                                \
    }                                                                         \
  }                                                                           \
}

#include "s8-vclamp/s8-vclamp.h"
#undef XNN_UKERNEL_WITH_PARAMS
diff --git a/test/sigmoid-nc.cc b/test/sigmoid-nc.cc
deleted file mode 100644
index 03a4eb2ed170..000000000000
--- a/test/sigmoid-nc.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <algorithm>
-#include <cmath>
-#include <limits>
-
-#include "unary-operator-tester.h"
-
-namespace xnnpack {
-
-class SigmoidOperatorTester : public UnaryOperatorTester {
- public:
-  SigmoidOperatorTester() : UnaryOperatorTester() {
-    range_f32_ = {-25.0f, 25.0f};
-    range_f16_ = {-25.0f, 25.0f};
-    output_scale(1.0f / 256.0f);
-    output_zero_point(0);
-  }
-
- protected:
-  // Computes the expected result for some input `x`. Subclasses should override
-  // this function with their own reference function.
-  float RefFunc(float x) const override {
-    return 1.0 / (1.0 + std::exp(static_cast<double>(-x)));
-  }
-
-  // Computes the absolute tolerance for a reference value `y_ref`. Tests will
-  // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. Note that for `fp16`
-  // tests, both `y` and `y_ref` will be converted to `float` for the tolerance
-  // evaluation.
-  float AbsTolF32(float) const override { return 5e-6f; }
-  float AbsTolF16(float y_ref) const override {
-    return std::max(1.0e-4f, std::abs(y_ref) * 5.0e-3f);
-  }
-  float AbsTolQS8(float) const override { return 0.6f; };
-  float AbsTolQU8(float) const override { return 0.6f; };
-
-  CREATE_OP_OVERRIDES_F32(sigmoid);
-  CREATE_OP_OVERRIDES_F16(sigmoid);
-  CREATE_OP_OVERRIDES_QS8(sigmoid);
-  CREATE_OP_OVERRIDES_QU8(sigmoid);
-};
-
-CREATE_UNARY_FLOAT_TESTS(F32, SigmoidOperatorTester);
-CREATE_UNARY_FLOAT_TESTS(RunF32, SigmoidOperatorTester);
-#ifndef XNN_EXCLUDE_F16_TESTS
-CREATE_UNARY_FLOAT_TESTS(F16, SigmoidOperatorTester);
-#endif  // XNN_EXCLUDE_F16_TESTS
-
-CREATE_UNARY_QUANTIZED_TESTS(QS8, SigmoidOperatorTester);
-CREATE_UNARY_QUANTIZED_TESTS(QU8, SigmoidOperatorTester);
-
-};  // namespace xnnpack
diff --git a/test/sigmoid.cc b/test/sigmoid.cc
deleted file mode 100644
index b8ffe05bc248..000000000000
--- a/test/sigmoid.cc
+++ /dev/null
@@ -1,404 +0,0 @@
-// Copyright 2022 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <algorithm>
-#include <array>
-#include <cstdint>
-#include <memory>
-#include <random>
-
-#include <gtest/gtest.h>
-#include "xnnpack.h"
-#include "xnnpack/math.h"
-#include "xnnpack/node-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/subgraph.h"
-#include "subgraph-unary-tester.h"
-
-using SigmoidTestQS8 = UnaryTest<int8_t>;
-using SigmoidTestQU8 = UnaryTest<uint8_t>;
-using SigmoidTestF16 = UnaryTest<xnn_float16>;
-using SigmoidTestF32 = UnaryTest<float>;
-
-TEST_F(SigmoidTestQS8, define)
-{
-  const int32_t input_zero_point = i8dist(rng);
-  const float input_scale = scale_dist(rng);
-  const int32_t output_zero_point = 0;
-  const float output_scale = 0x1.0p-8f;
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(),
-                          nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(),
-                          nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_sigmoid(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_sigmoid);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_qs8);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(SigmoidTestQU8, define)
-{
-  const int32_t input_zero_point = u8dist(rng);
-  const float input_scale = scale_dist(rng);
-  const int32_t output_zero_point = 0;
-  const float output_scale = 0x1.0p-8f;
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_quint8, input_zero_point, input_scale, dims.size(), dims.data(),
-                          nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_quint8, output_zero_point, output_scale, dims.size(), dims.data(),
-                          nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_sigmoid(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_sigmoid);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_qu8);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(SigmoidTestF16, define)
-{
-  std::uniform_real_distribution<float> f32dist(-255.0f, 255.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_sigmoid(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_sigmoid);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp16);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(SigmoidTestF32, define)
-{
-  std::uniform_real_distribution<float> f32dist(-255.0f, 255.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_sigmoid(subgraph, input_id, output_id, /*flags=*/0));
-
-  ASSERT_EQ(subgraph->num_nodes, 1);
-  const struct xnn_node* node = &subgraph->nodes[0];
-  ASSERT_EQ(node->type, xnn_node_type_sigmoid);
-  ASSERT_EQ(node->compute_type, xnn_compute_type_fp32);
-  ASSERT_EQ(node->num_inputs, 1);
-  ASSERT_EQ(node->inputs[0], input_id);
-  ASSERT_EQ(node->num_outputs, 1);
-  ASSERT_EQ(node->outputs[0], output_id);
-  ASSERT_EQ(node->flags, 0);
-}
-
-TEST_F(SigmoidTestQS8, matches_operator_api)
-{
-  const int32_t input_zero_point = i8dist(rng);
-  const float input_scale = scale_dist(rng);
-  const int32_t output_zero_point = INT8_MIN;
-  const float output_scale = 0x1.0p-8f;
-  std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_sigmoid_nc_qs8(
-    input_zero_point, input_scale, output_zero_point, output_scale, INT8_MIN,
-    INT8_MAX, /*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_sigmoid_nc_qs8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_sigmoid_nc_qs8(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_sigmoid(subgraph, input_id, output_id, /*flags=*/0));
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(SigmoidTestQU8, matches_operator_api)
-{
-
-  const int32_t input_zero_point = u8dist(rng);
-  const float input_scale = scale_dist(rng);
-  const int32_t output_zero_point = 0;
-  const float output_scale = 0x1.0p-8f;
-  std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_sigmoid_nc_qu8(
-    input_zero_point, input_scale, output_zero_point, output_scale, 0,
-    UINT8_MAX, /*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_sigmoid_nc_qu8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_sigmoid_nc_qu8(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_quint8, input_zero_point, input_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_quantized_tensor_value(
-                          subgraph, xnn_datatype_quint8, output_zero_point, output_scale, dims.size(), dims.data(),
-                          nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  ASSERT_EQ(xnn_status_success, xnn_define_sigmoid(subgraph, input_id, output_id, /*flags=*/0));
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(SigmoidTestF16, matches_operator_api)
-{
-  std::uniform_real_distribution<float> f32dist(-25.0f, 25.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_sigmoid_nc_f16(/*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_sigmoid_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_sigmoid_nc_f16(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_sigmoid(subgraph, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
-
-TEST_F(SigmoidTestF32, matches_operator_api)
-{
-  std::uniform_real_distribution<float> f32dist(-25.0f, 25.0f);
-  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
-
-  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-
-  // Call operator API.
-  xnn_operator_t op = nullptr;
-  const xnn_status status = xnn_create_sigmoid_nc_f32(/*flags=*/0, &op);
-  if (status == xnn_status_unsupported_hardware) {
-    GTEST_SKIP();
-  }
-
-  ASSERT_EQ(xnn_status_success, status);
-  ASSERT_NE(nullptr, op);
-  std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(op, xnn_delete_operator);
-
-  ASSERT_EQ(xnn_status_success, xnn_reshape_sigmoid_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr));
-  ASSERT_EQ(xnn_status_success, xnn_setup_sigmoid_nc_f32(op, input.data(), operator_output.data()));
-  ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-  // Call subgraph API.
-  xnn_subgraph_t subgraph = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph));
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph);
-  input_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
-
-  output_id = XNN_INVALID_NODE_ID;
-  ASSERT_EQ(
-    xnn_status_success, xnn_define_tensor_value(
-                          subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1,
-                          /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
-
-  xnn_runtime_t runtime = nullptr;
-  ASSERT_EQ(xnn_status_success, xnn_define_sigmoid(subgraph, input_id, output_id, /*flags=*/0));
-  ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime));
-  ASSERT_NE(nullptr, runtime);
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(runtime, xnn_delete_runtime);
-  std::array<xnn_external_value, 2> external = {
-    xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}};
-  ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data()));
-  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
-
-  ASSERT_EQ(subgraph_output, operator_output);
-}
diff --git a/test/square-nc.cc b/test/square-nc.cc
deleted file mode 100644
index b31262b73c37..000000000000
--- a/test/square-nc.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <algorithm>
-#include <cmath>
-#include <limits>
-
-#include "unary-operator-tester.h"
-
-namespace xnnpack {
-
-class SquareOperatorTester : public UnaryOperatorTester {
- public:
-  SquareOperatorTester() : UnaryOperatorTester() {
-    range_f32_ = {-1.0f, 1.0f};
-    range_f16_ = {-1.0f, 1.0f};
-  }
-
- protected:
-  // Computes the expected result for some input `x`. Subclasses should override
-  // this function with their own reference function.
-  float RefFunc(float x) const override { return x * x; }
-
-  // Computes the absolute tolerance for a reference value `y_ref`. Tests will
-  // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. Note that for `fp16`
-  // tests, both `y` and `y_ref` will be converted to `float` for the tolerance
-  // evaluation.
-  float AbsTolF32(float y_ref) const override { return 0.0f; }
-  float AbsTolF16(float y_ref) const override {
-    return std::max(1.0e-4f, std::abs(y_ref) * 5.0e-3f);
-  }
-
-  CREATE_OP_OVERRIDES_F32(square);
-  CREATE_OP_OVERRIDES_F16(square);
-};
-
-CREATE_UNARY_FLOAT_TESTS(F32, SquareOperatorTester);
-CREATE_UNARY_FLOAT_TESTS(RunF32, SquareOperatorTester);
-#ifndef XNN_EXCLUDE_F16_TESTS
-CREATE_UNARY_FLOAT_TESTS(F16, SquareOperatorTester);
-#endif  // XNN_EXCLUDE_F16_TESTS
-
-};  // namespace xnnpack
diff --git a/test/square-root-nc.cc b/test/square-root-nc.cc
deleted file mode 100644
index 97553f49b3ac..000000000000
--- a/test/square-root-nc.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
- - -#include -#include -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class SquareRootOperatorTester : public UnaryOperatorTester { - public: - SquareRootOperatorTester() : UnaryOperatorTester() { - range_f32_ = {0.0f, 0.5f}; - range_f16_ = {0.1f, 5.0f}; - } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { return std::sqrt(x); } - - // Computes the absolute tolerance for a reference value `y_ref`. Tests will - // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. Note that for `fp16` - // tests, both `y` and `y_ref` will be converted to `float` for the tolerance - // evaluation. - float AbsTolF32(float y_ref) const override { - return std::abs(y_ref) * 2.0f * std::numeric_limits<float>::epsilon(); - } - float AbsTolF16(float y_ref) const override { - return std::abs(y_ref) * 5.0e-3f; - } - - CREATE_OP_OVERRIDES_F32(square_root); - CREATE_OP_OVERRIDES_F16(square_root); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, SquareRootOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, SquareRootOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, SquareRootOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -}; // namespace xnnpack diff --git a/test/square-root.cc b/test/square-root.cc deleted file mode 100644 index 62ac4780330f..000000000000 --- a/test/square-root.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include <gtest/gtest.h> -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using SquareRootTestF16 = UnaryTest<xnn_float16>; -using SquareRootTestF32 = UnaryTest<float>; - -TEST_F(SquareRootTestF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_square_root(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_square_root); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(SquareRootTestF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success,
xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_square_root(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_square_root); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(SquareRootTestF16, matches_operator_api) -{ - std::uniform_real_distribution f32dist(0.1f, 5.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_square_root_nc_f16(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_square_root_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_square_root_nc_f16(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_square_root(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(SquareRootTestF32, matches_operator_api) -{ - std::uniform_real_distribution f32dist(0.1f, 5.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_square_root_nc_f32(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_square_root_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_square_root_nc_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_square_root(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/square.cc b/test/square.cc deleted file mode 100644 index da2908cb444e..000000000000 --- a/test/square.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using SquareTestF16 = UnaryTest; -using SquareTestF32 = UnaryTest; - -TEST_F(SquareTestF16, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_square(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_square); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(SquareTestF32, define) -{ - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = 
nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_square(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_square); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(SquareTestF16, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-255.0f, 255.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_square_nc_f16(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_square_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_square_nc_f16(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_square(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(SquareTestF32, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-255.0f, 255.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = - xnn_create_square_nc_f32(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_square_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_square_nc_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_square(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/subgraph-tester.h b/test/subgraph-tester.h index 9292242ce36b..0d9797994f14 100644 --- a/test/subgraph-tester.h +++ b/test/subgraph-tester.h @@ -283,8 +283,8 @@ class SubgraphTester { } SubgraphTester& AddConvert(uint32_t input_id, uint32_t output_id) { - const xnn_status status = xnn_define_convert( - subgraph_.get(), input_id, output_id, 0 /* flags */); + const xnn_status status = xnn_define_unary( + subgraph_.get(), xnn_unary_convert, /*params=*/nullptr, input_id, output_id, 0 /* flags */); EXPECT_EQ(status, xnn_status_success); return *this; } @@ -357,8 +357,11 @@ class SubgraphTester { } SubgraphTester& AddClamp(float output_min, float output_max, uint32_t input_id, uint32_t output_id) { + xnn_unary_params params; + params.clamp.min = output_min; + params.clamp.max = output_max; const xnn_status status = - xnn_define_clamp(subgraph_.get(), output_min, output_max, input_id, output_id, 0 /* flags */); + xnn_define_unary(subgraph_.get(), xnn_unary_clamp, ¶ms, input_id, output_id, 0 /* flags */); EXPECT_EQ(status, xnn_status_success); return *this; @@ -456,15 +459,17 @@ class SubgraphTester { SubgraphTester& AddHardSwish(uint32_t input_id, uint32_t output_id) { const xnn_status status = - xnn_define_hardswish(subgraph_.get(), input_id, output_id, 0 /* flags */); + xnn_define_unary(subgraph_.get(), xnn_unary_hardswish, nullptr, input_id, output_id, 0 /* flags */); EXPECT_EQ(status, xnn_status_success); return *this; } SubgraphTester& AddLeakyRelu(float negative_slope, uint32_t input_id, uint32_t output_id) { + xnn_unary_params params; + params.leaky_relu.negative_slope = negative_slope; const xnn_status status = - xnn_define_leaky_relu(subgraph_.get(), negative_slope, input_id, output_id, 0 /* flags */); + xnn_define_unary(subgraph_.get(), xnn_unary_leaky_relu, ¶ms, input_id, output_id, 0 /* flags */); EXPECT_EQ(status, xnn_status_success); return *this; @@ -498,7 +503,7 @@ class SubgraphTester { } SubgraphTester& AddPrelu(uint32_t input_id, uint32_t slope_id, uint32_t output_id) { - const xnn_status status = xnn_define_prelu(subgraph_.get(), input_id, slope_id, 
output_id, /*flags=*/0); + const xnn_status status = xnn_define_binary(subgraph_.get(), xnn_binary_prelu, nullptr, input_id, slope_id, output_id, /*flags=*/0); EXPECT_EQ(status, xnn_status_success); return *this; diff --git a/test/tanh-nc.cc b/test/tanh-nc.cc deleted file mode 100644 index 97f513dc35fc..000000000000 --- a/test/tanh-nc.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include -#include -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class TanhOperatorTester : public UnaryOperatorTester { - public: - TanhOperatorTester() : UnaryOperatorTester() { - range_f32_ = {-10.0f, 10.0f}; - range_f16_ = {-5.0f, 5.0f}; - } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { return std::tanh(x); } - - // Computes the absolute tolerance for a reference value `y_ref`. Tests will - // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. Note that for `fp16` - // tests, both `y` and `y_ref` will be converted to `float` for the tolerance - // evaluation. - float AbsTolF32(float) const override { return 5e-6f; } - float AbsTolF16(float y_ref) const override { - return std::max(1.0e-4f, std::abs(y_ref) * 5.0e-3f); - } - float AbsTolQS8(float) const override { return 0.6f; }; - float AbsTolQU8(float) const override { return 0.6f; }; - - CREATE_OP_OVERRIDES_F32(tanh); - CREATE_OP_OVERRIDES_F16(tanh); - CREATE_OP_OVERRIDES_QS8(tanh); - CREATE_OP_OVERRIDES_QU8(tanh); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, TanhOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, TanhOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, TanhOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -CREATE_UNARY_QUANTIZED_TESTS(QS8, TanhOperatorTester); -CREATE_UNARY_QUANTIZED_TESTS(QU8, TanhOperatorTester); - -}; // namespace xnnpack diff --git a/test/tanh-operator-tester.h b/test/tanh-operator-tester.h deleted file mode 100644 index ec5dc2eaf24f..000000000000 --- a/test/tanh-operator-tester.h +++ /dev/null @@ -1,381 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
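// [Editor's note] The subgraph-tester.h hunk above captures the migration
// pattern of this patch: each op-specific define call is replaced by the
// parameterized xnn_define_unary (or xnn_define_binary for prelu). A minimal
// sketch using only names visible in this patch; enum values for operators
// other than those shown are assumed to follow the same xnn_unary_* naming:
//
//   // Parameter-free operator: pass nullptr for params.
//   xnn_define_unary(subgraph, xnn_unary_hardswish, /*params=*/nullptr,
//                    input_id, output_id, /*flags=*/0);
//
//   // Parameterized operator: fill the matching member of xnn_unary_params.
//   xnn_unary_params params;
//   params.clamp.min = 0.0f;
//   params.clamp.max = 6.0f;
//   xnn_define_unary(subgraph, xnn_unary_clamp, &params,
//                    input_id, output_id, /*flags=*/0);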
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include <gtest/gtest.h> -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "replicable_random_device.h" - -class TanhOperatorTester { - public: - TanhOperatorTester& channels(size_t channels) { - assert(channels != 0); - this->channels_ = channels; - return *this; - } - - size_t channels() const { - return this->channels_; - } - - TanhOperatorTester& input_stride(size_t input_stride) { - assert(input_stride != 0); - this->input_stride_ = input_stride; - return *this; - } - - size_t input_stride() const { - if (this->input_stride_ == 0) { - return this->channels_; - } else { - assert(this->input_stride_ >= this->channels_); - return this->input_stride_; - } - } - - TanhOperatorTester& output_stride(size_t output_stride) { - assert(output_stride != 0); - this->output_stride_ = output_stride; - return *this; - } - - size_t output_stride() const { - if (this->output_stride_ == 0) { - return this->channels_; - } else { - assert(this->output_stride_ >= this->channels_); - return this->output_stride_; - } - } - - TanhOperatorTester& batch_size(size_t batch_size) { - assert(batch_size != 0); - this->batch_size_ = batch_size; - return *this; - } - - size_t batch_size() const { - return this->batch_size_; - } - - TanhOperatorTester& input_scale(float input_scale) { - assert(input_scale > 0.0f); - assert(std::isnormal(input_scale)); - this->input_scale_ = input_scale; - return *this; - } - - float input_scale() const { - return this->input_scale_; - } - - TanhOperatorTester& input_zero_point(uint8_t input_zero_point) { - this->input_zero_point_ = input_zero_point; - return *this; - } - - uint8_t input_zero_point() const { - return this->input_zero_point_; - } - - float output_scale() const { - return 1.0f / 128.0f; - } - - uint8_t output_zero_point() const { - return 128; - } - - TanhOperatorTester& qmin(uint8_t qmin) { - this->qmin_ = qmin; - return *this; - } - - uint8_t qmin() const { - return this->qmin_; - } - - TanhOperatorTester& qmax(uint8_t qmax) { - this->qmax_ = qmax; - return *this; - } - - uint8_t qmax() const { - return this->qmax_; - } - - TanhOperatorTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void TestF16() const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution<float> f32dist(-5.0f, 5.0f); - - xnnpack::Buffer<xnn_float16> input((batch_size() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer<xnn_float16> output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer<float> output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const float x = input[i * input_stride() + c]; - output_ref[i * channels() + c] = std::tanh(x); - } - } - - // Create, setup, run, and destroy Sigmoid operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t tanh_op = nullptr; - - const xnn_status status = xnn_create_tanh_nc_f16( - 0, &tanh_op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, tanh_op); - - // Smart pointer to automatically delete tanh_op.
- std::unique_ptr auto_tanh_op(tanh_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_tanh_nc_f16(tanh_op, batch_size(), - channels(), input_stride(), output_stride(), /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_tanh_nc_f16(tanh_op, input.data(), output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(tanh_op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - ASSERT_NEAR( - output[i * output_stride() + c], - output_ref[i * channels() + c], - std::max(1.0e-4f, std::abs(output_ref[i * channels() + c]) * 5.0e-3f)); - } - } - } - } - - void TestF32() const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-10.0f, 10.0f); - - xnnpack::Buffer input((batch_size() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const double x = input[i * input_stride() + c]; - output_ref[i * channels() + c] = std::tanh(x); - } - } - - // Create, setup, run, and destroy Tanh operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t tanh_op = nullptr; - - xnn_status status = xnn_create_tanh_nc_f32( - 0, &tanh_op); - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, tanh_op); - - // Smart pointer to automatically delete tanh_op. - std::unique_ptr auto_tanh_op(tanh_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_tanh_nc_f32(tanh_op, batch_size(), - channels(), input_stride(), output_stride(), /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_tanh_nc_f32(tanh_op, input.data(), output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(tanh_op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - ASSERT_NEAR( - output[i * output_stride() + c], - output_ref[i * channels() + c], - 5.0e-6); - } - } - } - } - - void TestRunF32() const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-10.0f, 10.0f); - - xnnpack::Buffer input((batch_size() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const double x = input[i * input_stride() + c]; - output_ref[i * channels() + c] = std::tanh(x); - } - } - - ASSERT_EQ(xnn_status_success, - xnn_run_tanh_nc_f32( - channels(), - input_stride(), - output_stride(), - batch_size(), - input.data(), output.data(), - 0, /*threadpool=*/nullptr)); - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - ASSERT_NEAR( - output[i * output_stride() + c], - output_ref[i * channels() + c], - 5.0e-6); - } - } - } - } - - void TestQS8() const { - xnnpack::ReplicableRandomDevice rng; - - xnnpack::Buffer input((batch_size() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(int8_t)); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const float x = input_scale() * - (int32_t(input[i * input_stride() + c]) - int32_t(input_zero_point() - 0x80)); - const float tanh_x = std::tanh(x); - const float scaled_tanh_x = tanh_x / output_scale(); - float y = scaled_tanh_x; - y = std::min(y, int32_t(qmax() - 0x80) - int32_t(output_zero_point() - 0x80)); - y = std::max(y, int32_t(qmin() - 0x80) - int32_t(output_zero_point() - 0x80)); - output_ref[i * channels() + c] = y + int32_t(output_zero_point() - 0x80); - } - } - - // Create, setup, run, and destroy Sigmoid operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t tanh_op = nullptr; - - ASSERT_EQ(xnn_status_success, - xnn_create_tanh_nc_qs8( - int8_t(input_zero_point() - 0x80), input_scale(), - int8_t(output_zero_point() - 0x80), output_scale(), - int8_t(qmin() - 0x80), int8_t(qmax() - 0x80), - 0, &tanh_op)); - ASSERT_NE(nullptr, tanh_op); - - // Smart pointer to automatically delete tanh_op. - std::unique_ptr auto_tanh_op(tanh_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_tanh_nc_qs8(tanh_op, batch_size(), - channels(), input_stride(), output_stride(), /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_tanh_nc_qs8(tanh_op, input.data(), output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(tanh_op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_NEAR(float(int32_t(output[i * output_stride() + c])), output_ref[i * channels() + c], 0.6f); - } - } - } - } - - void TestQU8() const { - xnnpack::ReplicableRandomDevice rng; - - xnnpack::Buffer input((batch_size() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint8_t)); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const float x = input_scale() * - (int32_t(input[i * input_stride() + c]) - int32_t(input_zero_point())); - const float tanh_x = std::tanh(x); - const float scaled_tanh_x = tanh_x / output_scale(); - float y = scaled_tanh_x; - y = std::min(y, int32_t(qmax()) - int32_t(output_zero_point())); - y = std::max(y, int32_t(qmin()) - int32_t(output_zero_point())); - output_ref[i * channels() + c] = y + int32_t(output_zero_point()); - } - } - - // Create, setup, run, and destroy Sigmoid operator. 
- ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t tanh_op = nullptr; - - ASSERT_EQ(xnn_status_success, - xnn_create_tanh_nc_qu8( - input_zero_point(), input_scale(), - output_zero_point(), output_scale(), - qmin(), qmax(), - 0, &tanh_op)); - ASSERT_NE(nullptr, tanh_op); - - // Smart pointer to automatically delete tanh_op. - std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_tanh_op(tanh_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_tanh_nc_qu8(tanh_op, batch_size(), - channels(), input_stride(), output_stride(), /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_tanh_nc_qu8(tanh_op, input.data(), output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(tanh_op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - EXPECT_NEAR(float(int32_t(output[i * output_stride() + c])), output_ref[i * channels() + c], 0.6f); - } - } - } - } - - private: - size_t batch_size_{1}; - size_t channels_{1}; - size_t input_stride_{0}; - size_t output_stride_{0}; - float input_scale_{0.75f}; - uint8_t input_zero_point_{121}; - uint8_t qmin_{0}; - uint8_t qmax_{255}; - size_t iterations_{15}; -}; diff --git a/test/tanh.cc b/test/tanh.cc deleted file mode 100644 index 241f6bdd1b53..000000000000 --- a/test/tanh.cc +++ /dev/null @@ -1,404 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include - -#include <gtest/gtest.h> -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/node-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/subgraph.h" -#include "subgraph-unary-tester.h" - -using TanhTestQS8 = UnaryTest<int8_t>; -using TanhTestQU8 = UnaryTest<uint8_t>; -using TanhTestF16 = UnaryTest<xnn_float16>; -using TanhTestF32 = UnaryTest<float>; - -TEST_F(TanhTestQS8, define) -{ - const int32_t input_zero_point = i8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = 0; - const float output_scale = 0x1.0p-7f; - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_tanh(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_tanh); - ASSERT_EQ(node->compute_type, xnn_compute_type_qs8); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(TanhTestQU8, define) -{ - const
int32_t input_zero_point = u8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = 128; - const float output_scale = 0x1.0p-7f; - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, 0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, 1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_tanh(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_tanh); - ASSERT_EQ(node->compute_type, xnn_compute_type_qu8); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(TanhTestF16, define) -{ - std::uniform_real_distribution f32dist(-10.0f, 10.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_tanh(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_tanh); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp16); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(TanhTestF32, define) -{ - std::uniform_real_distribution f32dist(-10.0f, 10.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, 
xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, 1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_tanh(subgraph, input_id, output_id, /*flags=*/0)); - - ASSERT_EQ(subgraph->num_nodes, 1); - const struct xnn_node* node = &subgraph->nodes[0]; - ASSERT_EQ(node->type, xnn_node_type_tanh); - ASSERT_EQ(node->compute_type, xnn_compute_type_fp32); - ASSERT_EQ(node->num_inputs, 1); - ASSERT_EQ(node->inputs[0], input_id); - ASSERT_EQ(node->num_outputs, 1); - ASSERT_EQ(node->outputs[0], output_id); - ASSERT_EQ(node->flags, 0); -} - -TEST_F(TanhTestQS8, matches_operator_api) -{ - const int32_t input_zero_point = i8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = 0; - const float output_scale = 0x1.0p-7f; - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_tanh_nc_qs8( - input_zero_point, input_scale, output_zero_point, output_scale, INT8_MIN, - INT8_MAX, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_tanh_nc_qs8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_tanh_nc_qs8(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_qint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_tanh(subgraph, input_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(TanhTestQU8, matches_operator_api) -{ - - const int32_t input_zero_point = u8dist(rng); - const float input_scale = scale_dist(rng); - const int32_t output_zero_point = 128; - const float output_scale = 0x1.0p-7f; - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_tanh_nc_qu8( - input_zero_point, input_scale, output_zero_point, output_scale, 0, - UINT8_MAX, /*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_tanh_nc_qu8(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_tanh_nc_qu8(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, input_zero_point, input_scale, dims.size(), dims.data(), - nullptr, /*external_id=*/0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_quantized_tensor_value( - subgraph, xnn_datatype_quint8, output_zero_point, output_scale, dims.size(), dims.data(), - nullptr, /*external_id=*/1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - ASSERT_EQ(xnn_status_success, xnn_define_tanh(subgraph, input_id, output_id, /*flags=*/0)); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(TanhTestF16, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-10.0f, 10.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_tanh_nc_f16(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_tanh_nc_f16(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_tanh_nc_f16(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp16, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_tanh(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} - -TEST_F(TanhTestF32, matches_operator_api) -{ - std::uniform_real_distribution f32dist(-10.0f, 10.0f); - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - // Call operator API. - xnn_operator_t op = nullptr; - const xnn_status status = xnn_create_tanh_nc_f32(/*flags=*/0, &op); - if (status == xnn_status_unsupported_hardware) { - GTEST_SKIP(); - } - - ASSERT_EQ(xnn_status_success, status); - ASSERT_NE(nullptr, op); - std::unique_ptr auto_op(op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, xnn_reshape_tanh_nc_f32(op, batch_size, channels, channels, channels, /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, xnn_setup_tanh_nc_f32(op, input.data(), operator_output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Call subgraph API. 
- xnn_subgraph_t subgraph = nullptr; - ASSERT_EQ(xnn_status_success, xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph)); - std::unique_ptr auto_subgraph(subgraph, xnn_delete_subgraph); - input_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/0, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id)); - ASSERT_NE(input_id, XNN_INVALID_NODE_ID); - - output_id = XNN_INVALID_NODE_ID; - ASSERT_EQ( - xnn_status_success, xnn_define_tensor_value( - subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr, /*external_id=*/1, - /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id)); - ASSERT_NE(output_id, XNN_INVALID_NODE_ID); - - xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_tanh(subgraph, input_id, output_id, /*flags=*/0)); - ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); - ASSERT_NE(nullptr, runtime); - std::unique_ptr auto_runtime(runtime, xnn_delete_runtime); - std::array external = { - xnn_external_value{input_id, input.data()}, xnn_external_value{output_id, subgraph_output.data()}}; - ASSERT_EQ(xnn_status_success, xnn_setup_runtime(runtime, external.size(), external.data())); - ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime)); - - ASSERT_EQ(subgraph_output, operator_output); -} diff --git a/test/truncation-nc.cc b/test/truncation-nc.cc deleted file mode 100644 index 5307249524ee..000000000000 --- a/test/truncation-nc.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - - -#include - -#include "unary-operator-tester.h" - -namespace xnnpack { - -class TruncationOperatorTester : public UnaryOperatorTester { - public: - TruncationOperatorTester() : UnaryOperatorTester() { - range_f32_ = {0.0f, 5.0f}; - range_f16_ = {0.0f, 5.0f}; - } - - protected: - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - float RefFunc(float x) const override { return std::trunc(x); } - - CREATE_OP_OVERRIDES_F32(truncation); - CREATE_OP_OVERRIDES_F16(truncation); -}; - -CREATE_UNARY_FLOAT_TESTS(F32, TruncationOperatorTester); -CREATE_UNARY_FLOAT_TESTS(RunF32, TruncationOperatorTester); -#ifndef XNN_EXCLUDE_F16_TESTS -CREATE_UNARY_FLOAT_TESTS(F16, TruncationOperatorTester); -#endif // XNN_EXCLUDE_F16_TESTS - -}; // namespace xnnpack diff --git a/test/u32-f32-vcvt.cc b/test/u32-f32-vcvt.cc index 48ffb59b3ca1..26aa7cda5f7f 100644 --- a/test/u32-f32-vcvt.cc +++ b/test/u32-f32-vcvt.cc @@ -2,22 +2,18 @@ // // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/u32-f32-vcvt.yaml -// Generator: tools/generate-vcvt-test.py #include "xnnpack/microparams-init.h" #include "xnnpack/vcvt.h" -#include "vcvt-microkernel-tester.h" +#include "vunary-microkernel-tester.h" -#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ - datatype_in, datatype_out, params_type, init_params) \ -XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params);\ -XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); \ -XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, datatype_in, datatype_out, ukernel, init_params); +#define XNN_CVT_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, \ + datatype_in, datatype_out, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } #include "u32-f32-vcvt/u32-f32-vcvt.h" #undef XNN_CVT_UKERNEL_WITH_PARAMS diff --git a/test/u8-vclamp.cc b/test/u8-vclamp.cc index ebcec214a40a..85a32d1fe697 100644 --- a/test/u8-vclamp.cc +++ b/test/u8-vclamp.cc @@ -24,15 +24,50 @@ #include "next_prime.h" #include "vunary-microkernel-tester.h" -#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params)\ - \ -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ - \ -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); \ -XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ukernel, init_params); +using TestInfo = Clamp; + +#define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) \ + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); }\ + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } \ + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } \ +TEST(ukernel, clamp_min) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_end = batch_tile * batch_scale; \ + const size_t batch_step = \ + batch_scale == 1 ? 
std::max(1, batch_tile - 1) : batch_end - 1; \ + for (size_t min = 1; min < 255; min = xnnpack::NextPrime(min)) { \ + for (size_t batch_size = 1; batch_size <= 5 * batch_end; \ + batch_size += batch_step) { \ + xnn_unary_params params; \ + params.clamp.min = min; \ + params.clamp.max = 255; \ + VUnaryMicrokernelTester() \ + .batch_size(batch_size) \ + .Test(ukernel, init_params, params); \ + } \ + } \ +} \ + \ +TEST(ukernel, clamp_max) { \ + TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ + const size_t batch_scale = get_batch_scale(); \ + const size_t batch_end = batch_tile * batch_scale; \ + const size_t batch_step = \ + batch_scale == 1 ? std::max(1, batch_tile - 1) : batch_end - 1; \ + for (size_t max = 1; max < 255; max = xnnpack::NextPrime(max)) { \ + for (size_t batch_size = 1; batch_size <= 5 * batch_end; \ + batch_size += batch_step) { \ + xnn_unary_params params; \ + params.clamp.min = 0; \ + params.clamp.max = max; \ + VUnaryMicrokernelTester() \ + .batch_size(batch_size) \ + .Test(ukernel, init_params, params); \ + } \ + } \ +} #include "u8-vclamp/u8-vclamp.h" #undef XNN_UKERNEL_WITH_PARAMS diff --git a/test/unary-elementwise-nc.cc b/test/unary-elementwise-nc.cc new file mode 100644 index 000000000000..fb7b836d3eeb --- /dev/null +++ b/test/unary-elementwise-nc.cc @@ -0,0 +1,440 @@ +// Copyright 2020 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/log.h" +#include "xnnpack/math.h" +#include "xnnpack/operator.h" +#include "replicable_random_device.h" +#include "unary-ops.h" +#include "pthreadpool.h" + +enum class RunMode { + kCreateReshapeRun, + kEager, +}; + +struct UnaryOpTestParams { + UnaryOpTestParams(std::string test_name_, size_t batch_size_, + size_t channels_) + : test_name(test_name_), batch_size(batch_size_), channels(channels_) {} + + static UnaryOpTestParams UnitBatch() { + return UnaryOpTestParams("unit_batch", 1, 100); + } + static UnaryOpTestParams SmallBatch() { + return UnaryOpTestParams("small_batch", 3, 100); + } + static UnaryOpTestParams StridedBatch() { + return UnaryOpTestParams("strided_batch", 3, 100) + .InputStride(129) + .OutputStride(117); + } + UnaryOpTestParams& BatchSize(size_t batch_size) { + this->batch_size = batch_size; + return *this; + } + UnaryOpTestParams& Channels(size_t channels) { + this->channels = channels; + return *this; + } + UnaryOpTestParams& Iterations(size_t iterations) { + this->iterations = iterations; + return *this; + } + UnaryOpTestParams& InputStride(size_t input_stride) { + this->input_stride = input_stride; + return *this; + } + UnaryOpTestParams& OutputStride(size_t output_stride) { + this->output_stride = output_stride; + return *this; + } + UnaryOpTestParams& InputQuantization( + const xnn_quantization_params& input_quantization) { + this->input_quantization = input_quantization; + return *this; + } + UnaryOpTestParams& OutputQuantization( + const xnn_quantization_params& output_quantization) { + this->output_quantization = output_quantization; + return *this; + } + + std::string test_name; + size_t batch_size; + size_t iterations = 3; + size_t channels = 100; + size_t input_stride = 0; + size_t output_stride = 0; + xnn_quantization_params input_quantization = {0, 1.0f}; + xnn_quantization_params output_quantization = {0, 1.0f}; +}; + 
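+// Illustrative usage of the fluent builders above (a sketch for the reader,
+// not a case in this suite): a hypothetical strided configuration with 32
+// channels could be written as
+//
+//   UnaryOpTestParams params = UnaryOpTestParams::SmallBatch()
+//                                  .Channels(32)
+//                                  .InputStride(129)
+//                                  .OutputStride(117)
+//                                  .Iterations(5);
+//
+// A stride of 0 (the default) means "contiguous": the stride falls back to
+// `channels` when the test runs.
+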
+struct Param {
+  using UnaryT = std::tuple<xnn_unary_operator, xnn_datatype, RunMode>;
+  using ConvertT =
+      std::tuple<xnn_unary_operator, xnn_datatype, xnn_datatype, RunMode>;
+
+  explicit Param(UnaryT p)
+      : unary_operator(std::get<0>(p)),
+        input_datatype(std::get<1>(p)),
+        output_datatype(std::get<1>(p)),
+        run_mode(std::get<2>(p)) {}
+  explicit Param(ConvertT p)
+      : unary_operator(std::get<0>(p)),
+        input_datatype(std::get<1>(p)),
+        output_datatype(std::get<2>(p)),
+        run_mode(std::get<3>(p)) {}
+
+  std::string Name() const {
+    std::stringstream sstr;
+    sstr << xnn_unary_operator_to_string(unary_operator) << "_"
+         << xnn_datatype_to_string(input_datatype);
+    if (input_datatype != output_datatype) {
+      sstr << "_" << xnn_datatype_to_string(output_datatype);
+    }
+    if (run_mode == RunMode::kCreateReshapeRun) {
+      sstr << "_CreateReshapeRun";
+    } else if (run_mode == RunMode::kEager) {
+      sstr << "_Eager";
+    }
+    std::string s = sstr.str();
+    // Test names must be alphanumeric with no spaces
+    std::replace(s.begin(), s.end(), ' ', '_');
+    std::replace(s.begin(), s.end(), '(', '_');
+    std::replace(s.begin(), s.end(), ')', '_');
+    return s;
+  }
+
+  xnn_unary_operator unary_operator;
+  xnn_datatype input_datatype;
+  xnn_datatype output_datatype;
+  RunMode run_mode;
+};
+
+// These template parameters only exist to allow us to instantiate a subset of
+// the test suite at a time. We only want to try to run the quantized tests for
+// datatypes that are actually quantized.
+template <bool InputQuantized = false, bool OutputQuantized = false>
+class UnaryNCTestT : public testing::TestWithParam<Param> {
+ public:
+  xnnpack::ReplicableRandomDevice rng_;
+
+  template <typename In, typename Out>
+  void RunUnaryTest(const UnaryOpTestParams& test_params, const Param& param,
+                    bool eager) {
+    ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
+
+    const xnn_unary_operator unary_op = param.unary_operator;
+    const xnn_datatype input_datatype = param.input_datatype;
+    const xnn_datatype output_datatype = param.output_datatype;
+
+    const UnaryOpInfo* op_info = GetUnaryOpInfo(unary_op);
+    const xnn_unary_params op_params = op_info->DefaultParams();
+    xnn_quantization_params input_quantization =
+        InputQuantized ? test_params.input_quantization
+                       : op_info->InputQuantizationParams(input_datatype);
+    xnn_quantization_params output_quantization =
+        OutputQuantized ? test_params.output_quantization
+                        : op_info->OutputQuantizationParams(output_datatype);
+
+    Interval domain = op_info->Domain(input_datatype);
+
+    const size_t batch_size = test_params.batch_size;
+    const size_t iterations = test_params.iterations;
+    const size_t channels = test_params.channels;
+    const size_t input_stride =
+        test_params.input_stride == 0 ? channels : test_params.input_stride;
+    const size_t output_stride =
+        test_params.output_stride == 0 ? channels : test_params.output_stride;
+    xnnpack::Buffer<In> input(XNN_EXTRA_BYTES / sizeof(In) +
+                              (batch_size - 1) * input_stride + channels);
+    xnnpack::Buffer<Out> output((batch_size - 1) * output_stride + channels);
+    xnnpack::Buffer<float> output_ref(batch_size * channels);
+    for (size_t iteration = 0; iteration < iterations; iteration++) {
+      for (size_t i = 0; i < batch_size; i++) {
+        FillRandom(rng_, input.data() + i * input_stride, channels, domain,
+                   input_quantization);
+
+        // Compute reference results.
+        UnaryReferenceImpl(input.data() + i * input_stride, channels,
+                           output_ref.data() + i * channels, *op_info,
+                           input_quantization, output_quantization, op_params);
+      }
+
+      if (eager) {
+        xnn_status status = xnn_run_unary_elementwise_nc(
+            unary_op, input_datatype, output_datatype, &op_params,
+            &input_quantization, &output_quantization,
+            /*flags=*/0, batch_size, channels, input_stride, output_stride,
+            /*threadpool=*/nullptr, input.data(), output.data());
+        if (status == xnn_status_unsupported_parameter) {
+          GTEST_SKIP();
+          return;
+        }
+        ASSERT_EQ(xnn_status_success, status);
+      } else {
+        xnn_operator_t op = nullptr;
+        xnn_status status = xnn_create_unary_elementwise_nc(
+            unary_op, input_datatype, output_datatype, &op_params,
+            &input_quantization, &output_quantization,
+            /*flags=*/0, &op);
+        if (status == xnn_status_unsupported_parameter) {
+          GTEST_SKIP();
+          return;
+        }
+        ASSERT_EQ(xnn_status_success, status);
+        ASSERT_NE(nullptr, op);
+
+        // Smart pointer to automatically delete op.
+        std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(
+            op, xnn_delete_operator);
+
+        ASSERT_EQ(xnn_status_success,
+                  xnn_reshape_unary_elementwise_nc(op, batch_size, channels,
+                                                   input_stride, output_stride,
+                                                   /*threadpool=*/nullptr));
+        ASSERT_EQ(xnn_status_success, xnn_setup_unary_elementwise_nc(
+                                          op, input.data(), output.data()));
+        ASSERT_EQ(xnn_status_success,
+                  xnn_run_operator(op, /*threadpool=*/nullptr));
+      }
+
+      // Verify results.
+      for (size_t i = 0; i < batch_size; i++) {
+        for (size_t c = 0; c < channels; c++) {
+          const float y = output[i * output_stride + c];
+          const float y_ref = output_ref[i * channels + c];
+          ASSERT_NEAR(y, y_ref, op_info->Tolerance(y_ref, output_datatype));
+        }
+      }
+    }
+  }
+
+  template <typename In>
+  void RunUnaryTest(const UnaryOpTestParams& test_params, const Param& param,
+                    bool eager) {
+    switch (param.output_datatype) {
+      case xnn_datatype_fp16:
+        RunUnaryTest<In, xnn_float16>(test_params, param, eager);
+        break;
+      case xnn_datatype_fp32:
+        RunUnaryTest<In, float>(test_params, param, eager);
+        break;
+      case xnn_datatype_int32:
+        RunUnaryTest<In, int32_t>(test_params, param, eager);
+        break;
+      case xnn_datatype_quint8:
+        RunUnaryTest<In, uint8_t>(test_params, param, eager);
+        break;
+      case xnn_datatype_qint8:
+        RunUnaryTest<In, int8_t>(test_params, param, eager);
+        break;
+      default:
+        XNN_UNREACHABLE;
+    }
+  }
+
+  void RunUnaryTest(const UnaryOpTestParams& test_params, const Param& param,
+                    bool eager) {
+    switch (param.input_datatype) {
+      case xnn_datatype_fp16:
+        RunUnaryTest<xnn_float16>(test_params, param, eager);
+        break;
+      case xnn_datatype_fp32:
+        RunUnaryTest<float>(test_params, param, eager);
+        break;
+      case xnn_datatype_int32:
+        RunUnaryTest<int32_t>(test_params, param, eager);
+        break;
+      case xnn_datatype_quint8:
+        RunUnaryTest<uint8_t>(test_params, param, eager);
+        break;
+      case xnn_datatype_qint8:
+        RunUnaryTest<int8_t>(test_params, param, eager);
+        break;
+      default:
+        XNN_UNREACHABLE;
+    }
+  }
+};
+
+using UnaryNCTest = UnaryNCTestT<>;
+using UnaryNCTest_InputQuantized =
+    UnaryNCTestT</*InputQuantized=*/true>;
+using UnaryNCTest_OutputQuantized =
+    UnaryNCTestT</*InputQuantized=*/false, /*OutputQuantized=*/true>;
+
+TEST_P(UnaryNCTest, UnitBatch) {
+  for (size_t c = 0; c < 100; c += 15) {
+    RunUnaryTest(UnaryOpTestParams::UnitBatch().Channels(c), GetParam(),
+                 /*eager=*/false);
+  }
+}
+
+TEST_P(UnaryNCTest, SmallBatch) {
+  for (size_t c = 0; c < 100; c += 15) {
+    RunUnaryTest(UnaryOpTestParams::SmallBatch().Channels(c), GetParam(),
+                 /*eager=*/false);
+  }
+}
+
+TEST_P(UnaryNCTest, SmallBatch_InputStride) {
+  for (size_t c = 0; c < 100; c += 15) {
+    RunUnaryTest(UnaryOpTestParams::UnitBatch().Channels(c).InputStride(129),
+                 GetParam(), /*eager=*/false);
+  }
+}
+
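+// For reference, a minimal sketch of the create/reshape/setup/run flow that
+// the non-eager path above exercises (error handling omitted; f32 abs over
+// `n` contiguous elements is an assumed example, not part of this suite):
+//
+//   xnn_unary_params params;                    // abs ignores its params
+//   xnn_quantization_params quant = {0, 1.0f};  // identity quantization
+//   xnn_operator_t op = nullptr;
+//   xnn_create_unary_elementwise_nc(xnn_unary_abs, xnn_datatype_fp32,
+//                                   xnn_datatype_fp32, &params, &quant,
+//                                   &quant, /*flags=*/0, &op);
+//   xnn_reshape_unary_elementwise_nc(op, /*batch_size=*/n, /*channels=*/1,
+//                                    /*input_stride=*/1, /*output_stride=*/1,
+//                                    /*threadpool=*/nullptr);
+//   xnn_setup_unary_elementwise_nc(op, input, output);
+//   xnn_run_operator(op, /*threadpool=*/nullptr);
+//   xnn_delete_operator(op);
+//
+// The eager path collapses all of the above into a single call to
+// xnn_run_unary_elementwise_nc.
+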
+TEST_P(UnaryNCTest, UnitBatch_OutputStride) {
+  for (size_t c = 0; c < 100; c += 15) {
+    RunUnaryTest(UnaryOpTestParams::UnitBatch().Channels(c).OutputStride(117),
+                 GetParam(), /*eager=*/false);
+  }
+}
+
+TEST_P(UnaryNCTest, StridedBatch) {
+  for (size_t c = 0; c < 100; c += 15) {
+    RunUnaryTest(UnaryOpTestParams::StridedBatch().Channels(c), GetParam(),
+                 /*eager=*/false);
+  }
+}
+
+std::vector<int> ZeroPoints(xnn_datatype datatype) {
+  switch (datatype) {
+    case xnn_datatype_qint8:
+      return {-128, -127, -1, 0, 1, 126, 127};
+    case xnn_datatype_quint8:
+      return {0, 1, 127, 128, 129, 254, 255};
+    default:
+      XNN_UNREACHABLE;
+  }
+}
+
+TEST_P(UnaryNCTest_InputQuantized, InputQuantized) {
+  for (int zero_point : ZeroPoints(GetParam().input_datatype)) {
+    for (float scale : {1.0e-2f, 1.0e2f, 10.0f}) {
+      RunUnaryTest(
+          UnaryOpTestParams::UnitBatch().InputQuantization({zero_point, scale}),
+          GetParam(),
+          /*eager=*/false);
+    }
+  }
+}
+
+TEST_P(UnaryNCTest_OutputQuantized, OutputQuantized) {
+  for (int zero_point : ZeroPoints(GetParam().output_datatype)) {
+    for (float scale : {1.0e-2f, 1.0e2f, 10.0f}) {
+      RunUnaryTest(UnaryOpTestParams::UnitBatch().OutputQuantization(
+                       {zero_point, scale}),
+                   GetParam(),
+                   /*eager=*/false);
+    }
+  }
+}
+
+xnn_unary_operator all_unary_ops[] = {
+    xnn_unary_clamp,
+    xnn_unary_abs,
+    xnn_unary_bankers_rounding,
+    xnn_unary_ceiling,
+    xnn_unary_elu,
+    xnn_unary_exp,
+    xnn_unary_floor,
+    xnn_unary_gelu,
+    xnn_unary_hardswish,
+    xnn_unary_leaky_relu,
+    xnn_unary_log,
+    xnn_unary_negate,
+    xnn_unary_sigmoid,
+    xnn_unary_square,
+    xnn_unary_square_root,
+    xnn_unary_reciprocal_square_root,
+    xnn_unary_tanh,
+};
+
+xnn_datatype all_datatypes[] = {
+    xnn_datatype_quint8, xnn_datatype_qint8, xnn_datatype_fp16,
+    xnn_datatype_fp32,   xnn_datatype_int32,
+};
+
+xnn_datatype quantized_datatypes[] = {
+    xnn_datatype_quint8,
+    xnn_datatype_qint8,
+};
+
+xnn_datatype unquantized_datatypes[] = {
+    xnn_datatype_fp16,
+    xnn_datatype_fp32,
+    xnn_datatype_int32,
+};
+
+RunMode run_modes[] = {RunMode::kCreateReshapeRun, RunMode::kEager};
+
+// Run non-quantized tests on all unary ops and all datatypes.
+INSTANTIATE_TEST_SUITE_P(UnaryNCTest, UnaryNCTest,
+                         testing::ConvertGenerator<Param::UnaryT>(
+                             testing::Combine(testing::ValuesIn(all_unary_ops),
+                                              testing::ValuesIn(all_datatypes),
+                                              testing::ValuesIn(run_modes))),
+                         [](const auto& info) { return info.param.Name(); });
+
+// Run quantized input and output tests on all unary ops and all quantized
+// datatypes.
+INSTANTIATE_TEST_SUITE_P(
+    UnaryNCTest_InputQuantized, UnaryNCTest_InputQuantized,
+    testing::ConvertGenerator<Param::UnaryT>(testing::Combine(
+        testing::ValuesIn(all_unary_ops),
+        testing::ValuesIn(quantized_datatypes), testing::ValuesIn(run_modes))),
+    [](const auto& info) { return info.param.Name(); });
+
+INSTANTIATE_TEST_SUITE_P(
+    UnaryNCTest_OutputQuantized, UnaryNCTest_OutputQuantized,
+    testing::ConvertGenerator<Param::UnaryT>(testing::Combine(
+        testing::ValuesIn(all_unary_ops),
+        testing::ValuesIn(quantized_datatypes), testing::ValuesIn(run_modes))),
+    [](const auto& info) { return info.param.Name(); });
+
+// Run non-quantized tests on all possible convert datatype combinations.
+INSTANTIATE_TEST_SUITE_P(
+    ConvertNCTest, UnaryNCTest,
+    testing::ConvertGenerator<Param::ConvertT>(testing::Combine(
+        testing::Values(xnn_unary_convert), testing::ValuesIn(all_datatypes),
+        testing::ValuesIn(all_datatypes), testing::ValuesIn(run_modes))),
+    [](const auto& info) { return info.param.Name(); });
+
+// Run quantized input conversions.
+INSTANTIATE_TEST_SUITE_P(
+    ConvertNCTest_InputQuantized, UnaryNCTest_InputQuantized,
+    testing::ConvertGenerator<Param::ConvertT>(testing::Combine(
+        testing::Values(xnn_unary_convert),
+        testing::ValuesIn(quantized_datatypes),
+        testing::ValuesIn(all_datatypes), testing::ValuesIn(run_modes))),
+    [](const auto& info) { return info.param.Name(); });
+
+// Run quantized output conversions.
+INSTANTIATE_TEST_SUITE_P(
+    ConvertNCTest_OutputQuantized, UnaryNCTest_OutputQuantized,
+    testing::ConvertGenerator<Param::ConvertT>(testing::Combine(
+        testing::Values(xnn_unary_convert), testing::ValuesIn(all_datatypes),
+        testing::ValuesIn(quantized_datatypes), testing::ValuesIn(run_modes))),
+    [](const auto& info) { return info.param.Name(); });
diff --git a/test/unary-operator-tester.cc b/test/unary-operator-tester.cc
deleted file mode 100644
index 83044400c0ab..000000000000
--- a/test/unary-operator-tester.cc
+++ /dev/null
@@ -1,280 +0,0 @@
-// Copyright 2024 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include "unary-operator-tester.h"
-
-#include 
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include "xnnpack.h"
-#include "xnnpack/buffer.h"
-#include "xnnpack/math.h"
-#include "replicable_random_device.h"
-
-namespace xnnpack {
-
-void UnaryOperatorTester::TestF16() {
-  xnnpack::ReplicableRandomDevice rng;
-  std::uniform_real_distribution<float> f32dist(range_f16_.first,
-                                                range_f16_.second);
-
-  xnnpack::Buffer<xnn_float16> input(XNN_EXTRA_BYTES / sizeof(xnn_float16) +
-                                     (batch_size() - 1) * input_stride() + channels());
-  xnnpack::Buffer<xnn_float16> output((batch_size() - 1) * output_stride() +
-                                      channels());
-  xnnpack::Buffer<float> output_ref(batch_size() * channels());
-  for (size_t iteration = 0; iteration < iterations(); iteration++) {
-    std::generate(input.begin(), input.end(),
-                  [&]() { return f32dist(rng); });
-
-    // Compute reference results.
-    for (size_t i = 0; i < batch_size(); i++) {
-      for (size_t c = 0; c < channels(); c++) {
-        output_ref[i * channels() + c] =
-            RefFunc(input[i * input_stride() + c]);
-      }
-    }
-
-    // Create, setup, run, and destroy Square operator.
-    ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
-    xnn_operator_t op = nullptr;
-
-    const xnn_status status = CreateOpF16(0, &op);
-    if (status == xnn_status_unsupported_hardware) {
-      GTEST_SKIP();
-    }
-    ASSERT_EQ(xnn_status_success, status);
-    ASSERT_NE(nullptr, op);
-
-    // Smart pointer to automatically delete op.
-    std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_op(
-        op, xnn_delete_operator);
-
-    ASSERT_EQ(xnn_status_success,
-              ReshapeOpF16(op, batch_size(), channels(), input_stride(),
-                           output_stride(), /*threadpool=*/nullptr));
-    ASSERT_EQ(xnn_status_success, SetupOpF16(op, input.data(), output.data()));
-    ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr));
-
-    // Verify results.
- for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const float y = output[i * output_stride() + c]; - const float y_ref = output_ref[i * channels() + c]; - CheckResultF16(y, y_ref, i, c, input[i * input_stride() + c]); - } - } - } -} - -void UnaryOperatorTester::TestF32() { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(range_f32_.first, - range_f32_.second); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(float) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - output_ref[i * channels() + c] = RefFunc(input[i * input_stride() + c]); - } - } - - // Create, setup, run, and destroy Square operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - xnn_operator_t op = nullptr; - - ASSERT_EQ(xnn_status_success, CreateOpF32(0, &op)); - ASSERT_NE(nullptr, op); - - // Smart pointer to automatically delete op. - std::unique_ptr auto_op( - op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, - ReshapeOpF32(op, batch_size(), channels(), input_stride(), - output_stride(), /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, SetupOpF32(op, input.data(), output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const float y = output[i * output_stride() + c]; - const float y_ref = output_ref[i * channels() + c]; - CheckResultF32(y, y_ref, i, c, input[i * input_stride() + c]); - } - } - } -} - -void UnaryOperatorTester::TestRunF32() { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(range_f32_.first, - range_f32_.second); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(float) + - (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - output_ref[i * channels() + c] = RefFunc(input[i * input_stride() + c]); - } - } - - // Initialize and run Square Root operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr)); - - ASSERT_EQ( - xnn_status_success, - RunOpF32(channels(), input_stride(), output_stride(), batch_size(), - input.data(), output.data(), 0, /*threadpool=*/nullptr)); - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const float y = output[i * output_stride() + c]; - const float y_ref = output_ref[i * channels() + c]; - CheckResultF32(y, y_ref, i, c, input[i * input_stride() + c]); - } - } - } -} - -void UnaryOperatorTester::TestQS8() { - xnnpack::ReplicableRandomDevice rng; - auto i8rng = [&]() -> int8_t { - return std::uniform_int_distribution(range_qs8_.first, - range_qs8_.second)(rng); - }; - - xnnpack::Buffer input((batch_size() - 1) * input_stride() + channels() + - XNN_EXTRA_BYTES / sizeof(int8_t)); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), i8rng); - - // Compute reference results, which are stored as un-truncated quantized - // values. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const float x = FloatFromInputQS8(input[i * input_stride() + c]); - const float ref_x = RefFunc(x); - output_ref[i * channels() + c] = QuantizeAsFloatQS8(ref_x); - } - } - - // Create, setup, run, and destroy the operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t op = nullptr; - - ASSERT_EQ(xnn_status_success, - CreateOpQS8( - static_cast(input_zero_point() - 0x80), input_scale(), - static_cast(output_zero_point() - 0x80), - output_scale(), static_cast(qmin() - 0x80), - static_cast(qmax() - 0x80), /*flags=*/0, &op)); - ASSERT_NE(nullptr, op); - - // Smart pointer to automatically delete `op`. - std::unique_ptr auto_op( - op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, ReshapeOpQS8(op, batch_size(), channels(), - input_stride(), output_stride(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, SetupOpQS8(op, input.data(), output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const int8_t y = output[i * output_stride() + c]; - const float y_ref = output_ref[i * channels() + c]; - CheckResultQS8(y, y_ref, i, c, input[i * input_stride() + c]); - } - } - } -} - -void UnaryOperatorTester::TestQU8() { - xnnpack::ReplicableRandomDevice rng; - auto u8rng = [&]() -> uint8_t { - return std::uniform_int_distribution(range_qu8_.first, - range_qu8_.second)(rng); - }; - - xnnpack::Buffer input((batch_size() - 1) * input_stride() + channels() + - XNN_EXTRA_BYTES / sizeof(uint8_t)); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + - channels()); - xnnpack::Buffer output_ref(batch_size() * channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), u8rng); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const float x = FloatFromInputQU8(input[i * input_stride() + c]); - const float ref_x = RefFunc(x); - output_ref[i * channels() + c] = QuantizeAsFloatQU8(ref_x); - } - } - - // Create, setup, run, and destroy the operator. 
- ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t op = nullptr; - - ASSERT_EQ( - xnn_status_success, - CreateOpQU8(input_zero_point(), input_scale(), output_zero_point(), - output_scale(), qmin(), qmax(), /*flags=*/0, &op)); - ASSERT_NE(nullptr, op); - - // Smart pointer to automatically delete `op`. - std::unique_ptr auto_op( - op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, ReshapeOpQU8(op, batch_size(), channels(), - input_stride(), output_stride(), - /*threadpool=*/nullptr)); - ASSERT_EQ(xnn_status_success, SetupOpQU8(op, input.data(), output.data())); - ASSERT_EQ(xnn_status_success, xnn_run_operator(op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t c = 0; c < channels(); c++) { - const uint8_t y = output[i * output_stride() + c]; - const float y_ref = output_ref[i * channels() + c]; - CheckResultQU8(y, y_ref, i, c, input[i * input_stride() + c]); - } - } - } -} - -}; // namespace xnnpack diff --git a/test/unary-operator-tester.h b/test/unary-operator-tester.h deleted file mode 100644 index 4b8715a24344..000000000000 --- a/test/unary-operator-tester.h +++ /dev/null @@ -1,719 +0,0 @@ -// Copyright 2024 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#ifndef __XNNPACK_TEST_UNARY_OPERATOR_TESTER_H_ -#define __XNNPACK_TEST_UNARY_OPERATOR_TESTER_H_ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "pthreadpool.h" - -namespace xnnpack { - -class UnaryOperatorTester { - public: - virtual ~UnaryOperatorTester() = default; - - UnaryOperatorTester& channels(size_t channels) { - assert(channels != 0); - channels_ = channels; - return *this; - } - - UnaryOperatorTester& input_stride(size_t input_stride) { - assert(input_stride != 0); - input_stride_ = input_stride; - return *this; - } - - UnaryOperatorTester& output_stride(size_t output_stride) { - assert(output_stride != 0); - output_stride_ = output_stride; - return *this; - } - - UnaryOperatorTester& batch_size(size_t batch_size) { - assert(batch_size != 0); - batch_size_ = batch_size; - return *this; - } - - UnaryOperatorTester& iterations(size_t iterations) { - iterations_ = iterations; - return *this; - } - - UnaryOperatorTester& input_scale(float input_scale) { - assert(input_scale > 0.0f); - assert(std::isnormal(input_scale)); - input_scale_ = input_scale; - return *this; - } - - UnaryOperatorTester& input_zero_point(int16_t input_zero_point) { - input_zero_point_ = input_zero_point; - return *this; - } - - UnaryOperatorTester& output_scale(float output_scale) { - assert(output_scale > 0.0f); - assert(std::isnormal(output_scale)); - output_scale_ = output_scale; - return *this; - } - - UnaryOperatorTester& output_zero_point(int16_t output_zero_point) { - output_zero_point_ = output_zero_point; - return *this; - } - - UnaryOperatorTester& qmin(int16_t qmin) { - qmin_ = qmin; - return *this; - } - - UnaryOperatorTester& qmax(int16_t qmax) { - qmax_ = qmax; - return *this; - } - - size_t channels() const { return channels_; } - - size_t input_stride() const { - if (input_stride_ == 0) { - return channels_; - } else { - assert(input_stride_ >= channels_); - return input_stride_; - } - } - - size_t output_stride() const { - if (output_stride_ == 0) { - return channels_; - } else { - 
assert(output_stride_ >= channels_); - return output_stride_; - } - } - - size_t batch_size() const { return batch_size_; } - - size_t iterations() const { return iterations_; } - - float input_scale() const { return input_scale_; } - - int16_t input_zero_point() const { return input_zero_point_; } - - float output_scale() const { return output_scale_; } - - int16_t output_zero_point() const { return output_zero_point_; } - - int16_t qmin() const { return qmin_; } - - int16_t qmax() const { return qmax_; } - - // Converters between float and quantized types. - float FloatFromInputQS8(int8_t x) const { - return input_scale() * (static_cast(x) - - static_cast(input_zero_point() - 0x80)); - } - float FloatFromInputQU8(uint8_t x) const { - return input_scale() * - (static_cast(x) - static_cast(input_zero_point())); - } - float QuantizeAsFloatQS8(float x) const { - float y = - x / output_scale() + static_cast(output_zero_point() - 0x80); - y = std::min(y, qmax() - 0x80); - y = std::max(y, qmin() - 0x80); - return y; - } - float QuantizeAsFloatQU8(float x) const { - float y = x / output_scale() + static_cast(output_zero_point()); - y = std::min(y, qmax()); - y = std::max(y, qmin()); - return y; - } - - virtual void TestF16(); - virtual void TestF32(); - virtual void TestRunF32(); - virtual void TestQS8(); - virtual void TestQU8(); - - protected: - UnaryOperatorTester() = default; - - // Computes the expected result for some input `x`. Subclasses should override - // this function with their own reference function. - virtual float RefFunc(float x) const = 0; - - // Computes the absolute tolerance for a reference value `y_ref`. Tests will - // fail when `std::abs(y - y_ref) > AbsTol32(y_ref)`. - // Note that for `fp16` tests, both `y` and `y_ref` will be converted to - // `float` for the tolerance evaluation. - virtual float AbsTolF32(float y_ref) const { return 0.0f; }; - virtual float AbsTolF16(float y_ref) const { return 0.0f; }; - - // For the `QSU` and `QU8` tests, `y_ref` is the reference value transformed - // to the quantization range, e.g. `[qmin(), qmax()]` for `QU8` (see - // `QuantizeAsFloatQS8` and `QuantizeAsFloatQU8`). - virtual float AbsTolQS8(float y_ref) const { return 0.0f; }; - virtual float AbsTolQU8(float y_ref) const { return 0.0f; }; - - // Check the results for each datatype. Override these functions to perform - // additional checks. 
- virtual void CheckResultF32(float y, float y_ref, size_t batch, - size_t channel, float input) const { - EXPECT_NEAR(y_ref, y, AbsTolF32(y_ref)) - << "at batch " << batch << " / " << batch_size() << ", channel " - << channel << " / " << channels() << ", input " << input; - } - virtual void CheckResultF16(xnn_float16 y, float y_ref, size_t batch, - size_t channel, xnn_float16 input) const { - EXPECT_NEAR(y_ref, y, AbsTolF16(y_ref)) - << "at batch " << batch << " / " << batch_size() << ", channel " - << channel << " / " << channels() << ", input " - << input; - } - virtual void CheckResultQS8(int8_t y, float y_ref, size_t batch, - size_t channel, int8_t input) const { - EXPECT_NEAR(y_ref, static_cast(y), AbsTolQS8(y_ref)) - << "at batch " << batch << " / " << batch_size() << ", channel " - << channel << " / " << channels() << ", input " - << static_cast(input) << " (" << FloatFromInputQS8(input) - << ")"; - } - virtual void CheckResultQU8(uint8_t y, float y_ref, size_t batch, - size_t channel, uint8_t input) const { - EXPECT_NEAR(y_ref, static_cast(y), AbsTolQU8(y_ref)) - << "at batch " << batch << " / " << batch_size() << ", channel " - << channel << " / " << channels() << ", input " - << static_cast(input) << " (" << FloatFromInputQU8(input) - << ")"; - } - - // Wrappers for the create/reshape/setup/run functions of the underlying `f32` - // op, override these with calls to the actual op functions, e.g. using the - // `CREATE_OP_OVERRIDES_F32` macro defined below. - virtual xnn_status CreateOpF32(uint32_t flags, xnn_operator_t* op_out) const { - return xnn_status_invalid_parameter; - } - virtual xnn_status ReshapeOpF32(xnn_operator_t op, size_t batch_size, - size_t channels, size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) const { - return xnn_status_invalid_parameter; - } - virtual xnn_status SetupOpF32(xnn_operator_t op, const float* input, - float* output) const { - return xnn_status_invalid_parameter; - } - virtual xnn_status RunOpF32(size_t channels, size_t input_stride, - size_t output_stride, size_t batch_size, - const float* input, float* output, uint32_t flags, - pthreadpool_t threadpool) const { - return xnn_status_invalid_parameter; - } - - // Wrappers for the create/reshape/setup functions of the underlying `f16` - // op, override these with calls to the actual op functions, e.g. using the - // `CREATE_OP_OVERRIDES_F16` macro defined below. - virtual xnn_status CreateOpF16(uint32_t flags, xnn_operator_t* op_out) const { - return xnn_status_invalid_parameter; - } - virtual xnn_status ReshapeOpF16(xnn_operator_t op, size_t batch_size, - size_t channels, size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) const { - return xnn_status_invalid_parameter; - } - virtual xnn_status SetupOpF16(xnn_operator_t op, const void* input, - void* output) const { - return xnn_status_invalid_parameter; - } - - // Wrappers for the create/reshape/setup functions of the underlying `qs8` - // op, override these with calls to the actual op functions, e.g. using the - // `CREATE_OP_OVERRIDES_QS8` macro defined below. 
- virtual xnn_status CreateOpQS8(int8_t input_zero_point, float input_scale, - int8_t output_zero_point, float output_scale, - int8_t output_min, int8_t output_max, - uint32_t flags, xnn_operator_t* op_out) const { - return xnn_status_invalid_parameter; - } - virtual xnn_status ReshapeOpQS8(xnn_operator_t op, size_t batch_size, - size_t channels, size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) const { - return xnn_status_invalid_parameter; - } - virtual xnn_status SetupOpQS8(xnn_operator_t op, const int8_t* input, - int8_t* output) const { - return xnn_status_invalid_parameter; - } - - // Wrappers for the create/reshape/setup functions of the underlying `qu8` - // op, override these with calls to the actual op functions, e.g. using the - // `CREATE_OP_OVERRIDES_QU8` macro defined below. - virtual xnn_status CreateOpQU8(uint8_t input_zero_point, float input_scale, - uint8_t output_zero_point, float output_scale, - uint8_t output_min, uint8_t output_max, - uint32_t flags, xnn_operator_t* op_out) const { - return xnn_status_invalid_parameter; - } - virtual xnn_status ReshapeOpQU8(xnn_operator_t op, size_t batch_size, - size_t channels, size_t input_stride, - size_t output_stride, - pthreadpool_t threadpool) const { - return xnn_status_invalid_parameter; - } - virtual xnn_status SetupOpQU8(xnn_operator_t op, const uint8_t* input, - uint8_t* output) const { - return xnn_status_invalid_parameter; - } - - // Input ranges for the different type-dependent tests. - std::pair range_f32_ = {-10.0f, 10.0f}; - std::pair range_f16_ = {-10.0f, 10.0f}; - std::pair range_qs8_ = {std::numeric_limits::min(), - std::numeric_limits::max()}; - std::pair range_qu8_ = { - 0, std::numeric_limits::max()}; - - private: - size_t batch_size_ = 1; - size_t channels_ = 1; - size_t input_stride_ = 0; - size_t output_stride_ = 0; - float input_scale_ = 0.75f; - int16_t input_zero_point_ = 121; - float output_scale_ = 1.0f / 128.0f; - int16_t output_zero_point_ = 128; - int16_t qmin_ = 0; - int16_t qmax_ = 255; - size_t iterations_ = 15; -}; - -#define CREATE_OP_CREATE_OVERRIDE_F32(op_name) \ - xnn_status CreateOpF32(uint32_t flags, xnn_operator_t* op_out) \ - const override { \ - return xnn_create_##op_name##_nc_f32(flags, op_out); \ - } - -#define CREATE_OP_RESHAPE_OVERRIDE_F32(op_name) \ - xnn_status ReshapeOpF32(xnn_operator_t op, size_t batch_size, \ - size_t channels, size_t input_stride, \ - size_t output_stride, pthreadpool_t threadpool) \ - const override { \ - return xnn_reshape_##op_name##_nc_f32( \ - op, batch_size, channels, input_stride, output_stride, threadpool); \ - } - -#define CREATE_OP_SETUP_OVERRIDE_F32(op_name) \ - xnn_status SetupOpF32(xnn_operator_t op, const float* input, float* output) \ - const override { \ - return xnn_setup_##op_name##_nc_f32(op, input, output); \ - } - -#define CREATE_OP_RUN_OVERRIDE_F32(op_name) \ - xnn_status RunOpF32(size_t channels, size_t input_stride, \ - size_t output_stride, size_t batch_size, \ - const float* input, float* output, uint32_t flags, \ - pthreadpool_t threadpool) const override { \ - return xnn_run_##op_name##_nc_f32(channels, input_stride, output_stride, \ - batch_size, input, output, flags, \ - threadpool); \ - } - -#define CREATE_STANDARD_OP_OVERRIDES_F32(op_name) \ - CREATE_OP_CREATE_OVERRIDE_F32(op_name); \ - CREATE_OP_RESHAPE_OVERRIDE_F32(op_name); \ - CREATE_OP_SETUP_OVERRIDE_F32(op_name); - -#define CREATE_OP_OVERRIDES_F32(op_name) \ - CREATE_OP_CREATE_OVERRIDE_F32(op_name); \ - CREATE_OP_RESHAPE_OVERRIDE_F32(op_name); \ - 
CREATE_OP_SETUP_OVERRIDE_F32(op_name); \ - CREATE_OP_RUN_OVERRIDE_F32(op_name); - -#define CREATE_OP_CREATE_OVERRIDE_F16(op_name) \ - xnn_status CreateOpF16(uint32_t flags, xnn_operator_t* op_out) \ - const override { \ - return xnn_create_##op_name##_nc_f16(flags, op_out); \ - } - -#define CREATE_OP_RESHAPE_OVERRIDE_F16(op_name) \ - xnn_status ReshapeOpF16(xnn_operator_t op, size_t batch_size, \ - size_t channels, size_t input_stride, \ - size_t output_stride, pthreadpool_t threadpool) \ - const override { \ - return xnn_reshape_##op_name##_nc_f16( \ - op, batch_size, channels, input_stride, output_stride, threadpool); \ - } - -#define CREATE_OP_SETUP_OVERRIDE_F16(op_name) \ - xnn_status SetupOpF16(xnn_operator_t op, const void* input, void* output) \ - const override { \ - return xnn_setup_##op_name##_nc_f16(op, input, output); \ - } - -#define CREATE_OP_OVERRIDES_F16(op_name) \ - CREATE_OP_CREATE_OVERRIDE_F16(op_name); \ - CREATE_OP_RESHAPE_OVERRIDE_F16(op_name); \ - CREATE_OP_SETUP_OVERRIDE_F16(op_name); - -#define CREATE_OP_CREATE_OVERRIDE_QS8(op_name) \ - xnn_status CreateOpQS8(int8_t input_zero_point, float input_scale, \ - int8_t output_zero_point, float output_scale, \ - int8_t output_min, int8_t output_max, uint32_t flags, \ - xnn_operator_t* op_out) const override { \ - return xnn_create_##op_name##_nc_qs8( \ - input_zero_point, input_scale, output_zero_point, output_scale, \ - output_min, output_max, flags, op_out); \ - } - -#define CREATE_OP_RESHAPE_OVERRIDE_QS8(op_name) \ - xnn_status ReshapeOpQS8(xnn_operator_t op, size_t batch_size, \ - size_t channels, size_t input_stride, \ - size_t output_stride, pthreadpool_t threadpool) \ - const override { \ - return xnn_reshape_##op_name##_nc_qs8( \ - op, batch_size, channels, input_stride, output_stride, threadpool); \ - } - -#define CREATE_OP_SETUP_OVERRIDE_QS8(op_name) \ - xnn_status SetupOpQS8(xnn_operator_t op, const int8_t* input, \ - int8_t* output) const override { \ - return xnn_setup_##op_name##_nc_qs8(op, input, output); \ - } - -#define CREATE_OP_OVERRIDES_QS8(op_name) \ - CREATE_OP_CREATE_OVERRIDE_QS8(op_name); \ - CREATE_OP_RESHAPE_OVERRIDE_QS8(op_name); \ - CREATE_OP_SETUP_OVERRIDE_QS8(op_name); - -#define CREATE_OP_CREATE_OVERRIDE_QU8(op_name) \ - xnn_status CreateOpQU8( \ - uint8_t input_zero_point, float input_scale, uint8_t output_zero_point, \ - float output_scale, uint8_t output_min, uint8_t output_max, \ - uint32_t flags, xnn_operator_t* op_out) const override { \ - return xnn_create_##op_name##_nc_qu8( \ - input_zero_point, input_scale, output_zero_point, output_scale, \ - output_min, output_max, flags, op_out); \ - } - -#define CREATE_OP_RESHAPE_OVERRIDE_QU8(op_name) \ - xnn_status ReshapeOpQU8(xnn_operator_t op, size_t batch_size, \ - size_t channels, size_t input_stride, \ - size_t output_stride, pthreadpool_t threadpool) \ - const override { \ - return xnn_reshape_##op_name##_nc_qu8( \ - op, batch_size, channels, input_stride, output_stride, threadpool); \ - } - -#define CREATE_OP_SETUP_OVERRIDE_QU8(op_name) \ - xnn_status SetupOpQU8(xnn_operator_t op, const uint8_t* input, \ - uint8_t* output) const override { \ - return xnn_setup_##op_name##_nc_qu8(op, input, output); \ - } - -#define CREATE_OP_OVERRIDES_QU8(op_name) \ - CREATE_OP_CREATE_OVERRIDE_QU8(op_name); \ - CREATE_OP_RESHAPE_OVERRIDE_QU8(op_name); \ - CREATE_OP_SETUP_OVERRIDE_QU8(op_name); - -template -struct LoopLimits { - T min; - T max; - T stride; - std::string ToString() const { - return "[" + std::to_string(min) + ":" + std::to_string(stride) 
+ ":" + - std::to_string(max) + "]"; - } -}; - -// Mimics the behaviour of `std::optional`, which is only available as of C++17. -template -class Optional { - public: - Optional() = default; - explicit Optional(T value) : has_value_(true), value_(value) {} - Optional& operator=(const T& other) { - has_value_ = true; - value_ = other; - return *this; - } - - // Clears the value if it was set. - void reset() { - if (has_value_) { - value_ = T(); - has_value_ = false; - } - } - - // Accessors to check whether a value has been set. - bool has_value() const { return has_value_; } - explicit operator bool() const { return has_value_; } - - // Accessors to access the value. - T& value() { return value_; } - const T& value() const { return value_; } - T& operator*() { return value_; } - const T& operator*() const { return value_; } - T* operator->() { return &value_; } - const T* operator->() const { return &value_; } - - private: - bool has_value_ = false; - T value_; -}; - -struct UnaryOpTestParams { - UnaryOpTestParams(std::string test_name_, size_t batch_size_, - LoopLimits channels_) - : test_name(test_name_), batch_size(batch_size_), channels(channels_) {} - - static UnaryOpTestParams UnitBatch() { - return UnaryOpTestParams("unit_batch", 1, LoopLimits{1, 100, 15}); - } - static UnaryOpTestParams SmallBatch() { - return UnaryOpTestParams("small_batch", 3, LoopLimits{1, 100, 15}); - } - static UnaryOpTestParams StridedBatch() { - return UnaryOpTestParams("strided_batch", 3, LoopLimits{1, 100, 15}) - .InputStride(129) - .OutputStride(117); - } - UnaryOpTestParams& BatchSize(size_t batch_size) { - this->batch_size = batch_size; - return *this; - } - UnaryOpTestParams& Channels(LoopLimits channels) { - this->channels = channels; - return *this; - } - UnaryOpTestParams& Iterations(size_t iterations) { - this->iterations = iterations; - return *this; - } - UnaryOpTestParams& InputStride(size_t input_stride) { - this->input_stride = input_stride; - return *this; - } - UnaryOpTestParams& OutputStride(size_t output_stride) { - this->output_stride = output_stride; - return *this; - } - UnaryOpTestParams& Qmin(uint8_t qmin) { - this->qmin = qmin; - return *this; - } - UnaryOpTestParams& Qmax(uint8_t qmax) { - this->qmax = qmax; - return *this; - } - UnaryOpTestParams& InputScale(LoopLimits input_scale) { - this->input_scale = input_scale; - return *this; - } - UnaryOpTestParams& InputZeroPoint(LoopLimits input_zero_point) { - this->input_zero_point = input_zero_point; - return *this; - } - - std::string ToString() const { - std::string result = test_name; - if (input_stride && !output_stride) { - result += "_with_input_stride"; - } else if (!input_stride && output_stride) { - result += "_with_output_stride"; - } - if (qmin) { - result += "_with_qmin"; - } - if (qmax) { - result += "_with_qmax"; - } - if (input_zero_point) { - result += "_with_input_zero_point"; - } - if (input_scale) { - result += "_with_input_scale"; - } - return result; - } - - std::string test_name; - size_t batch_size; - size_t iterations = 3; - LoopLimits channels; - Optional input_stride; - Optional output_stride; - Optional qmin; - Optional qmax; - Optional> input_scale; - Optional> input_zero_point; -}; - -inline std::ostream& operator<<(std::ostream& os, UnaryOpTestParams params) { - os << "{test_name: '" << params.test_name - << "', batch_size: " << params.batch_size; - if (params.input_stride) { - os << ", input_stride: " << *params.input_stride; - } - if (params.output_stride) { - os << ", output_stride: " << 
*params.output_stride; - } - if (params.qmin) { - os << ", qmin: " << *params.qmin; - } - if (params.qmax) { - os << ", qmax: " << *params.qmax; - } - if (params.input_scale) { - os << ", input_scale: " << params.input_scale->ToString(); - } - if (params.input_zero_point) { - os << ", input_zero_point: " << params.input_zero_point->ToString(); - } - return os << "}"; -} - -#define CREATE_UNARY_TEST(datatype, Tester) \ - using Tester##datatype = testing::TestWithParam; \ - TEST_P(Tester##datatype, Test##datatype) { \ - const UnaryOpTestParams& test_case = GetParam(); \ - for (size_t channels = test_case.channels.min; \ - channels <= test_case.channels.max; \ - channels += test_case.channels.stride) { \ - LoopLimits input_scale_limits{1, 2, 2}; \ - if (test_case.input_scale) { \ - input_scale_limits = *test_case.input_scale; \ - } \ - for (float input_scale = input_scale_limits.min; \ - input_scale <= input_scale_limits.max; \ - input_scale *= input_scale_limits.stride) { \ - LoopLimits input_zero_point_limits{0, 1, 1}; \ - if (test_case.input_zero_point) { \ - input_zero_point_limits = *test_case.input_zero_point; \ - } \ - for (int32_t input_zero_point = input_zero_point_limits.min; \ - input_zero_point <= input_zero_point_limits.max; \ - input_zero_point += input_zero_point_limits.stride) { \ - Tester tester; \ - tester.batch_size(test_case.batch_size) \ - .channels(channels) \ - .iterations(test_case.iterations); \ - if (test_case.input_stride) { \ - tester.input_stride(*test_case.input_stride); \ - } \ - if (test_case.output_stride) { \ - tester.output_stride(*test_case.output_stride); \ - } \ - if (test_case.qmin) { \ - tester.qmin(*test_case.qmin); \ - } \ - if (test_case.qmax) { \ - tester.qmax(*test_case.qmax); \ - } \ - if (test_case.input_scale) { \ - tester.input_scale(input_scale); \ - } \ - if (test_case.input_zero_point) { \ - tester.input_zero_point(input_zero_point); \ - } \ - tester.Test##datatype(); \ - } \ - } \ - } \ - } - -#define CREATE_UNARY_FLOAT_TESTS(datatype, Tester) \ - CREATE_UNARY_TEST(datatype, Tester) \ - INSTANTIATE_TEST_SUITE_P( \ - datatype, Tester##datatype, \ - testing::ValuesIn({ \ - UnaryOpTestParams::UnitBatch(), \ - UnaryOpTestParams::SmallBatch(), \ - UnaryOpTestParams::SmallBatch().InputStride(129), \ - UnaryOpTestParams::SmallBatch().OutputStride(117), \ - UnaryOpTestParams::StridedBatch(), \ - }), \ - [](const testing::TestParamInfo& info) { \ - return info.param.ToString(); \ - }); - -#define CREATE_UNARY_QUANTIZED_TESTS(datatype, Tester) \ - CREATE_UNARY_TEST(datatype, Tester) \ - INSTANTIATE_TEST_SUITE_P( \ - datatype, Tester##datatype, \ - testing::ValuesIn({ \ - UnaryOpTestParams::UnitBatch(), \ - UnaryOpTestParams::UnitBatch().Qmin(128), \ - UnaryOpTestParams::UnitBatch().Qmax(128), \ - UnaryOpTestParams::UnitBatch().InputScale({1.0e-2f, 50.0f, 10.0f}), \ - UnaryOpTestParams::UnitBatch().InputZeroPoint({0, 255, 51}), \ - UnaryOpTestParams::SmallBatch(), \ - UnaryOpTestParams::SmallBatch().InputStride(129), \ - UnaryOpTestParams::SmallBatch().OutputStride(117), \ - UnaryOpTestParams::SmallBatch().Qmin(128), \ - UnaryOpTestParams::SmallBatch().Qmax(128), \ - UnaryOpTestParams::SmallBatch().InputScale({1.0e-2f, 50.0f, 10.0f}), \ - UnaryOpTestParams::SmallBatch().InputZeroPoint({0, 255, 51}), \ - UnaryOpTestParams::StridedBatch(), \ - UnaryOpTestParams::StridedBatch().Qmin(128), \ - UnaryOpTestParams::StridedBatch().Qmax(128), \ - UnaryOpTestParams::StridedBatch().InputScale( \ - {1.0e-2f, 50.0f, 10.0f}), \ - 
UnaryOpTestParams::StridedBatch().InputZeroPoint({0, 255, 51}), \
-      }), \
-      [](const testing::TestParamInfo<UnaryOpTestParams>& info) { \
-        return info.param.ToString(); \
-      });
-
-#define CREATE_UNARY_QUANTIZED_TESTS_NO_QMIN(datatype, Tester) \
-  CREATE_UNARY_TEST(datatype, Tester) \
-  INSTANTIATE_TEST_SUITE_P( \
-      datatype, Tester##datatype, \
-      testing::ValuesIn({ \
-          UnaryOpTestParams::UnitBatch(), \
-          UnaryOpTestParams::UnitBatch().InputScale({1.0e-2f, 50.0f, 10.0f}), \
-          UnaryOpTestParams::UnitBatch().InputZeroPoint({0, 255, 51}), \
-          UnaryOpTestParams::SmallBatch(), \
-          UnaryOpTestParams::SmallBatch().InputStride(129), \
-          UnaryOpTestParams::SmallBatch().OutputStride(117), \
-          UnaryOpTestParams::SmallBatch().InputScale({1.0e-2f, 50.0f, 10.0f}), \
-          UnaryOpTestParams::SmallBatch().InputZeroPoint({0, 255, 51}), \
-          UnaryOpTestParams::StridedBatch(), \
-          UnaryOpTestParams::StridedBatch().InputScale( \
-              {1.0e-2f, 50.0f, 10.0f}), \
-          UnaryOpTestParams::StridedBatch().InputZeroPoint({0, 255, 51}), \
-      }), \
-      [](const testing::TestParamInfo<UnaryOpTestParams>& info) { \
-        return info.param.ToString(); \
-      });
-
-};  // namespace xnnpack
-
-#endif  // __XNNPACK_TEST_UNARY_OPERATOR_TESTER_H_
diff --git a/test/unary-ops.cc b/test/unary-ops.cc
new file mode 100644
index 000000000000..38a7d46f9ee3
--- /dev/null
+++ b/test/unary-ops.cc
@@ -0,0 +1,70 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "xnnpack.h"
+#include "xnnpack/common.h"
+#include "unary-ops.h"
+
+const UnaryOpInfo* GetUnaryOpInfo(xnn_unary_operator op) {
+  static Abs abs;
+  static Clamp clamp;
+  static Convert convert;
+  static ELU elu;
+  static Exp exp;
+  static GELU gelu;
+  static HardSwish hardswish;
+  static LeakyReLU leaky_relu;
+  static Log log;
+  static Negate negate;
+  static ReciprocalSquareRoot reciprocal_square_root;
+  static Sigmoid sigmoid;
+  static Square square;
+  static SquareRoot square_root;
+  static TanH tanh;
+  static RoundToNearestEven bankers_rounding;
+  static RoundUp ceiling;
+  static RoundDown floor;
+
+  switch (op) {
+    case xnn_unary_abs:
+      return &abs;
+    case xnn_unary_bankers_rounding:
+      return &bankers_rounding;
+    case xnn_unary_ceiling:
+      return &ceiling;
+    case xnn_unary_clamp:
+      return &clamp;
+    case xnn_unary_convert:
+      return &convert;
+    case xnn_unary_elu:
+      return &elu;
+    case xnn_unary_exp:
+      return &exp;
+    case xnn_unary_floor:
+      return &floor;
+    case xnn_unary_gelu:
+      return &gelu;
+    case xnn_unary_hardswish:
+      return &hardswish;
+    case xnn_unary_leaky_relu:
+      return &leaky_relu;
+    case xnn_unary_log:
+      return &log;
+    case xnn_unary_negate:
+      return &negate;
+    case xnn_unary_reciprocal_square_root:
+      return &reciprocal_square_root;
+    case xnn_unary_sigmoid:
+      return &sigmoid;
+    case xnn_unary_square:
+      return &square;
+    case xnn_unary_square_root:
+      return &square_root;
+    case xnn_unary_tanh:
+      return &tanh;
+    default:
+      XNN_UNREACHABLE;
+  }
+}
\ No newline at end of file
diff --git a/test/unary-ops.h b/test/unary-ops.h
new file mode 100644
index 000000000000..b326bb5bea7a
--- /dev/null
+++ b/test/unary-ops.h
@@ -0,0 +1,472 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#ifndef THIRD_PARTY_XNNPACK_TEST_UNARY_OPS_H_
+#define THIRD_PARTY_XNNPACK_TEST_UNARY_OPS_H_
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "xnnpack.h"
+#include "xnnpack/math.h"
+
+static float TolExact(float) { return 0.0f; }
+static float TolExact16(float y_ref) { return std::abs(y_ref) * 1.0e-3f; }
+
+static float TolRelative(float y_ref, float rel_tol) {
+  // Note that `y_ref * rel_tol`, i.e. the expected absolute difference,
+  // may round differently than `y_ref * (1 + rel_tol) - y_ref`, i.e. the
+  // effective absolute difference computed in `float`s. We therefore use
+  // the latter form since it is the true difference between two `float`s
+  // within the given relative tolerance.
+  return std::abs(y_ref * (1.0f + rel_tol)) - std::abs(y_ref);
+}
+
+static float TolMixed(float y_ref, float abs_tol, float rel_tol) {
+  return std::max(abs_tol,
+                  std::abs(y_ref) * (1.0f + rel_tol) - std::abs(y_ref));
+}
+
+struct Interval {
+  float min;
+  float max;
+
+  static Interval All() {
+    return {-std::numeric_limits<float>::infinity(),
+            std::numeric_limits<float>::infinity()};
+  }
+
+  static Interval Positive(xnn_datatype datatype) {
+    switch (datatype) {
+      case xnn_datatype_fp16:
+        return {0.001f, std::numeric_limits<float>::infinity()};
+      case xnn_datatype_fp32:
+        return {std::numeric_limits<float>::epsilon(),
+                std::numeric_limits<float>::infinity()};
+      default:
+        return {1.0f, std::numeric_limits<float>::infinity()};
+    }
+  }
+};
+
+// This struct describes a unary operator in enough detail that we can test it
+// without knowing anything about the specific operator.
+struct UnaryOpInfo {
+  virtual ~UnaryOpInfo() = default;
+
+  virtual float ReferenceImpl(float x,
+                              const xnn_unary_params& params) const = 0;
+
+  // Get the parameters to use by default for this operator.
+  virtual xnn_unary_params DefaultParams() const { return xnn_unary_params(); }
+
+  // Compute the tolerance for error given the reference result and the
+  // datatype.
+  virtual float Tolerance(float y_ref, xnn_datatype datatype) const {
+    switch (datatype) {
+      case xnn_datatype_qint8:
+      case xnn_datatype_quint8:
+        return 1;
+      case xnn_datatype_fp16:
+        return TolExact16(y_ref);
+      default:
+        return TolExact(y_ref);
+    }
+  }
+
+  virtual Interval Domain(xnn_datatype) const { return Interval::All(); }
+
+  // Quantization parameters to use by default.
+ virtual xnn_quantization_params InputQuantizationParams( + xnn_datatype datatype) const { + switch (datatype) { + case xnn_datatype_quint8: + return {150, 1.0f}; + default: + return {0, 1.0f}; + } + } + virtual xnn_quantization_params OutputQuantizationParams( + xnn_datatype datatype) const { + switch (datatype) { + case xnn_datatype_quint8: + return {100, 1.0f}; + default: + return {0, 1.0f}; + } + } +}; + +struct Convert : public UnaryOpInfo { + float ReferenceImpl(float x, const xnn_unary_params&) const override { + return x; + } +}; + +struct ReLU : public UnaryOpInfo { + float ReferenceImpl(float x, const xnn_unary_params&) const override { + return std::max(x, 0.0f); + } +}; + +struct Abs : public UnaryOpInfo { + float ReferenceImpl(float x, const xnn_unary_params&) const override { + return std::abs(x); + } +}; + +struct Negate : public UnaryOpInfo { + float ReferenceImpl(float x, const xnn_unary_params&) const override { + return -x; + } +}; + +struct Clamp : public UnaryOpInfo { + xnn_unary_params DefaultParams() const override { + xnn_unary_params params; + params.clamp.min = -40.0f; + params.clamp.max = 50.0f; + return params; + } + + float ReferenceImpl(float x, const xnn_unary_params& params) const override { + return std::min(std::max(x, params.clamp.min), + params.clamp.max); + } + + xnn_quantization_params InputQuantizationParams( + xnn_datatype datatype) const override { + return {0, 1.0f}; + } + xnn_quantization_params OutputQuantizationParams( + xnn_datatype datatype) const override { + return {0, 1.0f}; + } +}; + +struct ELU : public UnaryOpInfo { + xnn_unary_params DefaultParams() const override { + xnn_unary_params params; + params.elu.alpha = 1.0f; + return params; + } + + float ReferenceImpl(float x, const xnn_unary_params& params) const override { + return std::signbit(x) ? 
+struct Convert : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return x;
+  }
+};
+
+struct ReLU : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::max(x, 0.0f);
+  }
+};
+
+struct Abs : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::abs(x);
+  }
+};
+
+struct Negate : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return -x;
+  }
+};
+
+struct Clamp : public UnaryOpInfo {
+  xnn_unary_params DefaultParams() const override {
+    xnn_unary_params params;
+    params.clamp.min = -40.0f;
+    params.clamp.max = 50.0f;
+    return params;
+  }
+
+  float ReferenceImpl(float x, const xnn_unary_params& params) const override {
+    return std::min(std::max(x, params.clamp.min), params.clamp.max);
+  }
+
+  xnn_quantization_params InputQuantizationParams(
+      xnn_datatype datatype) const override {
+    return {0, 1.0f};
+  }
+  xnn_quantization_params OutputQuantizationParams(
+      xnn_datatype datatype) const override {
+    return {0, 1.0f};
+  }
+};
+
+struct ELU : public UnaryOpInfo {
+  xnn_unary_params DefaultParams() const override {
+    xnn_unary_params params;
+    params.elu.alpha = 1.0f;
+    return params;
+  }
+
+  float ReferenceImpl(float x, const xnn_unary_params& params) const override {
+    return std::signbit(x) ? params.elu.alpha * std::expm1(x) : x;
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp32:
+        return TolMixed(y_ref, 5.0e-6f, 1.0e-5f);
+      case xnn_datatype_fp16:
+        return TolMixed(y_ref, 1.0e-4f, 5.0e-3f);
+      default:
+        return 1;
+    }
+  }
+
+  Interval Domain(xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp16:
+        return {-9.0f, 9.0f};
+      default:
+        return {-20.0f, 20.0f};
+    }
+  }
+};
+
+struct GELU : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return x * 0.5f * (1.0f + std::erf(x * std::sqrt(2) / 2));
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    return TolMixed(y_ref, 10 * std::numeric_limits<float>::epsilon(),
+                    5 * std::numeric_limits<float>::epsilon());
+  }
+
+  Interval Domain(xnn_datatype) const override { return {-10.0f, 10.0f}; }
+};
+
+struct HardSwish : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return (x / 6.0) * std::max(std::min(x + 3.0, 6.0), 0.0);
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp32:
+        return TolMixed(y_ref, 5.0e-6f, 1.0e-5f);
+      case xnn_datatype_fp16:
+        return TolMixed(y_ref, 1.0e-3f, 1.0e-2f);
+      case xnn_datatype_qint8:
+      case xnn_datatype_quint8:
+        return 1;
+      default:
+        XNN_UNREACHABLE;
+    }
+  }
+
+  Interval Domain(xnn_datatype) const override { return {-4.0f, 4.0f}; }
+};
+struct LeakyReLU : public UnaryOpInfo {
+  xnn_unary_params DefaultParams() const override {
+    xnn_unary_params params;
+    params.leaky_relu.negative_slope = 0.5f;
+    return params;
+  }
+
+  float ReferenceImpl(float x, const xnn_unary_params& params) const override {
+    return std::signbit(x) ? x * params.leaky_relu.negative_slope : x;
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp32:
+        return TolExact(y_ref);
+      case xnn_datatype_fp16:
+        return TolMixed(y_ref, 1.0e-4f, 1.0e-3f);
+      case xnn_datatype_qint8:
+      case xnn_datatype_quint8:
+        return 1;
+      default:
+        XNN_UNREACHABLE;
+    }
+  }
+};
+
+struct RoundToNearestEven : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::nearbyint(x);
+  }
+};
+
+struct RoundTowardsZero : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::trunc(x);
+  }
+};
+
+struct RoundUp : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::ceil(x);
+  }
+};
+
+struct RoundDown : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::floor(x);
+  }
+};
+
+struct Sigmoid : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    if (x > 100) {
+      return 1.0f;
+    } else if (x < -100) {
+      return 0.0f;
+    } else {
+      const double e = std::exp(static_cast<double>(x));
+      return e / (1.0 + e);
+    }
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp32:
+        return TolMixed(y_ref, 5.0e-6f, 1.0e-5f);
+      case xnn_datatype_fp16:
+        return TolMixed(y_ref, 1.0e-4f, 5.0e-3f);
+      default:
+        return TolExact(y_ref);
+    }
+  }
+
+  Interval Domain(xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp16:
+        return {-25.0f, 25.0f};
+      default:
+        return {-125.0f, 125.0f};
+    }
+  }
+};
+
+struct Square : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return x * x;
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp16:
+        return TolMixed(y_ref, 1.0e-4f, 5.0e-3f);
+      default:
+        return TolExact(y_ref);
+    }
+  }
+};
+
+struct SquareRoot : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::sqrt(x);
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp32:
+        return TolRelative(y_ref,
+                           2.5f * std::numeric_limits<float>::epsilon());
+      case xnn_datatype_fp16:
+        return TolMixed(y_ref, 1.0e-4f, 5.0e-3f);
+      default:
+        return TolExact(y_ref);
+    }
+  }
+
+  Interval Domain(xnn_datatype datatype) const override {
+    return Interval::Positive(datatype);
+  }
+};
+
+struct TanH : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::tanh(x);
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp32:
+        return TolRelative(
+            y_ref, 4.0f * std::numeric_limits<float>::epsilon());  // 4 ULP
+      case xnn_datatype_fp16:
+        return TolMixed(y_ref, /*abs_tol=*/1.0e-4f, /*rel_tol=*/5.0e-3f);
+      default:
+        return TolExact(y_ref);
+    }
+  }
+
+  Interval Domain(xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp16:
+        return {-5.0f, 5.0f};
+      default:
+        return {-10.0f, 10.0f};
+    }
+  }
+};
+struct ReciprocalSquareRoot : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return 1.0 / std::sqrt(x);
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    switch (datatype) {
+      case xnn_datatype_fp32:
+        return TolRelative(y_ref, 4 * std::numeric_limits<float>::epsilon());
+      case xnn_datatype_fp16:
+        return TolMixed(y_ref, 1.0e-4f, 5.0e-3f);
+      default:
+        return TolExact(y_ref);
+    }
+  }
+
+  Interval Domain(xnn_datatype datatype) const override {
+    return Interval::Positive(datatype);
+  }
+};
+
+struct Log : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::log(x);
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    return TolMixed(y_ref, 2 * std::numeric_limits<float>::epsilon(),
+                    6 * std::numeric_limits<float>::epsilon());
+  }
+
+  Interval Domain(xnn_datatype datatype) const override {
+    return {std::numeric_limits<float>::epsilon(), 1000.0f};
+  }
+};
+
+struct Exp : public UnaryOpInfo {
+  float ReferenceImpl(float x, const xnn_unary_params&) const override {
+    return std::exp(x);
+  }
+
+  float Tolerance(float y_ref, xnn_datatype datatype) const override {
+    return TolMixed(y_ref, 2 * std::numeric_limits<float>::epsilon(),
+                    6 * std::numeric_limits<float>::epsilon());
+  }
+  Interval Domain(xnn_datatype) const override { return {-10.0f, 10.0f}; }
+};
+
+const UnaryOpInfo* GetUnaryOpInfo(xnn_unary_operator op);
+
+// Generate random data in the given domain, where the domain is given as
+// unquantized values.
+template <typename Rng, typename T>
+void FillRandom(Rng& rng, T* x, size_t n, const Interval& domain,
+                const xnn_quantization_params& quantization = {0, 1.0f}) {
+  float min = domain.min;
+  float max = domain.max;
+  min = min * quantization.scale + quantization.zero_point;
+  max = max * quantization.scale + quantization.zero_point;
+  min = std::max(min, static_cast<float>(std::numeric_limits<T>::lowest()));
+  max = std::min(max, static_cast<float>(std::numeric_limits<T>::max()));
+  min = std::max(min, -1e6f);
+  max = std::min(max, 1e6f);
+
+  std::uniform_real_distribution<float> dist(min, max);
+  for (size_t i = 0; i < n; ++i) {
+    x[i] = static_cast<T>(dist(rng));
+  }
+}
+
+// Compute the result of a unary operator using the reference implementation.
+template <typename In, typename Out, typename UnaryOp>
+void UnaryReferenceImpl(
+    const In* x, size_t n, Out* y, const UnaryOp& op_info,
+    const xnn_quantization_params& input_quantization = {0, 1.0f},
+    const xnn_quantization_params& output_quantization = {0, 1.0f},
+    const xnn_unary_params& params = xnn_unary_params()) {
+  for (size_t i = 0; i < n; i++) {
+    float x_i = static_cast<float>(x[i]);
+    if (std::is_integral<In>::value) {
+      x_i = (x_i - input_quantization.zero_point) * input_quantization.scale;
+    }
+    float y_i = op_info.ReferenceImpl(x_i, params);
+    if (std::is_integral<Out>::value) {
+      y_i = y_i / output_quantization.scale + output_quantization.zero_point;
+      y_i = std::clamp(y_i,
+                       static_cast<float>(std::numeric_limits<Out>::min()),
+                       static_cast<float>(std::numeric_limits<Out>::max()));
+      y[i] = static_cast<Out>(std::lrint(y_i));
+    } else {
+      y[i] = y_i;
+    }
+  }
+}
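+// Illustrative usage of the helpers above (a hypothetical test body, not part
+// of this header), combining an operator description with FillRandom and the
+// reference evaluator:
+//
+//   Abs op;
+//   std::vector<float> x(256), y(256);
+//   FillRandom(rng, x.data(), x.size(), op.Domain(xnn_datatype_fp32));
+//   UnaryReferenceImpl(x.data(), x.size(), y.data(), op);
+//   // Each y[i] is the value an fp32 kernel under test must match to within
+//   // op.Tolerance(y[i], xnn_datatype_fp32).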
+#endif  // THIRD_PARTY_XNNPACK_TEST_UNARY_OPS_H_
diff --git a/test/unary.cc b/test/unary.cc
new file mode 100644
index 000000000000..c71265bba6b8
--- /dev/null
+++ b/test/unary.cc
@@ -0,0 +1,251 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <random>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "xnnpack.h"
+#include "xnnpack/buffer.h"
+#include "xnnpack/log.h"
+#include "xnnpack/math.h"
+#include "xnnpack/operator.h"
+#include "xnnpack/subgraph.h"
+#include "replicable_random_device.h"
+#include "unary-ops.h"
+
+struct Param {
+  using UnaryT = std::tuple<xnn_unary_operator, xnn_datatype>;
+  using ConvertT = std::tuple<xnn_unary_operator, xnn_datatype, xnn_datatype>;
+  explicit Param(UnaryT p)
+      : unary_operator(std::get<0>(p)),
+        input_datatype(std::get<1>(p)),
+        output_datatype(std::get<1>(p)) {}
+  explicit Param(ConvertT p)
+      : unary_operator(std::get<0>(p)),
+        input_datatype(std::get<1>(p)),
+        output_datatype(std::get<2>(p)) {}
+
+  std::string Name() const {
+    std::stringstream sstr;
+    sstr << xnn_unary_operator_to_string(unary_operator) << "_"
+         << xnn_datatype_to_string(input_datatype);
+    if (input_datatype != output_datatype) {
+      sstr << "_" << xnn_datatype_to_string(output_datatype);
+    }
+    std::string s = sstr.str();
+    // Test names must be alphanumeric with no spaces.
+    std::replace(s.begin(), s.end(), ' ', '_');
+    std::replace(s.begin(), s.end(), '(', '_');
+    std::replace(s.begin(), s.end(), ')', '_');
+    return s;
+  }
+
+  xnn_unary_operator unary_operator;
+  xnn_datatype input_datatype;
+  xnn_datatype output_datatype;
+};
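+// For example (the exact strings depend on xnn_unary_operator_to_string and
+// xnn_datatype_to_string, so these names are illustrative): a Param over
+// (xnn_unary_elu, xnn_datatype_fp32) would be named "elu_fp32", and a convert
+// case from fp32 to qint8 would be named "convert_fp32_qint8".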
+class UnaryTest : public testing::TestWithParam<Param> {
+ public:
+  xnnpack::ReplicableRandomDevice rng_;
+};
+
+TEST_P(UnaryTest, matches_operator_api) {
+  const xnn_unary_operator unary_operator = GetParam().unary_operator;
+  const xnn_datatype input_datatype = GetParam().input_datatype;
+  const xnn_datatype output_datatype = GetParam().output_datatype;
+
+  const size_t sizeof_input = xnnpack::datatype_size(input_datatype);
+  const size_t sizeof_output = xnnpack::datatype_size(output_datatype);
+
+  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
+
+  std::uniform_int_distribution<> rank_dist(0, XNN_MAX_TENSOR_DIMS);
+  std::uniform_int_distribution<> dim_dist(1, 10);
+  std::vector<size_t> dims(rank_dist(rng_));
+  std::generate(dims.begin(), dims.end(), [&]() { return dim_dist(rng_); });
+
+  size_t size = std::accumulate(dims.begin(), dims.end(), size_t{1},
+                                std::multiplies<size_t>());
+  size_t channels = dims.empty() ? 1 : dims.back();
+  size_t batch_size = size / channels;
+
+  xnnpack::Buffer<char> input(size * sizeof_input + XNN_EXTRA_BYTES);
+  xnnpack::Buffer<char> subgraph_output(size * sizeof_output);
+  xnnpack::Buffer<char> operator_output(size * sizeof_output);
+
+  const UnaryOpInfo* op_info = GetUnaryOpInfo(unary_operator);
+  xnn_unary_params params = op_info->DefaultParams();
+  const xnn_quantization_params input_quantization =
+      op_info->InputQuantizationParams(input_datatype);
+  const xnn_quantization_params output_quantization =
+      op_info->OutputQuantizationParams(output_datatype);
+
+  // Call operator API.
+  const xnn_status status = xnn_run_unary_elementwise_nc(
+      unary_operator, input_datatype, output_datatype, &params,
+      &input_quantization, &output_quantization, /*flags=*/0, batch_size,
+      channels, channels, channels, /*thread_pool=*/nullptr, input.data(),
+      operator_output.data());
+  if (status == xnn_status_unsupported_parameter) {
+    GTEST_SKIP();
+  }
+  ASSERT_EQ(xnn_status_success, status);
+
+  // Call subgraph API.
+  xnn_subgraph_t subgraph = nullptr;
+  ASSERT_EQ(xnn_status_success,
+            xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0,
+                                &subgraph));
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
+      subgraph, xnn_delete_subgraph);
+  uint32_t input_id = XNN_INVALID_VALUE_ID;
+  if (xnnpack::is_quantized(input_datatype)) {
+    ASSERT_EQ(xnn_status_success,
+              xnn_define_quantized_tensor_value(
+                  subgraph, input_datatype, input_quantization.zero_point,
+                  input_quantization.scale, dims.size(), dims.data(), nullptr,
+                  /*external_id=*/0,
+                  /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
+  } else {
+    ASSERT_EQ(xnn_status_success,
+              xnn_define_tensor_value(subgraph, input_datatype, dims.size(),
+                                      dims.data(), nullptr, /*external_id=*/0,
+                                      /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
+                                      &input_id));
+  }
+  ASSERT_NE(input_id, XNN_INVALID_VALUE_ID);
+
+  uint32_t output_id = XNN_INVALID_VALUE_ID;
+  if (xnnpack::is_quantized(output_datatype)) {
+    ASSERT_EQ(xnn_status_success,
+              xnn_define_quantized_tensor_value(
+                  subgraph, output_datatype, output_quantization.zero_point,
+                  output_quantization.scale, dims.size(), dims.data(), nullptr,
+                  /*external_id=*/1,
+                  /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
+  } else {
+    ASSERT_EQ(xnn_status_success,
+              xnn_define_tensor_value(subgraph, output_datatype, dims.size(),
+                                      dims.data(), nullptr, /*external_id=*/1,
+                                      /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
+                                      &output_id));
+  }
+  ASSERT_NE(output_id, XNN_INVALID_VALUE_ID);
+
+  xnn_runtime_t runtime = nullptr;
+  ASSERT_EQ(xnn_status_success,
+            xnn_define_unary(subgraph, unary_operator, &params, input_id,
+                             output_id, /*flags=*/0));
+  ASSERT_EQ(xnn_status_success,
+            xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0,
+                                  &runtime));
+  ASSERT_NE(nullptr, runtime);
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime, xnn_delete_runtime);
+  std::array<xnn_external_value, 2> external = {
+      xnn_external_value{input_id, input.data()},
+      xnn_external_value{output_id, subgraph_output.data()}};
+  ASSERT_EQ(xnn_status_success,
+            xnn_setup_runtime(runtime, external.size(), external.data()));
+  ASSERT_EQ(xnn_status_success, xnn_invoke_runtime(runtime));
+
+  ASSERT_EQ(subgraph_output, operator_output);
+}
+
+xnn_unary_operator all_unary_ops[] = {
+    xnn_unary_clamp,
+    xnn_unary_abs,
+    xnn_unary_bankers_rounding,
+    xnn_unary_ceiling,
+    xnn_unary_elu,
+    xnn_unary_exp,
+    xnn_unary_floor,
+    xnn_unary_gelu,
+    xnn_unary_hardswish,
+    xnn_unary_leaky_relu,
+    xnn_unary_log,
+    xnn_unary_negate,
+    xnn_unary_sigmoid,
+    xnn_unary_square,
+    xnn_unary_square_root,
+    xnn_unary_reciprocal_square_root,
+    xnn_unary_tanh,
+};
+
+xnn_datatype all_datatypes[] = {
+    xnn_datatype_quint8, xnn_datatype_qint8, xnn_datatype_fp16,
+    xnn_datatype_fp32,   xnn_datatype_int32,
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    UnaryTest, UnaryTest,
+    testing::ConvertGenerator<Param::UnaryT>(testing::Combine(
+        testing::ValuesIn(all_unary_ops), testing::ValuesIn(all_datatypes))),
+    [](const auto& info) { return info.param.Name(); });
+
+INSTANTIATE_TEST_SUITE_P(
+    ConvertTest, UnaryTest,
+    testing::ConvertGenerator<Param::ConvertT>(testing::Combine(
+        testing::Values(xnn_unary_convert), testing::ValuesIn(all_datatypes),
+        testing::ValuesIn(all_datatypes))),
+    [](const auto& info) { return info.param.Name(); });
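+// A single generated case can be run with the usual GoogleTest filter, e.g.
+// (test binary name and exact case name are illustrative, per Param::Name
+// above):
+//   ./unary-test --gtest_filter='UnaryTest/UnaryTest.matches_operator_api/elu_fp32'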
+
+TEST(AbsTest, reshape) {
+  ASSERT_EQ(xnn_status_success, xnn_initialize(/*allocator=*/nullptr));
+
+  xnn_subgraph_t subgraph = nullptr;
+  ASSERT_EQ(xnn_status_success,
+            xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0,
+                                &subgraph));
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
+      subgraph, xnn_delete_subgraph);
+
+  std::vector<size_t> dims{2, 3, 4};
+  uint32_t input_id = XNN_INVALID_VALUE_ID;
+  ASSERT_EQ(xnn_status_success,
+            xnn_define_tensor_value(
+                subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr,
+                0, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
+  ASSERT_NE(input_id, XNN_INVALID_VALUE_ID);
+
+  uint32_t output_id = XNN_INVALID_VALUE_ID;
+  ASSERT_EQ(xnn_status_success,
+            xnn_define_tensor_value(
+                subgraph, xnn_datatype_fp32, dims.size(), dims.data(), nullptr,
+                1, /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &output_id));
+  ASSERT_NE(output_id, XNN_INVALID_VALUE_ID);
+
+  ASSERT_EQ(xnn_status_success,
+            xnn_define_unary(subgraph, xnn_unary_abs, /*params=*/nullptr,
+                             input_id, output_id, /*flags=*/0));
+
+  ASSERT_EQ(subgraph->num_nodes, 1);
+  struct xnn_node* node = &subgraph->nodes[0];
+  ASSERT_EQ(node->type, xnn_node_type_abs);
+  ASSERT_EQ(node->compute_type, xnn_compute_type_fp32);
+  ASSERT_EQ(node->num_inputs, 1);
+  ASSERT_EQ(node->inputs[0], input_id);
+  ASSERT_EQ(node->num_outputs, 1);
+  ASSERT_EQ(node->outputs[0], output_id);
+  ASSERT_EQ(node->flags, 0);
+
+  xnn_runtime_t runtime = nullptr;
+  ASSERT_EQ(xnn_status_success,
+            xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0,
+                                  &runtime));
+  ASSERT_NE(nullptr, runtime);
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime, xnn_delete_runtime);
+
+  ASSERT_EQ(node->reshape(&runtime->opdata[0], subgraph->values,
+                          subgraph->num_values, /*threadpool=*/nullptr),
+            xnn_status_success);
+
+  dims[0] = 7;
+  ASSERT_EQ(xnn_status_success,
+            xnn_reshape_external_value(runtime, 0, dims.size(), dims.data()));
+
+  ASSERT_EQ(node->reshape(&runtime->opdata[0], runtime->values,
+                          runtime->num_values, /*threadpool=*/nullptr),
+            xnn_status_reallocation_required);
+  const xnn_shape* output_shape = &runtime->values[node->outputs[0]].shape;
+  const size_t num_input_elements = std::accumulate(
+      dims.cbegin(), dims.cend(), size_t{1}, std::multiplies<size_t>());
+  ASSERT_EQ(output_shape->dim[0], dims[0]);
+  ASSERT_EQ(runtime->values[node->outputs[0]].size,
+            num_input_elements * sizeof(float));
+}
diff --git a/test/vcvt-microkernel-tester.cc b/test/vcvt-microkernel-tester.cc deleted file mode 100644 index d294192cf785..000000000000 --- a/test/vcvt-microkernel-tester.cc +++ /dev/null @@ -1,550 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "vcvt-microkernel-tester.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -void VCvtMicrokernelTester::Test( - xnn_f16_f32_vcvt_ukernel_fn vcvt, const void*) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-100.0f, 100.0f); - - xnnpack::Buffer input(batch_size() + - XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer output(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), - [&]() { return f32dist(rng); }); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(xnn_float16), input.data(), output.data(), nullptr); - - // Verify results.
- for (size_t i = 0; i < batch_size(); i++) { - ASSERT_EQ(float_as_uint32(output[i]), - float_as_uint32(input[i])) - << "at " << i << " / " << batch_size() << ", x[" << i << "] = 0x" - << std::hex << std::setw(4) << std::setfill('0') << input[i]; - } - } -} - -void VCvtMicrokernelTester::Test( - xnn_f32_f16_vcvt_ukernel_fn vcvt, const void*) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-100.0f, 100.0f); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer output(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(float), input.data(), output.data(), nullptr); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - ASSERT_EQ(output[i], xnn_float16(input[i])) - << "at " << i << " / " << batch_size() << ", x[" << i << "] = 0x" - << std::hex << std::setw(8) << std::setfill('0') - << float_as_uint32(input[i]) << " (" << input[i] << ")"; - } - } -} - -void VCvtMicrokernelTester::Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, - xnn_init_f16_qs8_cvt_params_fn init_params) { - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - - xnnpack::Buffer input_float(batch_size()); - xnnpack::Buffer input(batch_size() + - XNN_EXTRA_BYTES / sizeof(xnn_float16)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - const float scale_fp16 = - xnn_float16(scale()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input_float.begin(), input_float.end(), - [&]() { return f32dist(rng); }); - std::copy(input_float.begin(), input_float.end(), input.begin()); - - - struct xnn_f16_qs8_cvt_params params; - init_params(¶ms, scale(), output_zero_point()); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(xnn_float16), input.data(), output.data(), ¶ms); - - // Compute reference results - for (size_t i = 0; i < batch_size(); i++) { - float scaled_input = input[i] * scale_fp16; - scaled_input = std::min( - scaled_input, static_cast(std::numeric_limits::max() - - output_zero_point())); - scaled_input = std::max( - scaled_input, static_cast(std::numeric_limits::min() - - output_zero_point())); - output_ref[i] = static_cast( - std::lrintf(scaled_input) + static_cast(output_zero_point())); - } - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - EXPECT_NEAR(static_cast(output[i]), - static_cast(output_ref[i]), 1) - << "at " << i << " / " << batch_size() << ", x[" << i << "] = 0x" - << std::hex << std::setw(8) << std::setfill('0') - << float_as_uint32(input[i]) << " (" << input[i] << ")" << " INPUT " - << input[i] << " scale " << scale() << " zp " - << (int)output_zero_point(); - } - } -} - -void VCvtMicrokernelTester::Test( - xnn_f32_qs8_vcvt_ukernel_fn vcvt, - xnn_init_f32_qs8_cvt_params_fn init_params) const { - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - struct xnn_f32_qs8_cvt_params params; - init_params(¶ms, scale(), output_zero_point()); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(float), input.data(), output.data(), ¶ms); - - // Compute reference results - for (size_t i = 0; i < batch_size(); i++) { - float scaled_input = input[i] * scale(); - scaled_input = std::min( - scaled_input, static_cast(std::numeric_limits::max() - - output_zero_point())); - scaled_input = std::max( - scaled_input, static_cast(std::numeric_limits::min() - - output_zero_point())); - output_ref[i] = static_cast( - std::lrintf(scaled_input) + static_cast(output_zero_point())); - } - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(static_cast(output[i]), - static_cast(output_ref[i])) - << "at " << i << " / " << batch_size() << ", x[" << i << "] = 0x" - << std::hex << std::setw(8) << std::setfill('0') - << float_as_uint32(input[i]) << " (" << input[i] << ")"; - } - } -} - -void VCvtMicrokernelTester::Test( - xnn_f32_qu8_vcvt_ukernel_fn vcvt, - xnn_init_f32_qu8_cvt_params_fn init_params) const { - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(-1.0f, 1.0f); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(float)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - struct xnn_f32_qu8_cvt_params params; - init_params(¶ms, scale(), output_zero_point()); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(float), input.data(), output.data(), ¶ms); - - // Compute reference results - for (size_t i = 0; i < batch_size(); i++) { - float scaled_input = input[i] * scale(); - scaled_input = std::min( - scaled_input, static_cast(std::numeric_limits::max() - - output_zero_point())); - scaled_input = std::max( - scaled_input, static_cast(std::numeric_limits::min() - - output_zero_point())); - output_ref[i] = static_cast( - std::lrintf(scaled_input) + static_cast(output_zero_point())); - } - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(static_cast(output[i]), - static_cast(output_ref[i])) - << "at " << i << " / " << batch_size() << ", x[" << i << "] = 0x" - << std::hex << std::setw(8) << std::setfill('0') - << float_as_uint32(input[i]) << " (" << input[i] << ")"; - } - } -} - -void VCvtMicrokernelTester::Test( - xnn_s32_f32_vcvt_ukernel_fn vcvt, - xnn_init_s32_f32_cvt_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i32dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(int32_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i32dist(rng); }); - - struct xnn_s32_f32_cvt_params params; - init_params(¶ms, static_cast(input_zero_point())); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(int32_t), input.data(), output.data(), ¶ms); - - // Compute reference results - for (size_t i = 0; i < batch_size(); i++) { - const int32_t zero_point = static_cast(input_zero_point()); - output_ref[i] = static_cast(input[i] - zero_point); - } - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - EXPECT_NEAR(output[i], output_ref[i], std::abs(output_ref[i] * 1e-6f)) - << "at " << i << " / " << batch_size() << ", input = " << input[i]; - } - } -} - -void VCvtMicrokernelTester::Test( - xnn_u32_f32_vcvt_ukernel_fn vcvt, - xnn_init_u32_f32_cvt_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u32dist( - std::numeric_limits::min(), - std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(uint32_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u32dist(rng); }); - - struct xnn_u32_f32_cvt_params params; - init_params(¶ms, static_cast(input_zero_point())); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(uint32_t), input.data(), output.data(), ¶ms); - - // Compute reference results - for (size_t i = 0; i < batch_size(); i++) { - const int64_t zero_point = static_cast(input_zero_point()); - output_ref[i] = static_cast(static_cast(input[i]) - zero_point); - } - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - EXPECT_NEAR(output[i], output_ref[i], std::abs(output_ref[i] * 1e-6f)) - << "at " << i << " / " << batch_size() << ", input = " << input[i]; - } - } -} - -void VCvtMicrokernelTester::Test(xnn_qs8_vcvt_ukernel_fn vcvt, - xnn_init_qs8_cvt_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - struct xnn_qs8_cvt_params params; - init_params(¶ms, scale(), input_zero_point(), output_zero_point()); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(int8_t), input.data(), output.data(), ¶ms); - - // Compute reference results - const int32_t multiplier = (int32_t)lrintf(-256.0f * scale()); - for (size_t i = 0; i < batch_size(); i++) { - const int32_t input_value = (input_zero_point() - input[i]) * 128; - int32_t output_value = - math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + - output_zero_point(); - output_value = - std::min(output_value, std::numeric_limits::max()); - output_value = - std::max(output_value, std::numeric_limits::min()); - output_ref[i] = static_cast(output_value); - } - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(static_cast(output[i]), - static_cast(output_ref[i])) - << "at " << i << " / " << batch_size() << ", x[" << i - << "] = " << static_cast(input[i]); - } - } -} - -void VCvtMicrokernelTester::Test( - xnn_qs16_qs8_vcvt_ukernel_fn vcvt, - xnn_init_qs16_qs8_cvt_params_fn init_params) const { - ASSERT_EQ(input_zero_point(), 0); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i16dist; - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(int16_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i16dist(rng); }); - - struct xnn_qs16_qs8_cvt_params params; - init_params(¶ms, scale(), output_zero_point()); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(int16_t), input.data(), output.data(), ¶ms); - - // Compute reference results - const int64_t multiplier = std::llrintf(65536.0f * scale()); - for (size_t i = 0; i < batch_size(); i++) { - const int64_t input_value = input[i]; - int32_t output_value = - static_cast( - math_asr_s64(input_value * multiplier + INT64_C(0x8000), 16)) + - output_zero_point(); - output_value = - std::min(output_value, std::numeric_limits::max()); - output_value = - std::max(output_value, std::numeric_limits::min()); - output_ref[i] = static_cast(output_value); - } - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(static_cast(output[i]), - static_cast(output_ref[i])) - << "at " << i << " / " << batch_size() << ", x[" << i - << "] = " << input[i] << " * scale " << scale() << " = " - << static_cast(output_ref[i]); - } - } -} - -void VCvtMicrokernelTester::Test( - xnn_qs8_f16_vcvt_ukernel_fn vcvt, - xnn_init_qs8_f16_cvt_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - struct xnn_qs8_f16_cvt_params params; - init_params(¶ms, scale(), - input_zero_point()); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(int8_t), input.data(), output.data(), ¶ms); - - // Compute reference results - for (size_t i = 0; i < batch_size(); i++) { - output_ref[i] = xnn_float16( - static_cast(static_cast(input[i]) - - input_zero_point()) * - scale()); - } - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(output_ref[i], output[i]) - << "at " << i << " / " << batch_size() << ", x[" << i - << "] = " << static_cast(input[i]); - } - } -} - -void VCvtMicrokernelTester::Test( - xnn_qs8_f32_vcvt_ukernel_fn vcvt, - xnn_init_qs8_f32_cvt_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - struct xnn_qs8_f32_cvt_params params; - init_params(¶ms, scale(), input_zero_point()); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(int8_t), input.data(), output.data(), ¶ms); - - // Compute reference results - for (size_t i = 0; i < batch_size(); i++) { - output_ref[i] = static_cast(static_cast(input[i]) - - input_zero_point()) * - scale(); - } - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(output[i], output_ref[i]) - << "at " << i << " / " << batch_size() << ", x[" << i - << "] = " << static_cast(input[i]); - } - } -} - -void VCvtMicrokernelTester::Test(xnn_qu8_vcvt_ukernel_fn vcvt, - xnn_init_qu8_cvt_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - struct xnn_qu8_cvt_params params; - init_params(¶ms, scale(), input_zero_point(), output_zero_point()); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(uint8_t), input.data(), output.data(), ¶ms); - - // Compute reference results - const int32_t multiplier = (int32_t)lrintf(-256.0f * scale()); - for (size_t i = 0; i < batch_size(); i++) { - const int32_t input_value = (input_zero_point() - input[i]) * 128; - int32_t output_value = - math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + - output_zero_point(); - output_value = - std::min(output_value, std::numeric_limits::max()); - output_value = - std::max(output_value, std::numeric_limits::min()); - output_ref[i] = static_cast(output_value); - } - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(static_cast(output[i]), - static_cast(output_ref[i])) - << "at " << i << " / " << batch_size() << ", x[" << i - << "] = " << static_cast(input[i]); - } - } -} - -void VCvtMicrokernelTester::Test( - xnn_qu8_f32_vcvt_ukernel_fn vcvt, - xnn_init_qu8_f32_cvt_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - struct xnn_qu8_f32_cvt_params params; - init_params(¶ms, scale(), input_zero_point()); - - // Call optimized micro-kernel. - vcvt(batch_size() * sizeof(uint8_t), input.data(), output.data(), ¶ms); - - // Compute reference results - for (size_t i = 0; i < batch_size(); i++) { - output_ref[i] = static_cast(static_cast(input[i]) - - input_zero_point()) * - scale(); - } - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(output[i], output_ref[i]) - << "at " << i << " / " << batch_size() << ", x[" << i - << "] = " << static_cast(input[i]); - } - } -} diff --git a/test/vcvt-microkernel-tester.h b/test/vcvt-microkernel-tester.h deleted file mode 100644 index 927f44299570..000000000000 --- a/test/vcvt-microkernel-tester.h +++ /dev/null @@ -1,256 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams-init.h" - -class VCvtMicrokernelTester { - public: - VCvtMicrokernelTester& batch_size(size_t batch_size) { - assert(batch_size != 0); - this->batch_size_ = batch_size; - return *this; - } - - size_t batch_size() const { return this->batch_size_; } - - VCvtMicrokernelTester& scale(float scale) { - assert(scale > 0.0f); - assert(std::isnormal(scale)); - this->scale_ = scale; - return *this; - } - - float scale() const { return this->scale_; } - - VCvtMicrokernelTester& input_zero_point(int16_t input_zero_point) { - this->input_zero_point_ = input_zero_point; - return *this; - } - - int16_t input_zero_point() const { return this->input_zero_point_; } - - VCvtMicrokernelTester& output_zero_point(int16_t output_zero_point) { - this->output_zero_point_ = output_zero_point; - return *this; - } - - int16_t output_zero_point() const { return this->output_zero_point_; } - - VCvtMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { return this->iterations_; } - - void Test(xnn_f16_f32_vcvt_ukernel_fn vcvt, const void* = nullptr) const; - - void Test(xnn_f32_f16_vcvt_ukernel_fn vcvt, const void* = nullptr) const; - - void Test(xnn_f16_qs8_vcvt_ukernel_fn vcvt, - xnn_init_f16_qs8_cvt_params_fn init_params); - - void Test(xnn_f32_qs8_vcvt_ukernel_fn vcvt, - xnn_init_f32_qs8_cvt_params_fn init_params) const; - - void Test(xnn_f32_qu8_vcvt_ukernel_fn vcvt, - xnn_init_f32_qu8_cvt_params_fn init_params) const; - - void Test(xnn_s32_f32_vcvt_ukernel_fn vcvt, - xnn_init_s32_f32_cvt_params_fn init_params) const; - - void Test(xnn_qs8_vcvt_ukernel_fn vcvt, - xnn_init_qs8_cvt_params_fn init_params) const; - - void Test(xnn_qs16_qs8_vcvt_ukernel_fn vcvt, - xnn_init_qs16_qs8_cvt_params_fn init_params) const; - - void Test(xnn_qs8_f16_vcvt_ukernel_fn vcvt, - xnn_init_qs8_f16_cvt_params_fn init_params) const; - - void Test(xnn_qs8_f32_vcvt_ukernel_fn vcvt, - xnn_init_qs8_f32_cvt_params_fn init_params) const; - - void Test(xnn_qu8_vcvt_ukernel_fn vcvt, - xnn_init_qu8_cvt_params_fn init_params) const; - - void Test(xnn_qu8_f32_vcvt_ukernel_fn vcvt, - xnn_init_qu8_f32_cvt_params_fn init_params) const; - - void Test(xnn_u32_f32_vcvt_ukernel_fn vcvt, - xnn_init_u32_f32_cvt_params_fn init_params) const; - - private: - float scale_ = 1.75f; - int16_t input_zero_point_ = 0; - int16_t output_zero_point_ = 5; - size_t batch_size_ = 1; - size_t iterations_ = 15; -}; - -template -VCvtMicrokernelTester make_vcvt_tester() { - if (std::is_integral::value) { - return VCvtMicrokernelTester() - .output_zero_point(std::numeric_limits::min() / 2 + - std::numeric_limits::max() / 2 + 1); - } else { - return VCvtMicrokernelTester(); - } -} - -#define 
XNN_TEST_CVT_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...) \ - TEST(ukernel, batch_eq) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - make_vcvt_tester() \ - .batch_size(batch_tile* batch_scale) \ - .Test(__VA_ARGS__); \ - } - -#define XNN_TEST_CVT_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...) \ - TEST(ukernel, batch_div) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - if (batch_tile == 1 && batch_scale == 1) return; \ - for (size_t batch_size = batch_tile * batch_scale * 2; \ - batch_size < batch_tile * batch_scale * 10; \ - batch_size += batch_tile * batch_scale) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_CVT_BATCH_LT(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...) \ - TEST(ukernel, batch_lt) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - if (batch_tile == 1 && batch_scale == 1) return; \ - for (size_t batch_size = batch_scale; \ - batch_size < batch_tile * batch_scale; batch_size++) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_CVT_BATCH_GT(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...) \ - TEST(ukernel, batch_gt) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile == 1 ? 10 : batch_tile * 2; \ - const size_t batch_step = batch_scale == 1 ? 1 : batch_tile * 2; \ - for (size_t batch_size = batch_tile + 1; batch_size < batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_CVT_SCALE(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...) \ - TEST(ukernel, scale) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale * 5; \ - const size_t batch_step = std::max(2, batch_end / 8) - 1; \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .scale(50) \ - .Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_CVT_INPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, \ - datatype_in, datatype_out, ...) \ - TEST(ukernel, input_zero_point) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale * 5; \ - const size_t batch_step = std::max(2, batch_end / 8) - 1; \ - for (int16_t input_zero_point = 0; input_zero_point < 5; \ - input_zero_point += 2) { \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .input_zero_point(input_zero_point) \ - .Test(__VA_ARGS__); \ - } \ - } \ - } - -#define XNN_TEST_CVT_OUTPUT_ZERO_POINT(ukernel, arch_flags, batch_tile, \ - datatype_in, datatype_out, ...) 
\ - TEST(ukernel, output_zero_point) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale * 5; \ - const size_t batch_step = std::max(2, batch_end / 8) - 1; \ - for (int16_t output_zero_point = 0; output_zero_point < 5; \ - output_zero_point += 2) { \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .output_zero_point(output_zero_point) \ - .Test(__VA_ARGS__); \ - } \ - } \ - } - -#define XNN_TEST_CVT_SATURATION(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...) \ - TEST(ukernel, saturation) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale * 5; \ - const size_t batch_step = std::max(2, batch_end / 8) - 1; \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .scale(500) \ - .Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_CVT_OVERFLOW(ukernel, arch_flags, batch_tile, datatype_in, \ - datatype_out, ...) \ - TEST(ukernel, overflow) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale * 5; \ - const size_t batch_step = std::max(2, batch_end / 8) - 1; \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - make_vcvt_tester() \ - .batch_size(batch_size) \ - .scale(4294967296.0f) \ - .Test(__VA_ARGS__); \ - } \ - } diff --git a/test/vhswish-microkernel-tester.h b/test/vhswish-microkernel-tester.h deleted file mode 100644 index d12b3476c083..000000000000 --- a/test/vhswish-microkernel-tester.h +++ /dev/null @@ -1,266 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -class VHSwishMicrokernelTester { - public: - VHSwishMicrokernelTester& batch_size(size_t batch_size) { - assert(batch_size != 0); - this->batch_size_ = batch_size; - return *this; - } - - size_t batch_size() const { - return this->batch_size_; - } - - VHSwishMicrokernelTester& input_scale(float input_scale) { - assert(input_scale > 0.0f); - assert(std::isnormal(input_scale)); - this->input_scale_ = input_scale; - return *this; - } - - float input_scale() const { - return this->input_scale_; - } - - VHSwishMicrokernelTester& input_zero_point(int16_t input_zero_point) { - this->input_zero_point_ = input_zero_point; - return *this; - } - - int16_t input_zero_point() const { - return this->input_zero_point_; - } - - VHSwishMicrokernelTester& output_scale(float output_scale) { - assert(output_scale > 0.0f); - assert(std::isnormal(output_scale)); - this->output_scale_ = output_scale; - return *this; - } - - float output_scale() const { - return this->output_scale_; - } - - VHSwishMicrokernelTester& output_zero_point(int16_t output_zero_point) { - this->output_zero_point_ = output_zero_point; - return *this; - } - - int16_t output_zero_point() const { - return this->output_zero_point_; - } - - VHSwishMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - VHSwishMicrokernelTester& inplace(bool inplace) { - this->inplace_ = inplace; - return *this; - } - - bool inplace() const { return this->inplace_; } - - void Test(xnn_qs8_vhswish_ukernel_fn vhswish, xnn_init_qs8_hswish_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - union xnn_qs8_hswish_params params; - init_params(¶ms, input_zero_point(), output_zero_point(), input_scale(), output_scale()); - - // Compute reference results - const int32_t input_scale_div = (int32_t) lrintf(256.0f * input_scale() / 6.0f); - const int32_t scale_ratio = (int32_t) lrintf(256.0f * input_scale() / output_scale()); - for (size_t i = 0; i < batch_size(); i++) { - const int32_t input_value = int32_t(uint32_t(input_zero_point() - input[i]) << 7); - int32_t in = input_value * input_scale_div; - in -= 16384; // subtract 0.5 in Q15 - in = std::min(in, 0); - in = std::max(in, -32768); - const int32_t out = math_asr_s32(input_value * scale_ratio + INT32_C(0x4000), 15); - int32_t output_value = math_asr_s32(in * out + INT32_C(0x4000), 15) + output_zero_point(); - output_value = std::min(output_value, std::numeric_limits::max()); - output_value = std::max(output_value, 
std::numeric_limits::min()); - output_ref[i] = static_cast(output_value); - } - - // Call optimized micro-kernel. - vhswish(batch_size() * sizeof(int8_t), input.data(), inplace() ? input.data() : output.data(), ¶ms); - - if (inplace()) { - std::copy_n(input.data(), batch_size(), output.data()); - } - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(int32_t(output[i]), int32_t(output_ref[i])) - << "at " << i << " / " << batch_size() - << ", x[" << i << "] = " << int32_t(input[i]); - } - } - } - - void Test(xnn_qu8_vhswish_ukernel_fn vhswish, xnn_init_qu8_hswish_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - union xnn_qu8_hswish_params params; - init_params(¶ms, input_zero_point(), output_zero_point(), input_scale(), output_scale()); - - // Compute reference results - const int32_t input_scale_div = (int32_t) lrintf(256.0f * input_scale() / 6.0f); - const int32_t scale_ratio = (int32_t) lrintf(256.0f * input_scale() / output_scale()); - for (size_t i = 0; i < batch_size(); i++) { - const int32_t input_value = int32_t(uint32_t(input_zero_point() - input[i]) << 7); - int32_t in = input_value * input_scale_div; - in -= 16384; // subtract 0.5 in Q15 - in = std::min(in, 0); - in = std::max(in, -32768); - const int32_t out = math_asr_s32(input_value * scale_ratio + INT32_C(0x4000), 15); - int32_t output_value = math_asr_s32(in * out + INT32_C(0x4000), 15) + output_zero_point(); - output_value = std::min(output_value, std::numeric_limits::max()); - output_value = std::max(output_value, std::numeric_limits::min()); - output_ref[i] = static_cast(output_value); - } - - // Call optimized micro-kernel. - vhswish(batch_size() * sizeof(uint8_t), input.data(), inplace() ? input.data() : output.data(), ¶ms); - - if (inplace()) { - std::copy_n(input.data(), batch_size(), output.data()); - } - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(int32_t(output[i]), int32_t(output_ref[i])) - << "at " << i << " / " << batch_size() - << ", x[" << i << "] = " << int32_t(input[i]); - } - } - } - - private: - float input_scale_ = 128.0f; - float output_scale_ = 128.0f; - int16_t input_zero_point_ = 1; - int16_t output_zero_point_ = 5; - size_t batch_size_ = 1; - size_t iterations_ = 15; - bool inplace_ = false; -}; - -#define XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_eq) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - VHSwishMicrokernelTester() \ - .batch_size(batch_tile * batch_scale) \ - .Test(__VA_ARGS__); \ - } - -#define XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, \ - ...) 
\ - TEST(ukernel, batch_div) { \ - if (batch_tile == 1) return; \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_step = batch_tile * batch_scale; \ - for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; \ - batch_size += batch_step) { \ - VHSwishMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_lt) { \ - if (batch_tile == 1) return; \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - for (size_t batch_size = 1; batch_size < batch_end; batch_size++) { \ - VHSwishMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_gt) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_step = batch_tile * batch_scale; \ - const size_t batch_end = batch_tile == 1 ? 10 : 2 * batch_step; \ - for (size_t batch_size = batch_step + 1; batch_size < batch_end; \ - batch_size++) { \ - VHSwishMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ...) \ - TEST(ukernel, inplace) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - const size_t batch_step = std::max(1, batch_tile - 1); \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - VHSwishMicrokernelTester() \ - .batch_size(batch_size) \ - .inplace(true) \ - .Test(__VA_ARGS__); \ - } \ - } diff --git a/test/vlrelu-microkernel-tester.h b/test/vlrelu-microkernel-tester.h deleted file mode 100644 index 84fea6220f54..000000000000 --- a/test/vlrelu-microkernel-tester.h +++ /dev/null @@ -1,267 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -class VLReLUMicrokernelTester { - public: - VLReLUMicrokernelTester& batch_size(size_t batch_size) { - assert(batch_size != 0); - this->batch_size_ = batch_size; - return *this; - } - - size_t batch_size() const { - return this->batch_size_; - } - - VLReLUMicrokernelTester& positive_scale(float positive_scale) { - assert(positive_scale > 0.0f); - assert(std::isnormal(positive_scale)); - this->positive_scale_ = positive_scale; - return *this; - } - - float positive_scale() const { - return this->positive_scale_; - } - - VLReLUMicrokernelTester& negative_scale(float negative_scale) { - assert(std::isnormal(negative_scale)); - this->negative_scale_ = negative_scale; - return *this; - } - - float negative_scale() const { - return this->negative_scale_; - } - - VLReLUMicrokernelTester& input_zero_point(int16_t input_zero_point) { - this->input_zero_point_ = input_zero_point; - return *this; - } - - int16_t input_zero_point() const { - return this->input_zero_point_; - } - - VLReLUMicrokernelTester& output_zero_point(int16_t output_zero_point) { - this->output_zero_point_ = output_zero_point; - return *this; - } - - int16_t output_zero_point() const { - return this->output_zero_point_; - } - - VLReLUMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test(xnn_qs8_vlrelu_ukernel_fn vlrelu, xnn_init_qs8_lrelu_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution i8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - struct xnn_qs8_lrelu_params params; - init_params(¶ms, positive_scale(), negative_scale(), input_zero_point(), output_zero_point()); - - // Call optimized micro-kernel. - vlrelu(batch_size() * sizeof(int8_t), input.data(), output.data(), ¶ms); - - // Compute reference results - const int32_t positive_multiplier = (int32_t) lrintf(-256.0f * positive_scale()); - const int32_t negative_multiplier = (int32_t) lrintf(-256.0f * negative_scale()); - for (size_t i = 0; i < batch_size(); i++) { - const int32_t input_value = (input_zero_point() - input[i]) * 128; - const int32_t multiplier = input_value <= 0 ? positive_multiplier : negative_multiplier; - int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); - output_value = std::min(output_value, std::numeric_limits::max()); - output_value = std::max(output_value, std::numeric_limits::min()); - output_ref[i] = static_cast(output_value); - } - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(int32_t(output[i]), int32_t(output_ref[i])) - << "at " << i << " / " << batch_size() - << ", x[" << i << "] = " << int32_t(input[i]); - } - } - } - - void Test(xnn_qu8_vlrelu_ukernel_fn vlrelu, xnn_init_qu8_lrelu_params_fn init_params) const { - ASSERT_GE(input_zero_point(), std::numeric_limits::min()); - ASSERT_LE(input_zero_point(), std::numeric_limits::max()); - ASSERT_GE(output_zero_point(), std::numeric_limits::min()); - ASSERT_LE(output_zero_point(), std::numeric_limits::max()); - - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t)); - xnnpack::Buffer output(batch_size()); - xnnpack::Buffer output_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - struct xnn_qu8_lrelu_params params; - init_params(¶ms, positive_scale(), negative_scale(), input_zero_point(), output_zero_point()); - - // Call optimized micro-kernel. - vlrelu(batch_size() * sizeof(uint8_t), input.data(), output.data(), ¶ms); - - // Compute reference results - const int32_t positive_multiplier = (int32_t) lrintf(-256.0f * positive_scale()); - const int32_t negative_multiplier = (int32_t) lrintf(-256.0f * negative_scale()); - for (size_t i = 0; i < batch_size(); i++) { - const int32_t input_value = (input_zero_point() - input[i]) * 128; - const int32_t multiplier = input_value <= 0 ? positive_multiplier : negative_multiplier; - int32_t output_value = math_asr_s32(input_value * multiplier + INT32_C(0x4000), 15) + output_zero_point(); - output_value = std::min(output_value, std::numeric_limits::max()); - output_value = std::max(output_value, std::numeric_limits::min()); - output_ref[i] = static_cast(output_value); - } - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(int32_t(output[i]), int32_t(output_ref[i])) - << "at " << i << " / " << batch_size() - << ", x[" << i << "] = " << int32_t(input[i]); - } - } - } - - private: - float positive_scale_ = 1.75f; - float negative_scale_ = 0.75f; - int16_t input_zero_point_ = 1; - int16_t output_zero_point_ = 5; - size_t batch_size_ = 1; - size_t iterations_ = 15; -}; - -// TODO(b/361780131): This could probably be rewritten as some kind of GTest -// instantiate thing instead of macros. -#define XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_eq) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - VLReLUMicrokernelTester() \ - .batch_size(batch_tile* batch_scale) \ - .Test(__VA_ARGS__); \ - } - -#define XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_div) { \ - if (batch_tile == 1) return; \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_step = batch_tile * batch_scale; \ - for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; \ - batch_size += batch_step) { \ - VLReLUMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, \ - ...) 
\ - TEST(ukernel, batch_lt) { \ - if (batch_tile == 1) return; \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - for (size_t batch_size = 1; batch_size < batch_end; batch_size++) { \ - VLReLUMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_gt) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_step = batch_tile * batch_scale; \ - const size_t batch_end = batch_tile == 1 ? 10 : 2 * batch_step; \ - for (size_t batch_size = batch_step + 1; batch_size < batch_end; \ - batch_size++) { \ - VLReLUMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ - } \ - } - -#define XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ...) - -#define XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ...) \ - TEST(ukernel, qmin) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - const size_t batch_step = \ - batch_scale == 1 ? std::max(1, batch_tile - 1) : batch_end - 1; \ - for (size_t qmin = 1; qmin < 255; qmin = xnnpack::NextPrime(qmin)) { \ - for (size_t batch_size = 1; batch_size <= 5 * batch_end; \ - batch_size += batch_step) { \ - VLReLUMicrokernelTester() \ - .batch_size(batch_size) \ - .qmin(qmin) \ - .Test(__VA_ARGS__); \ - } \ - } \ - } - -#define XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ...) \ - TEST(ukernel, qmax) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - const size_t batch_step = \ - batch_scale == 1 ? std::max(1, batch_tile - 1) : batch_end - 1; \ - for (size_t qmax = 1; qmax < 255; qmax = xnnpack::NextPrime(qmax)) { \ - for (size_t batch_size = 1; batch_size <= 5 * batch_end; \ - batch_size += batch_step) { \ - VLReLUMicrokernelTester() \ - .batch_size(batch_size) \ - .qmax(qmax) \ - .Test(__VA_ARGS__); \ - } \ - } \ - } diff --git a/test/vunary-microkernel-tester.cc b/test/vunary-microkernel-tester.cc deleted file mode 100644 index 6e7bd0ed7bd8..000000000000 --- a/test/vunary-microkernel-tester.cc +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
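[Editor's note: the deleted tester below pairs each microkernel with a scalar reference lambda and a tolerance functor. A compressed sketch of that checking pattern for hardswish, using the reference formula and the TolMixed(5.0e-6f, 1.0e-5f) bound that appear later in this file; `check_hswish` is a hypothetical helper name:]

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>

void check_hswish(const float* x, const float* y, size_t n) {
  auto ref = [](float v) {
    return (v / 6.0f) * std::max(std::min(v + 3.0f, 6.0f), 0.0f);
  };
  // TolMixed: an absolute floor plus a float-exact relative term.
  auto tol = [](float y_ref) {
    return std::max(5.0e-6f,
                    std::abs(y_ref) * (1.0f + 1.0e-5f) - std::abs(y_ref));
  };
  for (size_t i = 0; i < n; i++) {
    assert(std::abs(y[i] - ref(x[i])) <= tol(ref(x[i])));
  }
}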
- -#include "vunary-microkernel-tester.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/buffer.h" -#include "xnnpack/common.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "replicable_random_device.h" - -#ifndef M_SQRT1_2 -#define M_SQRT1_2 0.7071067811865475244 -#endif - -void VUnaryMicrokernelTester::Test(xnn_f32_vrelu_ukernel_fn vrelu, - xnn_init_f32_relu_params_fn, Default) const { - Test( - vrelu, [](xnn_f32_relu_params*) { return nullptr; }, - [](float x) { return std::max(x, 0.0f); }, TolExact, -1.0f, 1.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_bf16_vabs_ukernel_fn vabs, - xnn_init_bf16_default_params_fn init_params, - Abs) const { - Test( - vabs, InitParamsWrapper(init_params), [](float x) { return std::abs(x); }, - TolExact16, -1.0f, 1.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vabs_ukernel_fn vabs, - xnn_init_f16_default_params_fn init_params, - Abs) const { - Test( - vabs, InitParamsWrapper(init_params), [](float x) { return std::abs(x); }, - TolExact16, -1.0f, 1.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vabs_ukernel_fn vabs, - xnn_init_f32_default_params_fn init_params, - Abs) const { - Test( - vabs, InitParamsWrapper(init_params), [](float x) { return std::abs(x); }, - TolExact, -1.0f, 1.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vclamp_ukernel_fn vclamp, - xnn_init_f32_minmax_params_fn init_params, - Default) const { - Test( - vclamp, - InitParamsWrapper(init_params, static_cast(qmin()), - static_cast(qmax())), - [this](float x) { - return std::max(std::min(x, static_cast(qmax())), - static_cast(qmin())); - }, - TolExact, 0.0f, 255.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_velu_ukernel_fn velu, - xnn_init_f16_elu_params_fn init_params, - Default) const { - Test( - velu, - InitParamsWrapper(init_params, - xnn_float16(prescale()), - xnn_float16(alpha()), - xnn_float16(beta())), - [this](float x) { - return std::signbit(x) ? alpha() * std::expm1(x * prescale()) - : x * beta(); - }, - TolMixed(1.0e-4f, 5.0e-3f), -9.0f, 9.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_velu_ukernel_fn velu, - xnn_init_f32_elu_params_fn init_params, - Default) const { - Test( - velu, InitParamsWrapper(init_params, prescale(), alpha(), beta()), - [this](float x) { - return std::signbit(x) - ? 
alpha() * std::expm1(static_cast(x) * prescale()) - : static_cast(x) * beta(); - }, - TolMixed(5.0e-6f, 1.0e-5f), -20.0f, 20.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vgelu_ukernel_fn vgelu, - xnn_init_f32_default_params_fn init_params, - Gelu) const { - Test( - vgelu, InitParamsWrapper(init_params), - [](float x) { return x * 0.5f * (1.0f + std::erf(x * M_SQRT1_2)); }, - TolMixed(10 * std::numeric_limits::epsilon(), - 5 * std::numeric_limits::epsilon()), - -10.0f, 10.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vhswish_ukernel_fn vhswish, - xnn_init_f16_hswish_params_fn init_params, - Default) const { - Test( - vhswish, InitParamsWrapper(init_params), - [](float x) { - return (x / 6.0f) * std::max(std::min(x + 3.0f, 6.0f), 0.0f); - }, - TolMixed(1.0e-3f, 1.0e-2f), -4.0f, 4.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vhswish_ukernel_fn vhswish, - xnn_init_f32_hswish_params_fn init_params, - Default) const { - Test( - vhswish, InitParamsWrapper(init_params), - [](float x) { - return (x / 6.0f) * std::max(std::min(x + 3.0f, 6.0f), 0.0f); - }, - TolMixed(5.0e-6f, 1.0e-5f), -4.0f, 4.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vlrelu_ukernel_fn vlrelu, - xnn_init_f16_lrelu_params_fn init_params, - Default) const { - const xnn_float16 slope_as_half = slope(); - const float slope_as_float = slope_as_half; - Test( - vlrelu, InitParamsWrapper(init_params, slope_as_half), - [slope_as_float](float x) { - return std::signbit(x) ? x * slope_as_float : x; - }, - TolMixed(1.0e-4f, 1.0e-3f), -125.0f, 125.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vlrelu_ukernel_fn vlrelu, - xnn_init_f32_lrelu_params_fn init_params, - Default) const { - Test( - vlrelu, InitParamsWrapper(init_params, slope()), - [this](float x) { return std::signbit(x) ? 
x * slope() : x; }, TolExact, - -125.0f, 125.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vneg_ukernel_fn vneg, - xnn_init_f16_default_params_fn init_params, - Neg) const { - Test( - vneg, InitParamsWrapper(init_params), [](float x) { return -x; }, - TolExact16, -1.0f, 1.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vneg_ukernel_fn vneg, - xnn_init_f32_default_params_fn init_params, - Neg) const { - Test( - vneg, InitParamsWrapper(init_params), [](float x) { return -x; }, - TolExact, -1.0f, 1.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vround_ukernel_fn vrnd, - OpType op_type, - xnn_init_f16_rnd_params_fn init_params, - Default) const { - Test( - vrnd, InitParamsWrapper(init_params), - [op_type](float x) -> float { - switch (op_type) { - case OpType::RoundToNearestEven: - return std::nearbyint(x); - case OpType::RoundTowardsZero: - return std::trunc(x); - case OpType::RoundUp: - return std::ceil(x); - case OpType::RoundDown: - return std::floor(x); - default: - []() { GTEST_FAIL() << "Unexpected operation type"; }(); - return 0.0f; - } - }, - TolExact16, -5.0f, 5.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vround_ukernel_fn vrnd, - OpType op_type, - xnn_init_f32_rnd_params_fn init_params, - Default) const { - Test( - vrnd, InitParamsWrapper(init_params), - [op_type](float x) -> float { - switch (op_type) { - case OpType::RoundToNearestEven: - return std::nearbyint(x); - case OpType::RoundTowardsZero: - return std::trunc(x); - case OpType::RoundUp: - return std::ceil(x); - case OpType::RoundDown: - return std::floor(x); - default: - []() { GTEST_FAIL() << "Unexpected operation type"; }(); - return 0.0f; - } - }, - TolExact, -5.0f, 5.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vsigmoid_ukernel_fn vsigmoid, - xnn_init_f16_sigmoid_params_fn init_params, - Default) const { - Test( - vsigmoid, InitParamsWrapper(init_params), - [](float x) { - const float e = std::exp(x); - return e / (1.0f + e); - }, - TolMixed(1.0e-4f, 5.0e-3f), -25.0f, 25.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vsigmoid_ukernel_fn vsigmoid, - xnn_init_f32_sigmoid_params_fn init_params, - Default) const { - Test( - vsigmoid, InitParamsWrapper(init_params), - [](float x) { - const double e = std::exp(static_cast(x)); - return e / (1.0 + e); - }, - TolMixed(5.0e-6f, 1.0e-5f), -125.0f, 125.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vsqr_ukernel_fn vsqr, - xnn_init_f16_default_params_fn init_params, - Sqr) const { - Test( - vsqr, InitParamsWrapper(init_params), [](float x) { return x * x; }, - TolMixed(1.0e-4f, 5.0e-3f), -10.0f, 10.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vsqr_ukernel_fn vsqr, - xnn_init_f32_default_params_fn init_params, - Sqr) const { - Test( - vsqr, InitParamsWrapper(init_params), [](float x) { return x * x; }, - TolExact, -10.0f, 10.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vsqrt_ukernel_fn vsqrt, - xnn_init_f16_sqrt_params_fn init_params, - Default) const { - Test( - vsqrt, InitParamsWrapper(init_params), - [](float x) { return std::sqrt(x); }, TolMixed(1.0e-4f, 5.0e-3f), 0.001f, - 10.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vexp_ukernel_fn vexp, - xnn_init_f32_default_params_fn init_params, - Exp) const { - Test( - vexp, InitParamsWrapper(init_params), [](float x) { return std::exp(x); }, - TolMixed(2 * std::numeric_limits::epsilon(), - 6 * std::numeric_limits::epsilon()), - 0.0f, 10.0f); -} -void VUnaryMicrokernelTester::Test(xnn_f32_vlog_ukernel_fn vlog, - xnn_init_f32_default_params_fn init_params, 
- Log) const { - Test( - vlog, InitParamsWrapper(init_params), [](float x) { return std::log(x); }, - TolMixed(2 * std::numeric_limits::epsilon(), - 6 * std::numeric_limits::epsilon()), - 0.0f, 10.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vsqrt_ukernel_fn vsqrt, - xnn_init_f32_sqrt_params_fn init_params, - Default) const { - Test( - vsqrt, InitParamsWrapper(init_params), - [](float x) { return std::sqrt(x); }, - TolRelative(2.5f * std::numeric_limits::epsilon()), 0.0f, 10.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vrsqrt_ukernel_fn vrsqrt, - xnn_init_f16_rsqrt_params_fn init_params, - Default) const { - Test( - vrsqrt, InitParamsWrapper(init_params), - [](float x) { return 1.0f / std::sqrt(x); }, TolMixed(1.0e-4f, 5.0e-3f), - 1.0e-4f, 10.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vrsqrt_ukernel_fn vrsqrt, - xnn_init_f32_rsqrt_params_fn init_params, - Default) const { - Test( - vrsqrt, InitParamsWrapper(init_params), - [](float x) { return 1.0f / std::sqrt(x); }, - TolRelative(4 * std::numeric_limits::epsilon()), - std::numeric_limits::epsilon(), 10.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vtanh_ukernel_fn vtanh, - xnn_init_f16_tanh_params_fn init_params, - Default) const { - Test( - vtanh, InitParamsWrapper(init_params), - [](float x) { return std::tanh(x); }, - TolMixed(/*abs_tol=*/1.0e-4f, /*rel_tol=*/5.0e-3f), -5.0f, 5.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f32_vtanh_ukernel_fn vtanh, - xnn_init_f32_tanh_params_fn init_params, - Default) const { - Test( - vtanh, InitParamsWrapper(init_params), - [](float x) { return std::tanh(x); }, - TolRelative(4.0f * std::numeric_limits::epsilon()), // 4 ULP. - -10.0f, 10.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_f16_vclamp_ukernel_fn vclamp, - xnn_init_f16_minmax_params_fn init_params, - Default) const { - Test( - vclamp, - InitParamsWrapper(init_params, - xnn_float16(qmin()), - xnn_float16(qmax())), - [this](float x) { - return std::max(std::min(x, static_cast(qmax())), - static_cast(qmin())); - }, - TolExact16, 0.0f, 255.0f); -} - -void VUnaryMicrokernelTester::Test(xnn_s8_vclamp_ukernel_fn vclamp, - xnn_init_s8_minmax_params_fn init_params, - Default) const { - xnnpack::ReplicableRandomDevice rng; - - xnnpack::Buffer x(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t)); - xnnpack::Buffer y(batch_size() + - (inplace() ? XNN_EXTRA_BYTES / sizeof(int8_t) : 0)); - xnnpack::Buffer y_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - xnnpack::fill_uniform_random_bits(x.data(), x.size(), rng); - if (inplace()) { - std::copy(x.cbegin(), x.cend(), y.begin()); - } - const int8_t* x_data = inplace() ? y.data() : x.data(); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - y_ref[i] = - std::min(std::max(x_data[i], static_cast(qmin() - 0x80)), - static_cast(qmax() - 0x80)); - } - - // Prepare parameters. - struct xnn_s8_minmax_params params; - init_params(¶ms, static_cast(qmin() - 0x80), - static_cast(qmax() - 0x80)); - - // Call optimized micro-kernel. - vclamp(batch_size() * sizeof(int8_t), x_data, y.data(), ¶ms); - - // Verify results. 
- for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(static_cast(y_ref[i]), static_cast(y[i])) - << "at " << i << " / " << batch_size() << ", x[" << i - << "] = " << int32_t(x[i]); - } - } -} - -void VUnaryMicrokernelTester::Test(xnn_u8_vclamp_ukernel_fn vclamp, - xnn_init_u8_minmax_params_fn init_params, - Default) const { - xnnpack::ReplicableRandomDevice rng; - - xnnpack::Buffer x(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t)); - xnnpack::Buffer y(batch_size() + - (inplace() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0)); - xnnpack::Buffer y_ref(batch_size()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - xnnpack::fill_uniform_random_bits(x.data(), x.size(), rng); - if (inplace()) { - std::copy(x.cbegin(), x.cend(), y.begin()); - } - const uint8_t* x_data = inplace() ? y.data() : x.data(); - - // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - y_ref[i] = std::min(std::max(x_data[i], qmin()), qmax()); - } - - // Prepare parameters. - struct xnn_u8_minmax_params params; - init_params(¶ms, qmin(), qmax()); - - // Call optimized micro-kernel. - vclamp(batch_size() * sizeof(uint8_t), x_data, y.data(), ¶ms); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - EXPECT_EQ(static_cast(y_ref[i]), static_cast(y[i])) - << "at " << i << " / " << batch_size() << ", x[" << i - << "] = " << uint32_t(x[i]); - } - } -} diff --git a/test/vunary-microkernel-tester.h b/test/vunary-microkernel-tester.h index 6fa97a90f0b6..cbf2f0be0a58 100644 --- a/test/vunary-microkernel-tester.h +++ b/test/vunary-microkernel-tester.h @@ -23,25 +23,35 @@ #include "xnnpack/buffer.h" #include "replicable_random_device.h" -// These help disambiguate Test overloads below. -class Neg {}; -class Abs {}; -class Log {}; -class Sqr {}; -class Exp {}; -class Gelu {}; -class Default {}; +// Copyright 2019 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
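[Editor's note: the s8 clamp path above re-centers the tester's unsigned qmin/qmax knobs into the signed domain by subtracting 0x80. A minimal sketch of that reference; `s8_clamp_ref` is a hypothetical helper name:]

#include <algorithm>
#include <cstdint>

static int8_t s8_clamp_ref(int8_t x, uint8_t qmin, uint8_t qmax) {
  const int8_t lo = (int8_t)(qmin - 0x80);  // e.g. qmin = 0   -> lo = -128
  const int8_t hi = (int8_t)(qmax - 0x80);  // e.g. qmax = 255 -> hi = +127
  return std::min(std::max(x, lo), hi);
}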
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "xnnpack.h" +#include "xnnpack/buffer.h" +#include "xnnpack/common.h" +#include "xnnpack/isa-checks.h" +#include "xnnpack/math.h" +#include "xnnpack/microfnptr.h" +#include "xnnpack/microparams.h" +#include "replicable_random_device.h" +#include "unary-ops.h" class VUnaryMicrokernelTester { public: - enum class OpType { - ReLU, - RoundToNearestEven, - RoundTowardsZero, - RoundUp, - RoundDown, - }; - VUnaryMicrokernelTester& batch_size(size_t batch_size) { assert(batch_size != 0); this->batch_size_ = batch_size; @@ -57,55 +67,26 @@ class VUnaryMicrokernelTester { bool inplace() const { return this->inplace_; } - VUnaryMicrokernelTester& slope(float slope) { - this->slope_ = slope; - return *this; - } - - float slope() const { return this->slope_; } - - VUnaryMicrokernelTester& prescale(float prescale) { - this->prescale_ = prescale; - return *this; - } - - float prescale() const { return this->prescale_; } - - VUnaryMicrokernelTester& alpha(float alpha) { - this->alpha_ = alpha; + VUnaryMicrokernelTester& input_quantization( + const xnn_quantization_params& quantization) { + this->input_quantization_ = quantization; return *this; } - float alpha() const { return this->alpha_; } - - VUnaryMicrokernelTester& beta(float beta) { - this->beta_ = beta; - return *this; + const xnn_quantization_params& input_quantization() const { + return this->input_quantization_; } - float beta() const { return this->beta_; } - - VUnaryMicrokernelTester& shift(uint32_t shift) { - this->shift_ = shift; + VUnaryMicrokernelTester& output_quantization( + const xnn_quantization_params& quantization) { + this->output_quantization_ = quantization; return *this; } - uint32_t shift() const { return this->shift_; } - - VUnaryMicrokernelTester& qmin(uint8_t qmin) { - this->qmin_ = qmin; - return *this; - } - - uint8_t qmin() const { return this->qmin_; } - - VUnaryMicrokernelTester& qmax(uint8_t qmax) { - this->qmax_ = qmax; - return *this; + const xnn_quantization_params& output_quantization() const { + return this->output_quantization_; } - uint8_t qmax() const { return this->qmax_; } - VUnaryMicrokernelTester& iterations(size_t iterations) { this->iterations_ = iterations; return *this; @@ -113,165 +94,6 @@ class VUnaryMicrokernelTester { size_t iterations() const { return this->iterations_; } - // Wrapper that generate the `init_params` functions needed by `TestFP32` and - // `TestFP16` from the microkernel parameter initializer pointers, for - // different numbers of additional inputs. - template - static std::function - InitParamsWrapper(size_t (*init_params)(UKernelParamsType*, Ts...), - Ts... args) { - return [=](UKernelParamsType* params) -> UKernelParamsType* { - if (init_params != nullptr) { - init_params(params, args...); - return params; - } - return nullptr; - }; - } - - // Tolerance functions for the `TestFP32` and `TestFP16` template functions. - static float TolExact(float) { return 0.0f; } - static float TolExact16(float y_ref) { return std::abs(y_ref) * 5.0e-4f; } - static std::function TolRelative(float rel_tol) { - return [=](float y_ref) -> float { - // Note that `y_ref * rel_tol`, i.e. the expected absolute difference, - // may round differently than `y_ref * (1 + rel_tol) - y_ref`, i.e. the - // effective absolute difference computed in `float`s. We therefore use - // the latter form since it is the true difference between two `float`s - // within the given relative tolerance. 
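[Editor's note: the removed comment above explains why TolRelative computes the bound as |y_ref * (1 + rel_tol)| - |y_ref| instead of y_ref * rel_tol: the former is the gap between two actual floats, so it matches what the comparison in float arithmetic can observe. A tiny demonstration, with an arbitrary illustrative value:]

#include <cmath>
#include <cstdio>

int main() {
  const float y_ref = 3.1f;  // arbitrary value, for illustration only
  const float rel_tol = 1.0e-5f;
  const float naive = y_ref * rel_tol;
  const float exact = std::abs(y_ref * (1.0f + rel_tol)) - std::abs(y_ref);
  // The two may differ in the last ulp; `exact` is the representable gap.
  std::printf("naive = %a\nexact = %a\n", naive, exact);
  return 0;
}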
- return std::abs(y_ref * (1.0f + rel_tol)) - std::abs(y_ref); - }; - } - static std::function TolMixed(float abs_tol, float rel_tol) { - return [=](float y_ref) -> float { - return std::max(abs_tol, - std::abs(y_ref) * (1.0f + rel_tol) - std::abs(y_ref)); - }; - } - - void Test(xnn_f32_vrelu_ukernel_fn vrelu, - xnn_init_f32_relu_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_bf16_vabs_ukernel_fn vabs, - xnn_init_bf16_default_params_fn init_params = nullptr, - Abs = Abs()) const; - - void Test(xnn_f16_vabs_ukernel_fn vabs, - xnn_init_f16_default_params_fn init_params = nullptr, - Abs = Abs()) const; - - void Test(xnn_f32_vabs_ukernel_fn vabs, - xnn_init_f32_default_params_fn init_params = nullptr, - Abs = Abs()) const; - - void Test(xnn_f32_vclamp_ukernel_fn vclamp, - xnn_init_f32_minmax_params_fn init_params, - Default = Default()) const; - - void Test(xnn_f16_velu_ukernel_fn velu, - xnn_init_f16_elu_params_fn init_params, Default = Default()) const; - - void Test(xnn_f32_velu_ukernel_fn velu, - xnn_init_f32_elu_params_fn init_params, Default = Default()) const; - - void Test(xnn_f32_vexp_ukernel_fn vexp, - xnn_init_f32_default_params_fn init_params = nullptr, - Exp = Exp()) const; - - void Test(xnn_f32_vgelu_ukernel_fn vgelu, - xnn_init_f32_default_params_fn init_params = nullptr, - Gelu = Gelu()) const; - - void Test(xnn_f16_vhswish_ukernel_fn vhswish, - xnn_init_f16_hswish_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f32_vhswish_ukernel_fn vhswish, - xnn_init_f32_hswish_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f16_vlrelu_ukernel_fn vlrelu, - xnn_init_f16_lrelu_params_fn init_params, - Default = Default()) const; - - void Test(xnn_f32_vlrelu_ukernel_fn vlrelu, - xnn_init_f32_lrelu_params_fn init_params, - Default = Default()) const; - - void Test(xnn_f32_vlog_ukernel_fn vlog, - xnn_init_f32_default_params_fn init_params = nullptr, - Log = Log()) const; - - void Test(xnn_f16_vneg_ukernel_fn vneg, - xnn_init_f16_default_params_fn init_params = nullptr, - Neg = Neg()) const; - - void Test(xnn_f32_vneg_ukernel_fn vneg, - xnn_init_f32_default_params_fn init_params = nullptr, - Neg = Neg()) const; - - void Test(xnn_f16_vround_ukernel_fn vrnd, OpType op_type, - xnn_init_f16_rnd_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f32_vround_ukernel_fn vrnd, OpType op_type, - xnn_init_f32_rnd_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f16_vsigmoid_ukernel_fn vsigmoid, - xnn_init_f16_sigmoid_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f32_vsigmoid_ukernel_fn vsigmoid, - xnn_init_f32_sigmoid_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f16_vsqr_ukernel_fn vsqr, - xnn_init_f16_default_params_fn init_params = nullptr, - Sqr = Sqr()) const; - - void Test(xnn_f32_vsqr_ukernel_fn vsqr, - xnn_init_f32_default_params_fn init_params = nullptr, - Sqr = Sqr()) const; - - void Test(xnn_f16_vsqrt_ukernel_fn vsqrt, - xnn_init_f16_sqrt_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f32_vsqrt_ukernel_fn vsqrt, - xnn_init_f32_sqrt_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f16_vrsqrt_ukernel_fn vrsqrt, - xnn_init_f16_rsqrt_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f32_vrsqrt_ukernel_fn vrsqrt, - xnn_init_f32_rsqrt_params_fn init_params = nullptr, - Default = 
Default()) const; - - void Test(xnn_f16_vtanh_ukernel_fn vtanh, - xnn_init_f16_tanh_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f32_vtanh_ukernel_fn vtanh, - xnn_init_f32_tanh_params_fn init_params = nullptr, - Default = Default()) const; - - void Test(xnn_f16_vclamp_ukernel_fn vclamp, - xnn_init_f16_minmax_params_fn init_params, - Default = Default()) const; - - void Test(xnn_s8_vclamp_ukernel_fn vclamp, - xnn_init_s8_minmax_params_fn init_params, - Default = Default()) const; - - void Test(xnn_u8_vclamp_ukernel_fn vclamp, - xnn_init_u8_minmax_params_fn init_params, - Default = Default()) const; - - private: // Generic test function for `vunary` kernels. // // The function is templated on the type of the kernel parameters and takes @@ -281,160 +103,302 @@ class VUnaryMicrokernelTester { // `float`. // * `init_params`: A function that populates a given parameters data // structure or returns `nullptr` if there is no default initialization. - // * `ref`: A function that computes the reference result for an input `x`. - // * `tol`: A function that computes the absolute tolerance for a reference - // result `y_ref`. - // * `range_min`, `range_max`: Limits for the range of input values. - template - void Test(void (*ukernel)(size_t, const T*, T*, - const UKernelParamsType*), - InitParamsFunc init_params, ReferenceFunc ref, - ToleranceFunc tol, float range_min, float range_max) const { + template + void Test(void (*ukernel)(size_t, const In*, Out*, const UKernelParamsType*), + xnn_init_unary_uparams_fn init_params, + const xnn_unary_params& params) const { + TestInfo test_info; + auto domain = test_info.Domain(xnnpack::datatype_of()); + domain.min = std::max(domain.min, std::numeric_limits::lowest()); + domain.max = std::min(domain.max, std::numeric_limits::max()); xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution f32dist(range_min, range_max); - xnnpack::Buffer x(batch_size() + XNN_EXTRA_BYTES / sizeof(T)); - xnnpack::Buffer y(batch_size() + - (inplace() ? XNN_EXTRA_BYTES / sizeof(T) : 0)); - xnnpack::Buffer y_ref(batch_size()); + xnnpack::Buffer x(batch_size() + XNN_EXTRA_BYTES / sizeof(In)); + xnnpack::Buffer y(batch_size() + + (inplace() ? XNN_EXTRA_BYTES / sizeof(Out) : 0)); + xnnpack::Buffer y_ref(batch_size()); for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(x.begin(), x.end(), [&]() { return f32dist(rng); }); + FillRandom(rng, x.data(), batch_size(), domain, input_quantization_); if (inplace()) { std::copy(x.begin(), x.end(), y.begin()); } - const T* x_data = inplace() ? y.data() : x.data(); + const In* x_data = inplace() ? (const In*)y.data() : x.data(); // Compute reference results. - for (size_t i = 0; i < batch_size(); i++) { - y_ref[i] = ref(x_data[i]); - } + UnaryReferenceImpl(x_data, batch_size(), y_ref.data(), test_info, + input_quantization_, output_quantization_, params); // Initialize the params. - UKernelParamsType params; - const UKernelParamsType* params_ptr = init_params(¶ms); + xnn_unary_uparams uparams; + if (init_params) { + init_params(&uparams, ¶ms, &input_quantization_, + &output_quantization_); + } // Call optimized micro-kernel. - ukernel(batch_size() * sizeof(T), x_data, y.data(), params_ptr); + ukernel(batch_size() * sizeof(In), x_data, y.data(), + (UKernelParamsType*)&uparams); // Verify results. 
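+      // [Editor's note] Each element is checked against the scalar reference
+      // within a tolerance that the operator's TestInfo derives from the
+      // reference value and the output datatype, replacing the per-op
+      // tolerance functors of the old tester.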
for (size_t i = 0; i < batch_size(); i++) { - ASSERT_NEAR(y[i], y_ref[i], tol(y_ref[i])) + ASSERT_NEAR(y[i], y_ref[i], + test_info.Tolerance(y_ref[i], xnnpack::datatype_of())) << "at " << i << " / " << batch_size() << ", x[" << i << "] = " << std::scientific << x[i]; } } } + template + void Test(void (*ukernel)(size_t, const In*, Out*, const UKernelParamsType*), + xnn_init_unary_uparams_fn init_params) const { + Test(ukernel, init_params, TestInfo().DefaultParams()); + } + + template + void Test(void (*ukernel)(size_t, const In*, Out*, const UKernelParamsType*), + xnn_init_unary_uparams_fn init_params, + const xnn_unary_params& params, std::vector inputs, + const std::vector& expected, int tolerance_ulp) const { + std::vector outputs(inputs.size()); + inputs.resize(inputs.size() + XNN_EXTRA_BYTES / sizeof(In)); + xnn_unary_uparams uparams; + if (init_params) { + init_params(&uparams, ¶ms, nullptr, nullptr); + } + ukernel(outputs.size() * sizeof(In), inputs.data(), outputs.data(), + (UKernelParamsType*)&uparams); + for (size_t i = 0; i < outputs.size(); i++) { + if (std::isfinite(expected[i])) { + EXPECT_NEAR(expected[i], outputs[i], + tolerance_ulp * std::abs(expected[i]) * + std::numeric_limits::epsilon()) + << "for input " << inputs[i]; + } else { + EXPECT_EQ(std::fpclassify(expected[i]), std::fpclassify(outputs[i])) + << "for input " << inputs[i] << " and output " << outputs[i] + << " (FP_INFINITE=" << FP_INFINITE << ", FP_NAN=" << FP_NAN + << ", FP_NORMAL=" << FP_NORMAL << ", FP_SUBNORMAL=" << FP_SUBNORMAL + << ", FP_ZERO=" << FP_ZERO << ")"; + } + } + } + + template + void Test(void (*ukernel)(size_t, const In*, Out*, const UKernelParamsType*), + xnn_init_unary_uparams_fn init_params, std::vector inputs, + const std::vector& expected, int tolerance_ulp) const { + Test(ukernel, init_params, TestInfo().DefaultParams(), inputs, + expected, tolerance_ulp); + } + + private: size_t batch_size_ = 1; bool inplace_ = false; - float slope_ = 0.5f; - float prescale_ = 1.0f; - float alpha_ = 1.0f; - float beta_ = 1.0f; - uint32_t shift_ = 1; - uint8_t qmin_ = 0; - uint8_t qmax_ = 255; + xnn_quantization_params input_quantization_ = {0, 1.0f}; + xnn_quantization_params output_quantization_ = {0, 1.0f}; size_t iterations_ = 15; }; -// TODO(b/361780131): This could probably be rewritten as some kind of GTest -// instantiate thing instead of macros. -#define XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_eq) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - VUnaryMicrokernelTester() \ - .batch_size(batch_tile* batch_scale) \ - .Test(__VA_ARGS__); \ +template +void TestBatchEq(uint64_t arch_flags, size_t batch_tile, UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... args) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + const size_t batch_scale = get_batch_scale(); + VUnaryMicrokernelTester() + .batch_size(batch_tile * batch_scale) + .Test(ukernel, init_params, args...); +} + +template +void TestBatchDiv(uint64_t arch_flags, size_t batch_tile, UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... 
args) { + if (batch_tile == 1) return; + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + const size_t batch_scale = get_batch_scale(); + const size_t batch_step = batch_tile * batch_scale; + for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; + batch_size += batch_step) { + VUnaryMicrokernelTester() + .batch_size(batch_size) + .Test(ukernel, init_params, args...); } - -#define XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_div) { \ - if (batch_tile == 1) return; \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_step = batch_tile * batch_scale; \ - for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; \ - batch_size += batch_step) { \ - VUnaryMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ - } \ +} + +template +void TestBatchLT(uint64_t arch_flags, size_t batch_tile, UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... args) { + if (batch_tile == 1) return; + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + const size_t batch_scale = get_batch_scale(); + const size_t batch_end = batch_tile * batch_scale; + for (size_t batch_size = 1; batch_size < batch_end; batch_size++) { + VUnaryMicrokernelTester() + .batch_size(batch_size) + .Test(ukernel, init_params, args...); } - -#define XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_lt) { \ - if (batch_tile == 1) return; \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - for (size_t batch_size = 1; batch_size < batch_end; batch_size++) { \ - VUnaryMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ - } \ +} + +template +void TestBatchGT(uint64_t arch_flags, size_t batch_tile, UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... args) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + const size_t batch_scale = get_batch_scale(); + const size_t batch_step = batch_tile * batch_scale; + const size_t batch_end = batch_tile == 1 ? 10 : 2 * batch_step; + for (size_t batch_size = batch_step + 1; batch_size < batch_end; + batch_size++) { + VUnaryMicrokernelTester() + .batch_size(batch_size) + .Test(ukernel, init_params, args...); } - -#define XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, \ - ...) \ - TEST(ukernel, batch_gt) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_step = batch_tile * batch_scale; \ - const size_t batch_end = batch_tile == 1 ? 10 : 2 * batch_step; \ - for (size_t batch_size = batch_step + 1; batch_size < batch_end; \ - batch_size++) { \ - VUnaryMicrokernelTester().batch_size(batch_size).Test(__VA_ARGS__); \ - } \ +} + +template +void TestInPlace(uint64_t arch_flags, size_t batch_tile, UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... args) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + const size_t batch_scale = get_batch_scale(); + const size_t batch_end = batch_tile * batch_scale; + const size_t batch_step = std::max(1, batch_tile - 1); + for (size_t batch_size = 1; batch_size <= batch_end; + batch_size += batch_step) { + VUnaryMicrokernelTester() + .batch_size(batch_size) + .inplace(true) + .Test(ukernel, init_params, args...); } - -#define XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ...) 
\ - TEST(ukernel, inplace) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - const size_t batch_step = std::max(1, batch_tile - 1); \ - for (size_t batch_size = 1; batch_size <= batch_end; \ - batch_size += batch_step) { \ - VUnaryMicrokernelTester() \ - .batch_size(batch_size) \ - .inplace(true) \ - .Test(__VA_ARGS__); \ - } \ +} + +template +void TestInputScale(uint64_t arch_flags, size_t batch_tile, UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... args) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + for (float input_scale : {4.0f, 16.0f, 64.0f}) { + xnn_quantization_params input_quantization = + TestInfo().InputQuantizationParams(xnnpack::datatype_of()); + xnn_quantization_params output_quantization = + TestInfo().InputQuantizationParams(xnnpack::datatype_of()); + input_quantization.scale = input_scale; + VUnaryMicrokernelTester() + .batch_size(batch_size) + .input_quantization(input_quantization) + .output_quantization(output_quantization) + .Test(ukernel, init_params, args...); + } } - -#define XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ...) \ - TEST(ukernel, qmin) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - const size_t batch_step = \ - batch_scale == 1 ? std::max(1, batch_tile - 1) : batch_end - 1; \ - for (size_t qmin = 1; qmin < 255; qmin = xnnpack::NextPrime(qmin)) { \ - for (size_t batch_size = 1; batch_size <= 5 * batch_end; \ - batch_size += batch_step) { \ - VUnaryMicrokernelTester() \ - .batch_size(batch_size) \ - .qmin(qmin) \ - .Test(__VA_ARGS__); \ - } \ - } \ +} + +template +void TestOutputScale(uint64_t arch_flags, size_t batch_tile, UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... args) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + for (float output_scale : {4.0f, 16.0f, 64.0f}) { + xnn_quantization_params input_quantization = + TestInfo().InputQuantizationParams(xnnpack::datatype_of()); + xnn_quantization_params output_quantization = + TestInfo().InputQuantizationParams(xnnpack::datatype_of()); + output_quantization.scale = output_scale; + VUnaryMicrokernelTester() + .batch_size(batch_size) + .input_quantization(input_quantization) + .output_quantization(output_quantization) + .Test(ukernel, init_params, args...); + } } - -#define XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ...) \ - TEST(ukernel, qmax) { \ - TEST_REQUIRES_ARCH_FLAGS(arch_flags); \ - const size_t batch_scale = get_batch_scale(); \ - const size_t batch_end = batch_tile * batch_scale; \ - const size_t batch_step = \ - batch_scale == 1 ? std::max(1, batch_tile - 1) : batch_end - 1; \ - for (size_t qmax = 1; qmax < 255; qmax = xnnpack::NextPrime(qmax)) { \ - for (size_t batch_size = 1; batch_size <= 5 * batch_end; \ - batch_size += batch_step) { \ - VUnaryMicrokernelTester() \ - .batch_size(batch_size) \ - .qmax(qmax) \ - .Test(__VA_ARGS__); \ - } \ - } \ +} + +template +void TestInputZeroPoint(uint64_t arch_flags, size_t batch_tile, + UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... 
args) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (int16_t input_zero_point = 2; input_zero_point < 10; + input_zero_point += 3) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + xnn_quantization_params input_quantization = + TestInfo().InputQuantizationParams(xnnpack::datatype_of()); + xnn_quantization_params output_quantization = + TestInfo().InputQuantizationParams(xnnpack::datatype_of()); + input_quantization.zero_point = input_zero_point; + VUnaryMicrokernelTester() + .batch_size(batch_size) + .input_quantization(input_quantization) + .output_quantization(output_quantization) + .Test(ukernel, init_params, args...); + } + } +} + +template +void TestOutputZeroPoint(uint64_t arch_flags, size_t batch_tile, + UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... args) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + for (int16_t output_zero_point = 2; output_zero_point < 10; + output_zero_point += 3) { + for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { + xnn_quantization_params input_quantization = + TestInfo().InputQuantizationParams(xnnpack::datatype_of()); + xnn_quantization_params output_quantization = + TestInfo().InputQuantizationParams(xnnpack::datatype_of()); + output_quantization.zero_point = output_zero_point; + VUnaryMicrokernelTester() + .batch_size(batch_size) + .input_quantization(input_quantization) + .output_quantization(output_quantization) + .Test(ukernel, init_params, args...); + } + } +} + +template +void TestOutputSaturation(uint64_t arch_flags, size_t batch_tile, + UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... args) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + const size_t batch_scale = get_batch_scale(); + const size_t batch_end = batch_tile * batch_scale * 5; + const size_t batch_step = std::max(2, batch_end / 8) - 1; + for (size_t batch_size = 1; batch_size <= batch_end; + batch_size += batch_step) { + VUnaryMicrokernelTester() + .batch_size(batch_size) + .output_quantization({0, 500.0f}) + .Test(ukernel, init_params, args...); + } +} + +template +void TestOutputOverflow(uint64_t arch_flags, size_t batch_tile, + UKernelFn ukernel, + xnn_init_unary_uparams_fn init_params, Args... 
args) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + const size_t batch_scale = get_batch_scale(); + const size_t batch_end = batch_tile * batch_scale * 5; + const size_t batch_step = std::max(2, batch_end / 8) - 1; + for (size_t batch_size = 1; batch_size <= batch_end; + batch_size += batch_step) { + VUnaryMicrokernelTester() + .batch_size(batch_size) + .output_quantization({0, 4294967296.0f}) + .Test(ukernel, init_params, args...); } +} diff --git a/test/workspace.cc b/test/workspace.cc index c33531c92a63..d1d674ee3f90 100644 --- a/test/workspace.cc +++ b/test/workspace.cc @@ -861,7 +861,7 @@ TEST(WORKSPACE, internally_allocated_dynamic_quantization_parameters) ASSERT_NE(output_id, XNN_INVALID_NODE_ID); xnn_runtime_t runtime = nullptr; - ASSERT_EQ(xnn_status_success, xnn_define_convert(subgraph, input_id, dq_quantized_id, /*flags=*/0)); + ASSERT_EQ(xnn_status_success, xnn_define_unary(subgraph, xnn_unary_convert, /*params=*/nullptr, input_id, dq_quantized_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_define_fully_connected(subgraph, output_min, output_max, dq_quantized_id, kernel_id, bias_id, output_id, /*flags=*/0)); ASSERT_EQ(xnn_status_success, xnn_create_runtime_v3(subgraph, nullptr, nullptr, /*flags=*/0, &runtime)); ASSERT_NE(nullptr, runtime); diff --git a/tools/generate-vunary-test.py b/tools/generate-vunary-test.py index 82593c9c74d7..dacbd9f9f2b6 100755 --- a/tools/generate-vunary-test.py +++ b/tools/generate-vunary-test.py @@ -16,12 +16,6 @@ parser = argparse.ArgumentParser( description="Vector unary operation microkernel test generator" ) -parser.add_argument("-t", "--tester", metavar="TESTER", required=True, - choices=[ - "VHSwishMicrokernelTester", - "VLReLUMicrokernelTester", - "VUnaryMicrokernelTester"], - help="Tester class to be used in the generated test") parser.add_argument( "-k", "--ukernel", @@ -59,19 +53,19 @@ "vtanh": "TanH", } +PARAMS_TYPES = ["Clamp", "ELU", "LeakyReLU"] + SPECIAL_VALUES_F32 = { "SquareRoot": ( 4, # Number of elements. "{0.0f, -0.0f, 1.0f, -1.0f}", # Inputs. "{0.0f, -0.0f, 1.0f, NAN}", # Expected outputs. - "struct xnn_f32_sqrt_params", # Params name. 1, # Error margin in ULP. ), "TanH": ( 7, # Number of elements. "{0.0f, -0.0f, 10.0f, -10.0f, INFINITY, -INFINITY, NAN}", "{0.0f, -0.0f, 1.0f, -1.0f, 1.0f, -1.0f, NAN}", - "union xnn_f32_tanh_params", # TODO: b/338934971 - This should be `1` ulp, but this fails on # `cmake-linux-riscv64-rvv` (but not on `cmake-linux-riscv64`). 3, @@ -80,53 +74,68 @@ 4, # Number of elements. "{1.0f, -1.0f, 0.0f, -0.0f}", # Inputs. "{0.0f, NAN, -INFINITY, -INFINITY}", # Expected outputs. - "struct xnn_f32_default_params", 1, # Error margin in ULP. ), "GELU": ( 3, # Number of elements. "{-6.0f, 6.0f, 0.0f}", # Inputs. "{0.0f, 6.0f, 0.0f}", # Expected outputs. - "struct xnn_f32_default_params", 1, # Error margin in ULP. ), "Exp": ( 3, # Number of elements. "{0.0f, -1e3f, 1e3f}", # Inputs. "{1.0f, 0.0f, INFINITY}", # Expected outputs. - "struct xnn_f32_default_params", 1, # Error margin in ULP. 
), } TEST_TEMPLATE = """\ #define XNN_UKERNEL_WITH_PARAMS(arch_flags, ukernel, batch_tile, vector_tile, datatype, params_type, init_params) - -XNN_TEST_UNARY_BATCH_EQ(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); -XNN_TEST_UNARY_BATCH_DIV(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); -XNN_TEST_UNARY_BATCH_LT(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); -XNN_TEST_UNARY_BATCH_GT(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); - -XNN_TEST_UNARY_INPLACE(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); + TEST(ukernel, batch_eq) { TestBatchEq(arch_flags, batch_tile, ukernel, init_params); } + TEST(ukernel, batch_div) { TestBatchDiv(arch_flags, batch_tile, ukernel, init_params); } + TEST(ukernel, batch_lt) { TestBatchLT(arch_flags, batch_tile, ukernel, init_params); } + TEST(ukernel, batch_gt) { TestBatchGT(arch_flags, batch_tile, ukernel, init_params); } + TEST(ukernel, inplace) { TestInPlace(arch_flags, batch_tile, ukernel, init_params); } $if OP_TYPE == "Clamp": - XNN_TEST_UNARY_QMIN(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); - XNN_TEST_UNARY_QMAX(ukernel, arch_flags, batch_tile, datatype, ${", ".join(TEST_ARGS)}); -$if OP_TYPE == "ELU": - TEST(ukernel, prescale) { + TEST(ukernel, clamp_min) { TEST_REQUIRES_ARCH_FLAGS(arch_flags); const size_t batch_scale = get_batch_scale(); const size_t batch_end = batch_tile * batch_scale; - const size_t batch_step = std::max(1, batch_tile - 1); - for (float prescale : std::array({0.1f, 10.0f})) { - for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { - ${TESTER}() - .batch_size(batch_size) - .prescale(prescale) - .Test(${", ".join(TEST_ARGS)}); + const size_t batch_step = + batch_scale == 1 ? std::max(1, batch_tile - 1) : batch_end - 1; + for (size_t min = 1; min < 255; min = xnnpack::NextPrime(min)) { + for (size_t batch_size = 1; batch_size <= 5 * batch_end; + batch_size += batch_step) { + xnn_unary_params params; + params.clamp.min = min; + params.clamp.max = 255; + VUnaryMicrokernelTester() + .batch_size(batch_size) + .Test(ukernel, init_params, params); } } } + TEST(ukernel, clamp_max) { + TEST_REQUIRES_ARCH_FLAGS(arch_flags); + const size_t batch_scale = get_batch_scale(); + const size_t batch_end = batch_tile * batch_scale; + const size_t batch_step = + batch_scale == 1 ? 
std::max(1, batch_tile - 1) : batch_end - 1; + for (size_t max = 1; max < 255; max = xnnpack::NextPrime(max)) { + for (size_t batch_size = 1; batch_size <= 5 * batch_end; + batch_size += batch_step) { + xnn_unary_params params; + params.clamp.min = 0; + params.clamp.max = max; + VUnaryMicrokernelTester() + .batch_size(batch_size) + .Test(ukernel, init_params, params); + } + } + } +$if OP_TYPE == "ELU": TEST(ukernel, alpha) { TEST_REQUIRES_ARCH_FLAGS(arch_flags); const size_t batch_scale = get_batch_scale(); @@ -134,164 +143,42 @@ const size_t batch_step = std::max(1, batch_tile - 1); for (float alpha : std::array({0.3f, 3.0f})) { for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { + xnn_unary_params params; + params.elu.alpha = alpha; ${TESTER}() .batch_size(batch_size) - .alpha(alpha) - .Test(${", ".join(TEST_ARGS)}); + .Test(ukernel, init_params, params); } } } - - TEST(ukernel, beta) { +$if OP_TYPE == "LeakyReLU": + TEST(ukernel, negative_slope) { TEST_REQUIRES_ARCH_FLAGS(arch_flags); const size_t batch_scale = get_batch_scale(); const size_t batch_end = batch_tile * batch_scale; const size_t batch_step = std::max(1, batch_tile - 1); - for (float beta : std::array({0.3f, 3.0f})) { + for (float negative_slope : std::array({0.01f, 0.3f, 1.3f})) { + xnn_unary_params params; + params.leaky_relu.negative_slope = negative_slope; for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { ${TESTER}() .batch_size(batch_size) - .beta(beta) - .Test(${", ".join(TEST_ARGS)}); + .Test(ukernel, init_params, params); } } } -$if OP_TYPE == "LeakyReLU": - $if "f" in DATATYPE: - TEST(ukernel, slope) { - TEST_REQUIRES_ARCH_FLAGS(arch_flags); - const size_t batch_scale = get_batch_scale(); - const size_t batch_end = batch_tile * batch_scale; - const size_t batch_step = std::max(1, batch_tile - 1); - for (float slope : std::array({-0.7f, 0.3f, 1.3f})) { - for (size_t batch_size = 1; batch_size <= 5 * batch_end; batch_size += batch_step) { - ${TESTER}() - .batch_size(batch_size) - .slope(slope) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - $else: - TEST(ukernel, positive_scale) { - TEST_REQUIRES_ARCH_FLAGS(arch_flags); - for (size_t batch_size = 1; batch_size <= batch_tile * 5; batch_size += std::max(1, batch_tile - 1)) { - for (float positive_scale : {1.0f / 256.0f, 0.3f, 1.3f, 128.0f}) { - ${TESTER}() - .batch_size(batch_size) - .positive_scale(positive_scale) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - - TEST(ukernel, negative_scale) { - TEST_REQUIRES_ARCH_FLAGS(arch_flags); - for (size_t batch_size = 1; batch_size <= batch_tile * 5; batch_size += std::max(1, batch_tile - 1)) { - for (float negative_scale : {-127.99609375f, -1.3f, -0.3f, -1.0f / 256.0f, 1 / 256.0f, 0.3f, 1.3f, 128.0f}) { - ${TESTER}() - .batch_size(batch_size) - .negative_scale(negative_scale) - $if DATATYPE == "QU8": - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } - } -$if OP_TYPE == "HardSwish": - $if "f" not in DATATYPE: - TEST(ukernel, input_scale) { - TEST_REQUIRES_ARCH_FLAGS(arch_flags); - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float input_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_scale(input_scale) - $if "qu8" in DATATYPE: - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - - TEST(ukernel, output_scale) { - 
TEST_REQUIRES_ARCH_FLAGS(arch_flags); - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - for (float output_scale : {4.0f, 16.0f, 64.0f}) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .output_scale(output_scale) - $if "qu8" in DATATYPE: - .input_zero_point(150) - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - - TEST(ukernel, input_zero_point) { - TEST_REQUIRES_ARCH_FLAGS(arch_flags); - for (int16_t input_zero_point = 2; input_zero_point < 10; input_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - .input_zero_point(input_zero_point) - $if "qu8" in DATATYPE: - .output_zero_point(100) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - - TEST(ukernel, output_zero_point) { - TEST_REQUIRES_ARCH_FLAGS(arch_flags); - for (int16_t output_zero_point = 2; output_zero_point < 10; output_zero_point += 3) { - for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) { - VHSwishMicrokernelTester() - .batch_size(batch_size) - $if "qu8" in DATATYPE: - .input_zero_point(150) - .output_zero_point(output_zero_point) - .Test(${", ".join(TEST_ARGS)}); - } - } - } +$if "q" in DATATYPE: + TEST(ukernel, input_scale) { TestInputScale(arch_flags, batch_tile, ukernel, init_params); } + TEST(ukernel, output_scale) { TestOutputScale(arch_flags, batch_tile, ukernel, init_params); } + TEST(ukernel, input_zero_point) { TestInputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } + TEST(ukernel, output_zero_point) { TestOutputZeroPoint(arch_flags, batch_tile, ukernel, init_params); } $if DATATYPE == "f32" and OP_TYPE in SPECIAL_VALUES_F32: TEST(ukernel, special_values) { TEST_REQUIRES_ARCH_FLAGS(arch_flags); - constexpr size_t num_elements = ${SPECIAL_VALUES_F32[OP_TYPE][0]}; - constexpr size_t buffered_size = - num_elements + XNN_EXTRA_BYTES / sizeof(float); - std::array inputs = - ${SPECIAL_VALUES_F32[OP_TYPE][1]}; - std::array expected = - ${SPECIAL_VALUES_F32[OP_TYPE][2]}; - std::array outputs; - ${SPECIAL_VALUES_F32[OP_TYPE][3]} params; - if (${TEST_ARGS[1]}) { - ${TEST_ARGS[1]}(¶ms); - } - ${TEST_ARGS[0]}( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); - for (int i = 0; i < num_elements; i++) { - if (std::isfinite(expected[i])) { - EXPECT_NEAR( - expected[i], outputs[i], - ${SPECIAL_VALUES_F32[OP_TYPE][4]} * std::abs(expected[i]) * std::numeric_limits::epsilon()) - << "for input " << inputs[i]; - } else { - EXPECT_EQ(std::fpclassify(expected[i]), std::fpclassify(outputs[i])) - << "for input " << inputs[i] << " and output " << outputs[i] - << " (FP_INFINITE=" << FP_INFINITE << ", FP_NAN=" << FP_NAN - << ", FP_NORMAL=" << FP_NORMAL << ", FP_SUBNORMAL=" << FP_SUBNORMAL - << ", FP_ZERO=" << FP_ZERO << ")"; - } - } + VUnaryMicrokernelTester().Test(ukernel, init_params, + /*inputs=*/${SPECIAL_VALUES_F32[OP_TYPE][1]}, + /*outputs=*/${SPECIAL_VALUES_F32[OP_TYPE][2]}, + /*tolerance_ulp=*/${SPECIAL_VALUES_F32[OP_TYPE][3]}); } """ @@ -303,13 +190,8 @@ def main(args): op = parts[-1] op_type = OP_TYPES[op] - tester = options.tester - tester_header = { - "VHSwishMicrokernelTester": "vhswish-microkernel-tester.h", - "VLReLUMicrokernelTester": "vlrelu-microkernel-tester.h", - "VUnaryMicrokernelTester": "vunary-microkernel-tester.h", - }[tester] - + tester = "VUnaryMicrokernelTester" + tester_header = "vunary-microkernel-tester.h" op_header = "vunary.h" tests = """\ // Copyright 2019 Google LLC @@ -345,21 +227,12 @@ def main(args): 
       tester_header=tester_header,
   )
 
-  test_args = ["ukernel"]
-  if op_type.startswith("Round"):
-    test_args.append(tester + "::OpType::" + op_type)
-  test_args.append("init_params")
+  test_args = ["ukernel", "init_params"]
+
+  tests += """\
+using TestInfo = {op_type};
 
-  disambiguate = {
-      "Abs": "Abs",
-      "GELU": "Gelu",
-      "Exp": "Exp",
-      "Log": "Log",
-      "Negate": "Neg",
-      "Square": "Sqr",
-  }.get(op_type, None)
-  if disambiguate:
-    test_args.append(disambiguate + "()")
+""".format(op_type=op_type)
 
   tests += xnncommon.make_multiline_macro(xngen.preprocess(
       TEST_TEMPLATE,