From c97e447c40c2a8ddcc1f3b4590cfbe0fd4b8f5c4 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet
Date: Wed, 31 Jul 2024 15:55:56 -0700
Subject: [PATCH] Remove f16_f32_vcvt_params struct

PiperOrigin-RevId: 658171197
---
 bench/f16-f32-vcvt.cc | 132 +++---
 bench/f16-vabs.cc | 8 +-
 bench/f16-vneg.cc | 8 +-
 bench/vcvt-benchmark.h | 8 +-
 src/amalgam/gen/avx.c | 14 +-
 src/amalgam/gen/avx512skx.c | 2 +-
 src/amalgam/gen/f16c.c | 2 +-
 src/amalgam/gen/neon.c | 4 +-
 src/amalgam/gen/neonfp16.c | 2 +-
 src/amalgam/gen/neonfp16arith.c | 4 +-
 src/amalgam/gen/scalar.c | 28 +-
 src/amalgam/gen/sse2.c | 22 +-
 src/amalgam/gen/sse41.c | 14 +-
 src/amalgam/gen/wasmrelaxedsimd.c | 14 +-
 src/amalgam/gen/wasmsimd.c | 14 +-
 src/bf16-vunary/gen/bf16-vabs-neonbf16-u16.c | 4 +-
 src/bf16-vunary/gen/bf16-vabs-neonbf16-u24.c | 4 +-
 src/bf16-vunary/gen/bf16-vabs-neonbf16-u8.c | 4 +-
 src/bf16-vunary/neon.c.in | 8 +-
 src/configs/unary-elementwise-config.c | 12 -
 src/f16-f32-vcvt/avx512skx.c.in | 2 +-
 src/f16-f32-vcvt/f16c.c.in | 2 +-
 .../gen/f16-f32-vcvt-avx-int16-u16.c | 14 +-
 .../gen/f16-f32-vcvt-avx-int16-u24.c | 14 +-
 .../gen/f16-f32-vcvt-avx-int16-u32.c | 14 +-
 .../gen/f16-f32-vcvt-avx-int16-u8.c | 14 +-
 .../gen/f16-f32-vcvt-avx-int32-u16.c | 12 +-
 .../gen/f16-f32-vcvt-avx-int32-u24.c | 12 +-
 .../gen/f16-f32-vcvt-avx-int32-u32.c | 12 +-
 .../gen/f16-f32-vcvt-avx-int32-u8.c | 12 +-
 .../gen/f16-f32-vcvt-avx512skx-u16.c | 2 +-
 .../gen/f16-f32-vcvt-avx512skx-u32.c | 2 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c | 2 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c | 2 +-
 .../gen/f16-f32-vcvt-neon-int16-u16.c | 4 +-
 .../gen/f16-f32-vcvt-neon-int16-u24.c | 4 +-
 .../gen/f16-f32-vcvt-neon-int16-u32.c | 4 +-
 .../gen/f16-f32-vcvt-neon-int16-u8.c | 4 +-
 .../gen/f16-f32-vcvt-neon-int32-u16.c | 4 +-
 .../gen/f16-f32-vcvt-neon-int32-u24.c | 4 +-
 .../gen/f16-f32-vcvt-neon-int32-u32.c | 4 +-
 .../gen/f16-f32-vcvt-neon-int32-u8.c | 4 +-
 .../gen/f16-f32-vcvt-neonfp16-u16.c | 2 +-
 .../gen/f16-f32-vcvt-neonfp16-u8.c | 2 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c | 14 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c | 14 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c | 14 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c | 14 +-
 .../gen/f16-f32-vcvt-sse2-int16-u16.c | 14 +-
 .../gen/f16-f32-vcvt-sse2-int16-u24.c | 14 +-
 .../gen/f16-f32-vcvt-sse2-int16-u32.c | 14 +-
 .../gen/f16-f32-vcvt-sse2-int16-u8.c | 14 +-
 .../gen/f16-f32-vcvt-sse2-int32-u16.c | 12 +-
 .../gen/f16-f32-vcvt-sse2-int32-u24.c | 12 +-
 .../gen/f16-f32-vcvt-sse2-int32-u32.c | 12 +-
 .../gen/f16-f32-vcvt-sse2-int32-u8.c | 12 +-
 .../gen/f16-f32-vcvt-sse41-int16-u16.c | 14 +-
 .../gen/f16-f32-vcvt-sse41-int16-u24.c | 14 +-
 .../gen/f16-f32-vcvt-sse41-int16-u32.c | 14 +-
 .../gen/f16-f32-vcvt-sse41-int16-u8.c | 14 +-
 .../gen/f16-f32-vcvt-sse41-int32-u16.c | 12 +-
 .../gen/f16-f32-vcvt-sse41-int32-u24.c | 12 +-
 .../gen/f16-f32-vcvt-sse41-int32-u32.c | 12 +-
 .../gen/f16-f32-vcvt-sse41-int32-u8.c | 12 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c | 14 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c | 14 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c | 14 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c | 14 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c | 12 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c | 12 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c | 12 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c | 12 +-
 .../gen/f16-f32-vcvt-wasmsimd-int16-u16.c | 14 +-
 .../gen/f16-f32-vcvt-wasmsimd-int16-u24.c | 14 +-
 .../gen/f16-f32-vcvt-wasmsimd-int16-u32.c | 14 +-
 .../gen/f16-f32-vcvt-wasmsimd-int16-u8.c | 14 +-
 .../gen/f16-f32-vcvt-wasmsimd-int32-u16.c | 12 +-
 .../gen/f16-f32-vcvt-wasmsimd-int32-u24.c | 12 +-
 .../gen/f16-f32-vcvt-wasmsimd-int32-u32.c | 12 +-
 .../gen/f16-f32-vcvt-wasmsimd-int32-u8.c | 12 +-
 src/f16-f32-vcvt/neon-int16.c.in | 4 +-
 src/f16-f32-vcvt/neon-int32.c.in | 4 +-
 src/f16-f32-vcvt/neonfp16.c.in | 2 +-
 src/f16-f32-vcvt/scalar.c.in | 14 +-
 src/f16-f32-vcvt/sse-int16.c.in | 14 +-
 src/f16-f32-vcvt/sse-int32.c.in | 12 +-
 src/f16-f32-vcvt/wasmsimd-int16.c.in | 14 +-
 src/f16-f32-vcvt/wasmsimd-int32.c.in | 12 +-
 .../gen/f16-vabs-neonfp16arith-u16.c | 2 +-
 .../gen/f16-vabs-neonfp16arith-u8.c | 2 +-
 src/f16-vunary/gen/f16-vabs-sse2-u16.c | 4 +-
 src/f16-vunary/gen/f16-vabs-sse2-u8.c | 4 +-
 .../gen/f16-vneg-neonfp16arith-u16.c | 2 +-
 .../gen/f16-vneg-neonfp16arith-u8.c | 2 +-
 src/f16-vunary/gen/f16-vneg-sse2-u16.c | 4 +-
 src/f16-vunary/gen/f16-vneg-sse2-u8.c | 4 +-
 src/f16-vunary/neonfp16arith.c.in | 7 +-
 src/f16-vunary/sse2.c.in | 10 +-
 src/microparams-init.c | 129 ------
 src/operators/unary-elementwise-nc.c | 32 +-
 src/xnnpack/compute.h | 7 +-
 src/xnnpack/config-types.h | 4 +-
 src/xnnpack/microfnptr.h | 21 +-
 src/xnnpack/microparams-init.h | 36 +-
 src/xnnpack/microparams.h | 88 +---
 src/xnnpack/operator.h | 3 -
 src/xnnpack/vcvt.h | 2 +-
 src/xnnpack/vunary.h | 6 +-
 test/bf16-vabs.cc | 30 +-
 test/bf16-vabs.yaml | 3 -
 test/f16-f32-vcvt.cc | 412 +++++++++---------
 test/f16-f32-vcvt.yaml | 52 ---
 test/f16-vabs.cc | 20 +-
 test/f16-vabs.yaml | 2 -
 test/f16-vneg.cc | 20 +-
 test/f16-vneg.yaml | 2 -
 test/vcvt-microkernel-tester.cc | 10 +-
 test/vcvt-microkernel-tester.h | 3 +-
 test/vunary-microkernel-tester.cc | 6 +-
 test/vunary-microkernel-tester.h | 6 +-
 tools/generate-vcvt-test.py | 6 +-
 tools/generate-vunary-benchmark.py | 2 +-
 122 files changed, 783 insertions(+), 1168 deletions(-)

diff --git a/bench/f16-f32-vcvt.cc b/bench/f16-f32-vcvt.cc
index 8eafa104d351..d4a26bcc8072 100644
--- a/bench/f16-f32-vcvt.cc
+++ b/bench/f16-f32-vcvt.cc
@@ -20,8 +20,7 @@
 
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_u8,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
@@ -29,8 +28,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_u16,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
 
@@ -38,8 +36,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_u24,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
 
@@ -47,8 +44,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_u32,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
 
@@ -56,8 +52,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_u8,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
 
@@ -65,8 +60,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_u16,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
 
@@ -74,8 +68,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_u24,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
 
@@ -83,8 +76,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_u32,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
 
@@ -92,8 +84,7 @@
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_u8,
-                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
@@ -101,8 +92,7 @@
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_u16,
-                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
@@ -110,8 +100,7 @@
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_u24,
-                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
@@ -119,8 +108,7 @@
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_u32,
-                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
@@ -128,8 +116,7 @@
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_u8,
-                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
@@ -137,8 +124,7 @@
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_u16,
-                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
@@ -146,8 +132,7 @@
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_u24,
-                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
@@ -155,8 +140,7 @@
 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_u32,
-                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
@@ -165,7 +149,7 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_u8,
                   xnn_f16_f32_vcvt_ukernel__neon_int16_u8,
-                  xnn_init_f16_f32_cvt_neon_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckNEON)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -175,7 +159,7 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_u16,
                   xnn_f16_f32_vcvt_ukernel__neon_int16_u16,
-                  xnn_init_f16_f32_cvt_neon_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckNEON)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -185,7 +169,7 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_u24,
                   xnn_f16_f32_vcvt_ukernel__neon_int16_u24,
-                  xnn_init_f16_f32_cvt_neon_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckNEON)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -195,7 +179,7 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_u32,
                   xnn_f16_f32_vcvt_ukernel__neon_int16_u32,
-                  xnn_init_f16_f32_cvt_neon_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckNEON)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -205,7 +189,7 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_u8,
                   xnn_f16_f32_vcvt_ukernel__neon_int32_u8,
-                  xnn_init_f16_f32_cvt_neon_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckNEON)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -215,7 +199,7 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_u16,
                   xnn_f16_f32_vcvt_ukernel__neon_int32_u16,
-                  xnn_init_f16_f32_cvt_neon_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckNEON)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -225,7 +209,7 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_u24,
                   xnn_f16_f32_vcvt_ukernel__neon_int32_u24,
-                  xnn_init_f16_f32_cvt_neon_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckNEON)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -235,7 +219,7 @@
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_u32,
                   xnn_f16_f32_vcvt_ukernel__neon_int32_u32,
-                  xnn_init_f16_f32_cvt_neon_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckNEON)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -265,7 +249,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_u8,
                   xnn_f16_f32_vcvt_ukernel__avx_int16_u8,
-                  xnn_init_f16_f32_cvt_sse_int16_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckAVX)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -275,7 +259,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_u16,
                   xnn_f16_f32_vcvt_ukernel__avx_int16_u16,
-                  xnn_init_f16_f32_cvt_sse_int16_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckAVX)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -285,7 +269,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_u24,
                   xnn_f16_f32_vcvt_ukernel__avx_int16_u24,
-                  xnn_init_f16_f32_cvt_sse_int16_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckAVX)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -295,7 +279,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_u32,
                   xnn_f16_f32_vcvt_ukernel__avx_int16_u32,
-                  xnn_init_f16_f32_cvt_sse_int16_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckAVX)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -305,7 +289,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_u8,
                   xnn_f16_f32_vcvt_ukernel__avx_int32_u8,
-                  xnn_init_f16_f32_cvt_sse_int32_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckAVX)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -315,7 +299,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_u16,
                   xnn_f16_f32_vcvt_ukernel__avx_int32_u16,
-                  xnn_init_f16_f32_cvt_sse_int32_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckAVX)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -325,7 +309,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_u24,
                   xnn_f16_f32_vcvt_ukernel__avx_int32_u24,
-                  xnn_init_f16_f32_cvt_sse_int32_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckAVX)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -335,7 +319,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_u32,
                   xnn_f16_f32_vcvt_ukernel__avx_int32_u32,
-                  xnn_init_f16_f32_cvt_sse_int32_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckAVX)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -345,7 +329,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_u8,
                   xnn_f16_f32_vcvt_ukernel__sse41_int16_u8,
-                  xnn_init_f16_f32_cvt_sse_int16_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckSSE41)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -355,7 +339,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_u16,
                   xnn_f16_f32_vcvt_ukernel__sse41_int16_u16,
-                  xnn_init_f16_f32_cvt_sse_int16_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckSSE41)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -365,7 +349,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_u24,
                   xnn_f16_f32_vcvt_ukernel__sse41_int16_u24,
-                  xnn_init_f16_f32_cvt_sse_int16_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckSSE41)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -375,7 +359,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_u32,
                   xnn_f16_f32_vcvt_ukernel__sse41_int16_u32,
-                  xnn_init_f16_f32_cvt_sse_int16_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckSSE41)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -385,7 +369,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_u8,
                   xnn_f16_f32_vcvt_ukernel__sse41_int32_u8,
-                  xnn_init_f16_f32_cvt_sse_int32_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckSSE41)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -395,7 +379,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_u16,
                   xnn_f16_f32_vcvt_ukernel__sse41_int32_u16,
-                  xnn_init_f16_f32_cvt_sse_int32_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckSSE41)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -405,7 +389,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_u24,
                   xnn_f16_f32_vcvt_ukernel__sse41_int32_u24,
-                  xnn_init_f16_f32_cvt_sse_int32_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckSSE41)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -415,7 +399,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_u32,
                   xnn_f16_f32_vcvt_ukernel__sse41_int32_u32,
-                  xnn_init_f16_f32_cvt_sse_int32_params,
+                  nullptr /* init params */,
                   benchmark::utils::CheckSSE41)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
@@ -424,8 +408,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_u8,
-                  xnn_f16_f32_vcvt_ukernel__sse2_int16_u8,
-                  xnn_init_f16_f32_cvt_sse_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__sse2_int16_u8)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -433,8 +416,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_u16,
-                  xnn_f16_f32_vcvt_ukernel__sse2_int16_u16,
-                  xnn_init_f16_f32_cvt_sse_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__sse2_int16_u16)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -442,8 +424,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_u24,
-                  xnn_f16_f32_vcvt_ukernel__sse2_int16_u24,
-                  xnn_init_f16_f32_cvt_sse_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__sse2_int16_u24)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -451,8 +432,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_u32,
-                  xnn_f16_f32_vcvt_ukernel__sse2_int16_u32,
-                  xnn_init_f16_f32_cvt_sse_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__sse2_int16_u32)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -460,8 +440,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_u8,
-                  xnn_f16_f32_vcvt_ukernel__sse2_int32_u8,
-                  xnn_init_f16_f32_cvt_sse_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__sse2_int32_u8)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -469,8 +448,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_u16,
-                  xnn_f16_f32_vcvt_ukernel__sse2_int32_u16,
-                  xnn_init_f16_f32_cvt_sse_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__sse2_int32_u16)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -478,8 +456,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_u24,
-                  xnn_f16_f32_vcvt_ukernel__sse2_int32_u24,
-                  xnn_init_f16_f32_cvt_sse_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__sse2_int32_u24)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -487,8 +464,7 @@
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_u32,
-                  xnn_f16_f32_vcvt_ukernel__sse2_int32_u32,
-                  xnn_init_f16_f32_cvt_sse_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__sse2_int32_u32)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
@@ -535,26 +511,22 @@
 
 BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_u1,
-                  xnn_f16_f32_vcvt_ukernel__scalar_u1,
-                  xnn_init_f16_f32_cvt_scalar_params)
+                  xnn_f16_f32_vcvt_ukernel__scalar_u1)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 
 BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_u2,
-                  xnn_f16_f32_vcvt_ukernel__scalar_u2,
-                  xnn_init_f16_f32_cvt_scalar_params)
+                  xnn_f16_f32_vcvt_ukernel__scalar_u2)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 
 BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_u3,
-                  xnn_f16_f32_vcvt_ukernel__scalar_u3,
-                  xnn_init_f16_f32_cvt_scalar_params)
+                  xnn_f16_f32_vcvt_ukernel__scalar_u3)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 
 BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_u4,
-                  xnn_f16_f32_vcvt_ukernel__scalar_u4,
-                  xnn_init_f16_f32_cvt_scalar_params)
+                  xnn_f16_f32_vcvt_ukernel__scalar_u4)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 
diff --git a/bench/f16-vabs.cc b/bench/f16-vabs.cc
index 99879e4f871c..ce3263a2aac2 100644
--- a/bench/f16-vabs.cc
+++ b/bench/f16-vabs.cc
@@ -20,9 +20,9 @@
 #include "xnnpack/vunary.h"
 
 void f16_vabs(benchmark::State& state, xnn_f16_vabs_ukernel_fn ukernel,
-              xnn_init_f16_abs_params_fn init_params = nullptr,
+              xnn_init_f16_default_params_fn init_params = nullptr,
               benchmark::utils::IsaCheckFunction isa_check = nullptr) {
-  f16_vunary_benchmark<xnn_f16_abs_params>(
+  f16_vunary_benchmark<xnn_f16_default_params>(
       state, ukernel,
       init_params,
       isa_check,
@@ -48,12 +48,12 @@ void f16_vabs(benchmark::State& state, xnn_f16_vabs_ukernel_fn ukernel,
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_vabs, sse2_u8,
                   xnn_f16_vabs_ukernel__sse2_u8,
-                  xnn_init_f16_abs_sse_params)
+                  /*init_params=*/nullptr)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 BENCHMARK_CAPTURE(f16_vabs, sse2_u16,
                   xnn_f16_vabs_ukernel__sse2_u16,
-                  xnn_init_f16_abs_sse_params)
+                  /*init_params=*/nullptr)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/bench/f16-vneg.cc b/bench/f16-vneg.cc
index b86ab2b97b0e..f3ad814eef78 100644
--- a/bench/f16-vneg.cc
+++ b/bench/f16-vneg.cc
@@ -20,9 +20,9 @@
 #include "xnnpack/vunary.h"
 
 void f16_vneg(benchmark::State& state, xnn_f16_vneg_ukernel_fn ukernel,
-              xnn_init_f16_neg_params_fn init_params = nullptr,
+              xnn_init_f16_default_params_fn init_params = nullptr,
               benchmark::utils::IsaCheckFunction isa_check = nullptr) {
-  f16_vunary_benchmark<xnn_f16_neg_params>(
+  f16_vunary_benchmark<xnn_f16_default_params>(
      state, ukernel,
      init_params,
      isa_check,
@@ -48,12 +48,12 @@ void f16_vneg(benchmark::State& state, xnn_f16_vneg_ukernel_fn ukernel,
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
 BENCHMARK_CAPTURE(f16_vneg, sse2_u8,
                   xnn_f16_vneg_ukernel__sse2_u8,
-                  xnn_init_f16_neg_sse_params)
+                  /*init_params=*/nullptr)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 BENCHMARK_CAPTURE(f16_vneg, sse2_u16,
                   xnn_f16_vneg_ukernel__sse2_u16,
-                  xnn_init_f16_neg_sse_params)
+                  /*init_params=*/nullptr)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/bench/vcvt-benchmark.h b/bench/vcvt-benchmark.h
index 4813678b7fff..b7512dadd215 100644
--- a/bench/vcvt-benchmark.h
+++ b/bench/vcvt-benchmark.h
@@ -26,7 +26,7 @@ namespace {
 static void f16_f32_vcvt(
     benchmark::State& state,
     xnn_f16_f32_vcvt_ukernel_fn cvt,
-    xnn_init_f16_f32_cvt_params_fn init_params = nullptr,
+    void* /*init_params*/ = nullptr,
     benchmark::utils::IsaCheckFunction isa_check = nullptr)
 {
   if (isa_check && !isa_check(state)) {
@@ -45,12 +45,8 @@
   std::generate(x.begin(), x.end(), std::ref(f16rng));
   std::fill(y.begin(), y.end(), std::nanf(""));
 
-  xnn_f16_f32_cvt_params params;
-  if (init_params != nullptr) {
-    init_params(&params);
-  }
   for (auto _ : state) {
-    cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
+    cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), nullptr);
   }
 
   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
diff --git a/src/amalgam/gen/avx.c b/src/amalgam/gen/avx.c
index 72696b90bfcd..9036f135f3b3 100644
--- a/src/amalgam/gen/avx.c
+++ b/src/amalgam/gen/avx.c
@@ -36,19 +36,19 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u16(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
   assert(input != NULL);
   assert(output != NULL);
 
-  const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
-  const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
-  const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
-  const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
-  const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
-  const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
+  const __m128i vsign_mask = _mm_set1_epi16(0x8000);
+  const __m128i vexp_offset = _mm_set1_epi16(0x7000);
+  const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f);
+  const __m128i vmagic_mask = _mm_set1_epi16(0x3F00);
+  const __m128 vmagic_bias = _mm_set1_ps(0.5f);
+  const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400);
 
   const uint16_t* i = (const uint16_t*) input;
   for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
diff --git a/src/amalgam/gen/avx512skx.c b/src/amalgam/gen/avx512skx.c
index 21d4a8f9f29e..68006efee135 100644
--- a/src/amalgam/gen/avx512skx.c
+++ b/src/amalgam/gen/avx512skx.c
@@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u16(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const void* params)
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
diff --git a/src/amalgam/gen/f16c.c b/src/amalgam/gen/f16c.c
index 809dc311db79..95dbb7c35c30 100644
--- a/src/amalgam/gen/f16c.c
+++ b/src/amalgam/gen/f16c.c
@@ -541,7 +541,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u16(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
diff --git a/src/amalgam/gen/neon.c b/src/amalgam/gen/neon.c
index 70165d58bc36..14206cdf69b2 100644
--- a/src/amalgam/gen/neon.c
+++ b/src/amalgam/gen/neon.c
@@ -55,7 +55,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u16(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
@@ -64,7 +64,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u16(
 
   const uint16x8_t vsign_mask = vmovq_n_u16(0x8000);
   const uint16x8_t vexp_offset = vmovq_n_u16(0x7000);
-  const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale);
+  const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f);
   const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000);
   const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400);
 
diff --git a/src/amalgam/gen/neonfp16.c b/src/amalgam/gen/neonfp16.c
index e0e82570d78d..96adf1682c94 100644
--- a/src/amalgam/gen/neonfp16.c
+++ b/src/amalgam/gen/neonfp16.c
@@ -18,7 +18,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u16(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
diff --git a/src/amalgam/gen/neonfp16arith.c b/src/amalgam/gen/neonfp16arith.c
index 498004a49d21..f333c9b0d4b7 100644
--- a/src/amalgam/gen/neonfp16arith.c
+++ b/src/amalgam/gen/neonfp16arith.c
@@ -10835,7 +10835,7 @@ void xnn_f16_vabs_ukernel__neonfp16arith_u16(
     size_t batch,
     const void* input,
     void* output,
-    const union xnn_f16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
@@ -10881,7 +10881,7 @@ void xnn_f16_vneg_ukernel__neonfp16arith_u16(
     size_t batch,
     const void* input,
     void* output,
-    const union xnn_f16_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
diff --git a/src/amalgam/gen/scalar.c b/src/amalgam/gen/scalar.c
index 26db94db6f3e..7fe28d484646 100644
--- a/src/amalgam/gen/scalar.c
+++ b/src/amalgam/gen/scalar.c
@@ -57,19 +57,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u1(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const void* params)
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
   assert(input != NULL);
   assert(output != NULL);
 
-  const uint32_t vsign_mask = params->scalar.sign_mask;
-  const uint32_t vexp_offset = params->scalar.exp_offset;
-  const float vexp_scale = params->scalar.exp_scale;
-  const uint32_t vmagic_mask = params->scalar.magic_mask;
-  const float vmagic_bias = params->scalar.magic_bias;
-  const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
+  const uint32_t vsign_mask = 0x80000000;
+  const uint32_t vexp_offset = 0x70000000;
+  const float vexp_scale = 0x1.0p-112f;
+  const uint32_t vmagic_mask = 0x3F000000;
+  const float vmagic_bias = 0.5f;
+  const uint32_t vdenorm_cutoff = 0x08000000;
 
   const uint16_t* i = (const uint16_t*) input;
   uint32_t* o = (uint32_t*) output;
@@ -93,19 +93,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u4(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const void* params)
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
   assert(input != NULL);
   assert(output != NULL);
 
-  const uint32_t vsign_mask = params->scalar.sign_mask;
-  const uint32_t vexp_offset = params->scalar.exp_offset;
-  const float vexp_scale = params->scalar.exp_scale;
-  const uint32_t vmagic_mask = params->scalar.magic_mask;
-  const float vmagic_bias = params->scalar.magic_bias;
-  const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
+  const uint32_t vsign_mask = 0x80000000;
+  const uint32_t vexp_offset = 0x70000000;
+  const float vexp_scale = 0x1.0p-112f;
+  const uint32_t vmagic_mask = 0x3F000000;
+  const float vmagic_bias = 0.5f;
+  const uint32_t vdenorm_cutoff = 0x08000000;
 
   const uint16_t* i = (const uint16_t*) input;
   uint32_t* o = (uint32_t*) output;
diff --git a/src/amalgam/gen/sse2.c b/src/amalgam/gen/sse2.c
index fde220e7e8ff..b9cc5c0e2661 100644
--- a/src/amalgam/gen/sse2.c
+++ b/src/amalgam/gen/sse2.c
@@ -48,19 +48,19 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u32(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
   assert(input != NULL);
   assert(output != NULL);
 
-  const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
-  const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
-  const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
-  const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
-  const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
-  const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
+  const __m128i vsign_mask = _mm_set1_epi16(0x8000);
+  const __m128i vexp_offset = _mm_set1_epi16(0x7000);
+  const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f);
+  const __m128i vmagic_mask = _mm_set1_epi16(0x3F00);
+  const __m128 vmagic_bias = _mm_set1_ps(0.5f);
+  const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400);
 
   const uint16_t* i = (const uint16_t*) input;
   for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) {
@@ -224,7 +224,7 @@ void xnn_f16_vabs_ukernel__sse2_u16(
     size_t batch,
     const void* input,
     void* output,
-    const union xnn_f16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
@@ -233,7 +233,7 @@ void xnn_f16_vabs_ukernel__sse2_u16(
 
   const uint16_t* i = (const uint16_t*) input;
   uint16_t* o = (uint16_t*) output;
-  const __m128i vnonsign_mask = _mm_load_si128((const __m128i*) params->sse.nonsign_mask);
+  const __m128i vnonsign_mask = _mm_set1_epi16(0x7FFF);
   for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
     __m128i vacc0 = _mm_loadu_si128((const __m128i*) i);
     __m128i vacc1 = _mm_loadu_si128((const __m128i*) (i + 8));
@@ -276,7 +276,7 @@ void xnn_f16_vneg_ukernel__sse2_u16(
     size_t batch,
     const void* input,
     void* output,
-    const union xnn_f16_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
@@ -285,7 +285,7 @@ void xnn_f16_vneg_ukernel__sse2_u16(
 
   const uint16_t* i = (const uint16_t*) input;
   uint16_t* o = (uint16_t*) output;
-  const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse.sign_mask);
+  const __m128i vsign_mask = _mm_set1_epi16(0x8000);
   for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
     __m128i vacc0 = _mm_loadu_si128((const __m128i*) i);
     __m128i vacc1 = _mm_loadu_si128((const __m128i*) (i + 8));
diff --git a/src/amalgam/gen/sse41.c b/src/amalgam/gen/sse41.c
index fa0ad329bb26..059510373fa8 100644
--- a/src/amalgam/gen/sse41.c
+++ b/src/amalgam/gen/sse41.c
@@ -35,19 +35,19 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u16(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
   assert(input != NULL);
   assert(output != NULL);
 
-  const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
-  const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
-  const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
-  const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
-  const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
-  const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
+  const __m128i vsign_mask = _mm_set1_epi16(0x8000);
+  const __m128i vexp_offset = _mm_set1_epi16(0x7000);
+  const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f);
+  const __m128i vmagic_mask = _mm_set1_epi16(0x3F00);
+  const __m128 vmagic_bias = _mm_set1_ps(0.5f);
+  const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400);
 
   const uint16_t* i = (const uint16_t*) input;
   for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
diff --git a/src/amalgam/gen/wasmrelaxedsimd.c b/src/amalgam/gen/wasmrelaxedsimd.c
index 273221717d4b..62620e5e6832 100644
--- a/src/amalgam/gen/wasmrelaxedsimd.c
+++ b/src/amalgam/gen/wasmrelaxedsimd.c
@@ -31,19 +31,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
   assert(input != NULL);
   assert(output != NULL);
 
-  const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask);
-  const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset);
-  const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale);
-  const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask);
-  const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias);
-  const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff);
+  const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000);
+  const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000);
+  const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f);
+  const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00);
+  const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f);
+  const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400);
 
   const uint16_t* i = (const uint16_t*) input;
   for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
diff --git a/src/amalgam/gen/wasmsimd.c b/src/amalgam/gen/wasmsimd.c
index 86b5c748e187..9e7973d22725 100644
--- a/src/amalgam/gen/wasmsimd.c
+++ b/src/amalgam/gen/wasmsimd.c
@@ -55,19 +55,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
   assert(input != NULL);
   assert(output != NULL);
 
-  const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask);
-  const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset);
-  const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale);
-  const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask);
-  const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias);
-  const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff);
+  const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000);
+  const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000);
+  const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f);
+  const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00);
+  const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f);
+  const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400);
 
   const uint16_t* i = (const uint16_t*) input;
   for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
diff --git a/src/bf16-vunary/gen/bf16-vabs-neonbf16-u16.c b/src/bf16-vunary/gen/bf16-vabs-neonbf16-u16.c
index 565dd62474d4..006e1f01ed11 100644
--- a/src/bf16-vunary/gen/bf16-vabs-neonbf16-u16.c
+++ b/src/bf16-vunary/gen/bf16-vabs-neonbf16-u16.c
@@ -19,7 +19,7 @@ void xnn_bf16_vabs_ukernel__neonbf16_u16(
     size_t batch,
     const void* input,
     void* output,
-    const union xnn_bf16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const union xnn_bf16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(bfloat16_t) == 0);
@@ -28,7 +28,7 @@ void xnn_bf16_vabs_ukernel__neonbf16_u16(
 
   const bfloat16_t* i = (const bfloat16_t*) input;
   bfloat16_t* o = (bfloat16_t*) output;
-  uint16x8_t vmask = vld1q_u16(params->neon.nonsign_mask);
+  uint16x8_t vmask = vdupq_n_u16(0x7FFF);
   for (; batch >= 16 * sizeof(bfloat16_t); batch -= 16 * sizeof(bfloat16_t)) {
     const bfloat16x8_t vx01234567 = vld1q_bf16(i); i+= 8;
     const bfloat16x8_t vx89ABCDEF = vld1q_bf16(i); i+= 8;
diff --git a/src/bf16-vunary/gen/bf16-vabs-neonbf16-u24.c b/src/bf16-vunary/gen/bf16-vabs-neonbf16-u24.c
index e4dcd3037ff7..b3e442bc767d 100644
--- a/src/bf16-vunary/gen/bf16-vabs-neonbf16-u24.c
+++ b/src/bf16-vunary/gen/bf16-vabs-neonbf16-u24.c
@@ -19,7 +19,7 @@ void xnn_bf16_vabs_ukernel__neonbf16_u24(
     size_t batch,
     const void* input,
     void* output,
-    const union xnn_bf16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const union xnn_bf16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(bfloat16_t) == 0);
@@ -28,7 +28,7 @@ void xnn_bf16_vabs_ukernel__neonbf16_u24(
 
   const bfloat16_t* i = (const bfloat16_t*) input;
   bfloat16_t* o = (bfloat16_t*) output;
-  uint16x8_t vmask = vld1q_u16(params->neon.nonsign_mask);
+  uint16x8_t vmask = vdupq_n_u16(0x7FFF);
   for (; batch >= 24 * sizeof(bfloat16_t); batch -= 24 * sizeof(bfloat16_t)) {
     const bfloat16x8_t vx01234567 = vld1q_bf16(i); i+= 8;
     const bfloat16x8_t vx89ABCDEF = vld1q_bf16(i); i+= 8;
diff --git a/src/bf16-vunary/gen/bf16-vabs-neonbf16-u8.c b/src/bf16-vunary/gen/bf16-vabs-neonbf16-u8.c
index 693fd19e3597..86b732cf4ef8 100644
--- a/src/bf16-vunary/gen/bf16-vabs-neonbf16-u8.c
+++ b/src/bf16-vunary/gen/bf16-vabs-neonbf16-u8.c
@@ -19,7 +19,7 @@ void xnn_bf16_vabs_ukernel__neonbf16_u8(
     size_t batch,
     const void* input,
     void* output,
-    const union xnn_bf16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const union xnn_bf16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(bfloat16_t) == 0);
@@ -28,7 +28,7 @@ void xnn_bf16_vabs_ukernel__neonbf16_u8(
 
   const bfloat16_t* i = (const bfloat16_t*) input;
   bfloat16_t* o = (bfloat16_t*) output;
-  uint16x8_t vmask = vld1q_u16(params->neon.nonsign_mask);
+  uint16x8_t vmask = vdupq_n_u16(0x7FFF);
 
   for (; batch >= 8 * sizeof(bfloat16_t); batch -= 8 * sizeof(bfloat16_t)) {
     const bfloat16x8_t vx01234567 = vld1q_bf16(i); i+= 8;
diff --git a/src/bf16-vunary/neon.c.in b/src/bf16-vunary/neon.c.in
index 88132e0a0c47..a551ca5799da 100644
--- a/src/bf16-vunary/neon.c.in
+++ b/src/bf16-vunary/neon.c.in
@@ -15,15 +15,11 @@ $assert OP in ["ABS", "NEG", "SQR"]
 #include "xnnpack/vunary.h"
 
-$PARAMS = {
-$  "ABS": "xnn_bf16_abs_params",
-$  "NEG": "xnn_bf16_neg_params",
-$}[OP]
 void xnn_bf16_v${OP.lower()}_ukernel__neonbf16_u${BATCH_TILE}(
     size_t batch,
     const void* input,
     void* output,
-    const union ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const union xnn_bf16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(bfloat16_t) == 0);
@@ -32,7 +28,7 @@ void xnn_bf16_v${OP.lower()}_ukernel__neonbf16_u${BATCH_TILE}(
 
   const bfloat16_t* i = (const bfloat16_t*) input;
   bfloat16_t* o = (bfloat16_t*) output;
-  uint16x8_t vmask = vld1q_u16(params->neon.nonsign_mask);
+  uint16x8_t vmask = vdupq_n_u16(0x7FFF);
   for (; batch >= ${BATCH_TILE} * sizeof(bfloat16_t); batch -= ${BATCH_TILE} * sizeof(bfloat16_t)) {
     $for N in range(0, BATCH_TILE, 8):
       const bfloat16x8_t vx${ABC[N:N+8]} = vld1q_bf16(i); i+= 8;
diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c
index 84107f01fd5d..cdd45969c196 100644
--- a/src/configs/unary-elementwise-config.c
+++ b/src/configs/unary-elementwise-config.c
@@ -200,7 +200,6 @@ static void init_f16_abs_config(void) {
     }
   #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
     f16_abs_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vabs_ukernel__sse2_u16;
-    f16_abs_config.init.f16_abs = xnn_init_f16_abs_sse_params;
     f16_abs_config.element_tile = 16;
   #endif
 }
@@ -334,7 +333,6 @@ static void init_f16_neg_config(void) {
     }
   #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
     f16_neg_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_vneg_ukernel__sse2_u16;
-    f16_neg_config.init.f16_neg = xnn_init_f16_neg_sse_params;
     f16_neg_config.element_tile = 16;
   #endif
 }
@@ -582,12 +580,10 @@ static void init_f16_to_f32_cvt_config(void) {
       f16_to_f32_cvt_config.element_tile = 16;
     } else {
       f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__neon_int16_u16;
-      f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params;
       f16_to_f32_cvt_config.element_tile = 16;
     }
   } else if (!XNN_PLATFORM_MOBILE) {
     f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u4;
-    f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params;
     f16_to_f32_cvt_config.element_tile = 4;
   }
 #elif XNN_ARCH_ARM64
@@ -604,38 +600,30 @@
     f16_to_f32_cvt_config.element_tile = 16;
   } else if (hardware_config->use_x86_avx) {
     f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__avx_int16_u16;
-    f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params;
     f16_to_f32_cvt_config.element_tile = 16;
   } else if (hardware_config->use_x86_sse4_1) {
     f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__sse41_int16_u16;
-    f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params;
     f16_to_f32_cvt_config.element_tile = 16;
   } else {
     f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__sse2_int16_u32;
-    f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params;
     f16_to_f32_cvt_config.element_tile = 32;
   }
 #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   #if XNN_ARCH_WASMRELAXEDSIMD
     f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16;
-    f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params;
     f16_to_f32_cvt_config.element_tile = 16;
   #else
     f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16;
-    f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params;
    f16_to_f32_cvt_config.element_tile = 16;
   #endif
 #elif XNN_ARCH_WASM
   f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u1;
-  f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params;
   f16_to_f32_cvt_config.element_tile = 1;
 #elif XNN_ARCH_RISCV
   f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u4;
-  f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params;
   f16_to_f32_cvt_config.element_tile = 4;
 #else
   f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u4;
-  f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params;
   f16_to_f32_cvt_config.element_tile = 4;
 #endif
 }
diff --git a/src/f16-f32-vcvt/avx512skx.c.in b/src/f16-f32-vcvt/avx512skx.c.in
index 7dad8050b752..e39daf5fd0cd 100644
--- a/src/f16-f32-vcvt/avx512skx.c.in
+++ b/src/f16-f32-vcvt/avx512skx.c.in
@@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u${BATCH_TILE}(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const void* params)
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
diff --git a/src/f16-f32-vcvt/f16c.c.in b/src/f16-f32-vcvt/f16c.c.in
index 6375c5f46c97..2ffade5606f7 100644
--- a/src/f16-f32-vcvt/f16c.c.in
+++ b/src/f16-f32-vcvt/f16c.c.in
@@ -18,7 +18,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u${BATCH_TILE}(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u16.c
index a37cb4b733af..13424f9e1cd3 100644
--- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u16.c
+++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u16.c
@@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u16(
     size_t batch,
     const void* input,
     float* output,
-    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+    const void* params) XNN_OOB_READS
 {
   assert(batch != 0);
   assert(batch % sizeof(uint16_t) == 0);
   assert(input != NULL);
assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u24.c index 561d6da2d7cb..fe06ea1cd48b 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u24.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u32.c index 6ac769e6870e..ada378941fff 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u32.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = 
_mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u8.c index e7e6c584659f..862210904a74 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u8.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u16.c index 9d48308f9ee4..ec7e4f63cd89 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u16.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__avx_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 
* sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u24.c index 889993fce68b..a23fb53a07d9 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u24.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__avx_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u32.c index 306b12c3696d..0659d0978475 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u32.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__avx_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u8.c index a99b3ef5e4e8..c7515ce31c11 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u8.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__avx_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); 
assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c index aa5238944701..a05298e5d170 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c @@ -20,7 +20,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u32.c index 0fa2a24f0208..a38276047c08 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u32.c @@ -20,7 +20,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c index fecce772f307..0910270851fb 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c index 93e3316b41d0..a0de07669a06 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c index d384a26d592f..db1873f133ef 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS 
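For reference: with xnn_f16_f32_cvt_params removed, every f16->f32 ukernel now takes an opaque `const void* params` that it never dereferences. A minimal calling sketch under that assumption; the wrapper name and buffer arguments are illustrative, not part of this change:

  #include <stddef.h>
  #include <stdint.h>
  #include "xnnpack/vcvt.h"

  static void convert_fp16_to_fp32(const uint16_t* src, float* dst, size_t count) {
    // batch is the input size in bytes; no init function runs and no params
    // struct is built anymore, so NULL is passed for the unused argument.
    xnn_f16_f32_vcvt_ukernel__scalar_u4(count * sizeof(uint16_t), src, dst, NULL);
  }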
{ assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u16( const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c index 91a53e3c2634..71baf2d7d935 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u24( const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c index 9fece0562a86..50ce55ee0ffd 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u32( const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c index 295ede8d5186..b0a1d056e19b 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u8( const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c index 5daba5e9dc09..6c7662f32b1e 100644 ---
a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u16( const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c index 5cc267a8976e..6a72148505e1 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u24( const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c index ace982a95239..f92a7052850a 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u32( const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u8.c index 37d277e74f95..a7e701f5f43b 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u8.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u8( const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const
float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u16.c index a2c83b744753..34e13651cf39 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u16.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u8.c index 7c4c34a4a32f..2529624c243c 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u8.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c index b167df241c62..25b8e890c9e9 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c @@ -18,19 +18,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u1( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c index 672ee8bac2cc..ccc58a275857 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c @@ -18,19 +18,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u2( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const
uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c index 6a4a68ae0270..d07550c6ddd4 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c @@ -18,19 +18,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u3( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c index bd72fb9d5475..d667076a4285 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c @@ -18,19 +18,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u4( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u16.c index 8518a3941b71..b2da88d2ddc5 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u16.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); 
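For reference, the constants inlined in the scalar kernels above implement the standard fp16->fp32 bit trick (the same method used by the FP16 library). A self-contained sketch follows; the helper and function names are hypothetical, not part of this change:

  #include <stdint.h>
  #include <string.h>

  static uint32_t float_as_u32(float f) { uint32_t u; memcpy(&u, &f, sizeof u); return u; }
  static float u32_as_float(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

  static float fp16_to_fp32(uint16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & 0x80000000;   // vsign_mask
    const uint32_t two_w = w + w;           // shifts the sign bit out
    // Normal, infinite, and NaN inputs: rebias the exponent by +224
    // (vexp_offset = 224 << 23), then undo the overshoot with 0x1.0p-112f,
    // since 224 - 112 == 127 - 15, the difference of the fp32/fp16 biases.
    const float norm = u32_as_float((two_w >> 4) + 0x70000000) * 0x1.0p-112f;
    // Denormal inputs: splice the mantissa into a float in [0.5, 1.0)
    // (vmagic_mask = 126 << 23), then subtract vmagic_bias = 0.5f.
    const float denorm = u32_as_float((two_w >> 17) | 0x3F000000) - 0.5f;
    // Below vdenorm_cutoff = 1 << 27, the halfword had a zero exponent field.
    const uint32_t bits = sign | (two_w < 0x08000000 ? float_as_u32(denorm)
                                                     : float_as_u32(norm));
    return u32_as_float(bits);
  }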
- const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u24.c index 8618f310628d..632bc9c9c5e3 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u24.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u32.c index 38bdb678ef83..f9d5f7a9e9e8 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u32.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i 
vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u8.c index deef2ecef330..902f97c19369 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u8.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u16.c index 685f637539ae..6c822372f2e2 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u16.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u24.c index 
c1853e9a3ca7..00e868f4e5cd 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u24.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u32.c index 9fec9aa79644..15a68ffe4e81 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u32.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u8.c index 9b8737bec203..12fa6abe8324 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u8.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = 
_mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c index 086f2c24c333..f78d23665fec 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u24.c index 3c1df5bc4ae0..0a1022398a1d 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u24.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = 
_mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u32.c index 611ffe3d93ef..001220617914 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u32.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u8.c index a19fd62ca64b..1bad3a4a8b09 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u8.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git 
a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u16.c index 54a732ef0c54..b8bbbbda483d 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u16.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u24.c index 907be68c46f6..6e34976d566a 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u24.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u32.c index d81a4a08d7af..5ea00049085d 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u32.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output 
!= NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u8.c index f357a1c4f4da..3ab912518b55 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u8.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c index 0c1c9dffdcd7..aca3fba771e9 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t 
vsign_mask = wasm_u16x8_const_splat(0x8000); + const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c index 8300a2a5629a..fde2c6bb8eb1 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000); + const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c index 327a314af225..dc8f3d69420d 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000); + const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); 
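For reference, the int16 SIMD variants above keep their constants in per-16-bit-lane form and only widen to 32-bit lanes when interleaving, so each constant is the high half-word of its int32 counterpart; the denormal cutoffs differ by a factor of two because the NEON kernels appear to test the un-doubled word while the scalar and SSE int32 kernels test two_w = w + w. A small self-checking sketch of these relations (commentary on the diff, not code from it):

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    // int16-variant constants widen to the int32-variant constants:
    assert(((uint32_t) 0x8000 << 16) == 0x80000000u);  // sign mask
    assert(((uint32_t) 0x7000 << 16) == 0x70000000u);  // exponent offset
    assert(((uint32_t) 0x3F00 << 16) == 0x3F000000u);  // magic mask
    // The NEON int32 cutoff (0x04000000) doubles into the scalar/SSE one:
    assert(2u * ((uint32_t) 0x0400 << 16) == 0x08000000u);
    return 0;
  }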
+ const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c index 67621cae33c2..9a1853b8e717 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000); + const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c index f5af2d4b0cb4..dd1da6754722 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(0x80000000); + const v128_t vexp_offset = wasm_i32x4_const_splat(0x70000000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git 
a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c index 3ed10c69f7ba..04e97677337e 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(0x80000000); + const v128_t vexp_offset = wasm_i32x4_const_splat(0x70000000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c index 9044b77c7639..29f830f16939 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(0x80000000); + const v128_t vexp_offset = wasm_i32x4_const_splat(0x70000000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c index 3a4ebf19b121..93df26719dde 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8( size_t batch, const void* input, float* output, - const union 
xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(0x80000000); + const v128_t vexp_offset = wasm_i32x4_const_splat(0x70000000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u16.c index 1b83337cad79..c285288b0760 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u16.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000); + const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u24.c index ff44bcfb93e4..444ddb1b9d1f 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u24.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = 
wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000); + const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u32.c index c32049f94de8..97bc72a3b41d 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u32.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000); + const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u8.c index c4cd07bb56d8..9e3e36dfa350 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u8.c @@ -19,19 +19,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const 
v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000); + const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u16.c index 74b0d2b3dd4e..38f495363eeb 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u16.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(0x80000000); + const v128_t vexp_offset = wasm_i32x4_const_splat(0x70000000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u24.c index 6040dd0af75d..db3f10a84963 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u24.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(0x80000000); + const v128_t vexp_offset = wasm_i32x4_const_splat(0x70000000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = 
wasm_i32x4_const_splat(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u32.c index 95e914da203b..caab19cc07bc 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u32.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(0x80000000); + const v128_t vexp_offset = wasm_i32x4_const_splat(0x70000000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u8.c index aeff3ee38cc4..d6e03f43d3b4 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u8.c @@ -19,18 +19,18 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(0x80000000); + const v128_t vexp_offset = wasm_i32x4_const_splat(0x70000000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(0x08000000); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/neon-int16.c.in b/src/f16-f32-vcvt/neon-int16.c.in index 711e9fa7d0c4..456ad31f6399 100644 --- a/src/f16-f32-vcvt/neon-int16.c.in +++ b/src/f16-f32-vcvt/neon-int16.c.in @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params 
params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u${BATCH_TILE}( const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); diff --git a/src/f16-f32-vcvt/neon-int32.c.in b/src/f16-f32-vcvt/neon-int32.c.in index 159cc0dbf8e1..4fec035a2c47 100644 --- a/src/f16-f32-vcvt/neon-int32.c.in +++ b/src/f16-f32-vcvt/neon-int32.c.in @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u${BATCH_TILE}( const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); diff --git a/src/f16-f32-vcvt/neonfp16.c.in b/src/f16-f32-vcvt/neonfp16.c.in index bef53842ac99..9ece4d211cf4 100644 --- a/src/f16-f32-vcvt/neonfp16.c.in +++ b/src/f16-f32-vcvt/neonfp16.c.in @@ -18,7 +18,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/scalar.c.in b/src/f16-f32-vcvt/scalar.c.in index 00021adca45c..cf4d330b8f72 100644 --- a/src/f16-f32-vcvt/scalar.c.in +++ b/src/f16-f32-vcvt/scalar.c.in @@ -16,19 +16,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/sse-int16.c.in b/src/f16-f32-vcvt/sse-int16.c.in index 830c12e874a6..e8bfd30d0677 100644 --- a/src/f16-f32-vcvt/sse-int16.c.in +++ b/src/f16-f32-vcvt/sse-int16.c.in @@ -23,19 +23,19 @@ void xnn_f16_f32_vcvt_ukernel__${ISA}_int16_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params 
params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); + const __m128i vexp_offset = _mm_set1_epi16(0x7000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(0x3F00); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/f16-f32-vcvt/sse-int32.c.in b/src/f16-f32-vcvt/sse-int32.c.in index 7f969410c981..2159527807ca 100644 --- a/src/f16-f32-vcvt/sse-int32.c.in +++ b/src/f16-f32-vcvt/sse-int32.c.in @@ -23,18 +23,18 @@ void xnn_f16_f32_vcvt_ukernel__${ISA}_int32_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(0x80000000); + const __m128i vexp_offset = _mm_set1_epi32(0x70000000); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(0x3F000000); + const __m128i vdenorm_cutoff = _mm_set1_epi32(0x08000000); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/f16-f32-vcvt/wasmsimd-int16.c.in b/src/f16-f32-vcvt/wasmsimd-int16.c.in index e1e7307a97ef..3d78b024a1ca 100644 --- a/src/f16-f32-vcvt/wasmsimd-int16.c.in +++ b/src/f16-f32-vcvt/wasmsimd-int16.c.in @@ -21,19 +21,19 @@ void xnn_f16_f32_vcvt_ukernel__${ISA}_int16_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = 
wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000); + const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/f16-f32-vcvt/wasmsimd-int32.c.in b/src/f16-f32-vcvt/wasmsimd-int32.c.in index d133a1e3c6df..f9d20f2d1fee 100644 --- a/src/f16-f32-vcvt/wasmsimd-int32.c.in +++ b/src/f16-f32-vcvt/wasmsimd-int32.c.in @@ -21,18 +21,18 @@ void xnn_f16_f32_vcvt_ukernel__${ISA}_int32_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(0x80000000); + const v128_t vexp_offset = wasm_i32x4_const_splat(0x70000000); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(0x08000000); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/f16-vunary/gen/f16-vabs-neonfp16arith-u16.c b/src/f16-vunary/gen/f16-vabs-neonfp16arith-u16.c index b71b7eae4a9d..b18b79925e6f 100644 --- a/src/f16-vunary/gen/f16-vabs-neonfp16arith-u16.c +++ b/src/f16-vunary/gen/f16-vabs-neonfp16arith-u16.c @@ -19,7 +19,7 @@ void xnn_f16_vabs_ukernel__neonfp16arith_u16( size_t batch, const void* input, void* output, - const union xnn_f16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-vunary/gen/f16-vabs-neonfp16arith-u8.c b/src/f16-vunary/gen/f16-vabs-neonfp16arith-u8.c index 13341a78b248..9d8543379489 100644 --- a/src/f16-vunary/gen/f16-vabs-neonfp16arith-u8.c +++ b/src/f16-vunary/gen/f16-vabs-neonfp16arith-u8.c @@ -19,7 +19,7 @@ void xnn_f16_vabs_ukernel__neonfp16arith_u8( size_t batch, const void* input, void* output, - const union xnn_f16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-vunary/gen/f16-vabs-sse2-u16.c b/src/f16-vunary/gen/f16-vabs-sse2-u16.c index 8e7856d4cf82..547c56f69d88 100644 --- a/src/f16-vunary/gen/f16-vabs-sse2-u16.c +++ b/src/f16-vunary/gen/f16-vabs-sse2-u16.c @@ -21,7 +21,7 @@ void xnn_f16_vabs_ukernel__sse2_u16( size_t batch, const void* input, void* output, - const union xnn_f16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params 
params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -30,7 +30,7 @@ void xnn_f16_vabs_ukernel__sse2_u16( const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; - const __m128i vnonsign_mask = _mm_load_si128((const __m128i*) params->sse.nonsign_mask); + const __m128i vnonsign_mask = _mm_set1_epi16(0x7FFF); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { __m128i vacc0 = _mm_loadu_si128((const __m128i*) i); __m128i vacc1 = _mm_loadu_si128((const __m128i*) (i + 8)); diff --git a/src/f16-vunary/gen/f16-vabs-sse2-u8.c b/src/f16-vunary/gen/f16-vabs-sse2-u8.c index 68eaaab8405f..dc0dc82e4a40 100644 --- a/src/f16-vunary/gen/f16-vabs-sse2-u8.c +++ b/src/f16-vunary/gen/f16-vabs-sse2-u8.c @@ -21,7 +21,7 @@ void xnn_f16_vabs_ukernel__sse2_u8( size_t batch, const void* input, void* output, - const union xnn_f16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -30,7 +30,7 @@ void xnn_f16_vabs_ukernel__sse2_u8( const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; - const __m128i vnonsign_mask = _mm_load_si128((const __m128i*) params->sse.nonsign_mask); + const __m128i vnonsign_mask = _mm_set1_epi16(0x7FFF); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { __m128i vacc = _mm_loadu_si128((const __m128i*) i); i += 8; diff --git a/src/f16-vunary/gen/f16-vneg-neonfp16arith-u16.c b/src/f16-vunary/gen/f16-vneg-neonfp16arith-u16.c index ebd858c47a0d..08df565810ae 100644 --- a/src/f16-vunary/gen/f16-vneg-neonfp16arith-u16.c +++ b/src/f16-vunary/gen/f16-vneg-neonfp16arith-u16.c @@ -19,7 +19,7 @@ void xnn_f16_vneg_ukernel__neonfp16arith_u16( size_t batch, const void* input, void* output, - const union xnn_f16_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-vunary/gen/f16-vneg-neonfp16arith-u8.c b/src/f16-vunary/gen/f16-vneg-neonfp16arith-u8.c index 8ff16e72b9cc..1e072952a9d7 100644 --- a/src/f16-vunary/gen/f16-vneg-neonfp16arith-u8.c +++ b/src/f16-vunary/gen/f16-vneg-neonfp16arith-u8.c @@ -19,7 +19,7 @@ void xnn_f16_vneg_ukernel__neonfp16arith_u8( size_t batch, const void* input, void* output, - const union xnn_f16_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-vunary/gen/f16-vneg-sse2-u16.c b/src/f16-vunary/gen/f16-vneg-sse2-u16.c index 297fc3877ab3..7819def8e040 100644 --- a/src/f16-vunary/gen/f16-vneg-sse2-u16.c +++ b/src/f16-vunary/gen/f16-vneg-sse2-u16.c @@ -21,7 +21,7 @@ void xnn_f16_vneg_ukernel__sse2_u16( size_t batch, const void* input, void* output, - const union xnn_f16_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -30,7 +30,7 @@ void xnn_f16_vneg_ukernel__sse2_u16( const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse.sign_mask); + const __m128i 
vsign_mask = _mm_set1_epi16(0x8000); for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { __m128i vacc0 = _mm_loadu_si128((const __m128i*) i); __m128i vacc1 = _mm_loadu_si128((const __m128i*) (i + 8)); diff --git a/src/f16-vunary/gen/f16-vneg-sse2-u8.c b/src/f16-vunary/gen/f16-vneg-sse2-u8.c index 7ce68926de89..c413f0c8dc51 100644 --- a/src/f16-vunary/gen/f16-vneg-sse2-u8.c +++ b/src/f16-vunary/gen/f16-vneg-sse2-u8.c @@ -21,7 +21,7 @@ void xnn_f16_vneg_ukernel__sse2_u8( size_t batch, const void* input, void* output, - const union xnn_f16_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -30,7 +30,7 @@ void xnn_f16_vneg_ukernel__sse2_u8( const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse.sign_mask); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { __m128i vacc = _mm_loadu_si128((const __m128i*) i); i += 8; diff --git a/src/f16-vunary/neonfp16arith.c.in b/src/f16-vunary/neonfp16arith.c.in index 0797bcd43cce..4131210342aa 100644 --- a/src/f16-vunary/neonfp16arith.c.in +++ b/src/f16-vunary/neonfp16arith.c.in @@ -21,16 +21,11 @@ $ "ABS": lambda x: "vabsq_f16(%s)" % x, $ "NEG": lambda x: "vnegq_f16(%s)" % x, $ "SQR": lambda x: "vmulq_f16(%s, %s)" % (x, x), $}[OP] -$PARAMS = { -$ "ABS": "xnn_f16_abs_params", -$ "NEG": "xnn_f16_neg_params", -$ "SQR": "xnn_f16_default_params", -$}[OP] void xnn_f16_v${OP.lower()}_ukernel__neonfp16arith_u${BATCH_TILE}( size_t batch, const void* input, void* output, - const union ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-vunary/sse2.c.in b/src/f16-vunary/sse2.c.in index 4f6f6ff69a32..fdd01e0183ce 100644 --- a/src/f16-vunary/sse2.c.in +++ b/src/f16-vunary/sse2.c.in @@ -22,15 +22,11 @@ $_MM_OP_SI128 = { $ "ABS": lambda x: "_mm_and_si128(%s, vnonsign_mask)" % x, $ "NEG": lambda x: "_mm_xor_si128(%s, vsign_mask)" % x, $}[OP] -$PARAMS = { -$ "ABS": "xnn_f16_abs_params", -$ "NEG": "xnn_f16_neg_params", -$}[OP] void xnn_f16_v${OP.lower()}_ukernel__sse2_u${BATCH_TILE}( size_t batch, const void* input, void* output, - const union ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); @@ -40,9 +36,9 @@ void xnn_f16_v${OP.lower()}_ukernel__sse2_u${BATCH_TILE}( const uint16_t* i = (const uint16_t*) input; uint16_t* o = (uint16_t*) output; $if OP == "ABS": - const __m128i vnonsign_mask = _mm_load_si128((const __m128i*) params->sse.nonsign_mask); + const __m128i vnonsign_mask = _mm_set1_epi16(0x7FFF); $elif OP == "NEG": - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse.sign_mask); + const __m128i vsign_mask = _mm_set1_epi16(0x8000); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(uint16_t); batch -= ${BATCH_TILE} * sizeof(uint16_t)) { __m128i vacc${ABC[0]} = _mm_loadu_si128((const __m128i*) i); diff --git a/src/microparams-init.c b/src/microparams-init.c index 87960d5027c5..130364512681 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c 
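Note on the hunk below: it removes the runtime initializers that used to fill in these conversion constants, which the kernels above now hard-code as literals. For readers new to the trick, here is a minimal, self-contained sketch of the half-to-float conversion that those constants implement, assuming IEEE-754 binary16/binary32 representations (the helper and function names are local to this sketch, not XNNPACK API):

#include <stdint.h>
#include <string.h>

/* Bit-cast helpers, local to this sketch. */
static inline float fp32_from_bits(uint32_t w) {
  float f;
  memcpy(&f, &w, sizeof(f));
  return f;
}

static inline uint32_t fp32_to_bits(float f) {
  uint32_t w;
  memcpy(&w, &f, sizeof(w));
  return w;
}

/* Convert one binary16 value (raw bits) to binary32 without FP16 hardware,
 * using the same constants the scalar kernel hard-codes. */
static inline float f16_to_f32(uint16_t h) {
  const uint32_t w = (uint32_t) h << 16;
  const uint32_t sign = w & UINT32_C(0x80000000);  /* sign_mask */
  const uint32_t two_w = w + w;                    /* shifts the sign bit out */

  /* Normalized inputs: align the fp16 exponent/mantissa with the fp32 layout
   * (exp_offset), then rebias the exponent from 15 to 127 by multiplying
   * with 2**-112 (exp_scale). */
  const float norm =
      fp32_from_bits((two_w >> 4) + UINT32_C(0x70000000)) * 0x1.0p-112f;

  /* Subnormal inputs: splice the mantissa into a float in [0.5, 1.0)
   * (magic_mask), so the bits read as 0.5 + mantissa * 2**-24, then subtract
   * 0.5 (magic_bias); the subtraction is exact in binary32. */
  const float denorm =
      fp32_from_bits((two_w >> 17) | UINT32_C(0x3F000000)) - 0.5f;

  /* A zero fp16 exponent field selects the subnormal path (denorm_cutoff). */
  const uint32_t bits = two_w < UINT32_C(0x08000000)
      ? fp32_to_bits(denorm) : fp32_to_bits(norm);
  return fp32_from_bits(sign | bits);
}

For example, f16_to_f32(0x3C00) yields 1.0f and f16_to_f32(0x0001) yields 0x1.0p-24f; infinities and NaNs fall through the normalized path unchanged. The SIMD kernels in this patch inline the same computation per lane; the int16/int32 naming reflects whether the masks and the cutoff comparison are applied on 16-bit or 32-bit lanes.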
@@ -3656,40 +3656,6 @@ size_t xnn_init_f32_tanh_neon_expm1minus_rr1_p6h5_params( } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -size_t xnn_init_bf16_abs_neon_params( - union xnn_bf16_abs_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 8; i++) { - params->neon.nonsign_mask[i] = UINT16_C(0x7FFF); - } - return sizeof(params->neon); -} -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f16_abs_sse_params( - union xnn_f16_abs_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 8; i++) { - params->sse.nonsign_mask[i] = UINT16_C(0x7FFF); - } - return sizeof(params->sse); -} -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f16_neg_sse_params( - union xnn_f16_neg_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 8; i++) { - params->sse.sign_mask[i] = UINT16_C(0x8000); - } - return sizeof(params->sse); -} -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - #if XNN_ARCH_X86 || XNN_ARCH_X86_64 size_t xnn_init_f32_rnd_sse2_params( union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)]) @@ -6997,101 +6963,6 @@ size_t xnn_init_qs8_mul_minmax_fp32_wasmsimd_params( } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -size_t xnn_init_f16_f32_cvt_scalar_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - params->scalar.sign_mask = UINT32_C(0x80000000); - params->scalar.exp_offset = UINT32_C(0x70000000); - params->scalar.exp_scale = 0x1.0p-112f; - params->scalar.magic_mask = UINT32_C(0x3F000000); - params->scalar.magic_bias = 0.5f; - params->scalar.denorm_cutoff = UINT32_C(0x08000000); - return sizeof(params->scalar); -} - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -size_t xnn_init_f16_f32_cvt_neon_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - params->neon.exp_scale = 0x1.0p-112f; - return sizeof(params->neon); -} -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f16_f32_cvt_sse_int16_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 8; i++) { - params->sse_int16.sign_mask[i] = UINT16_C(0x8000); - params->sse_int16.exp_offset[i] = UINT16_C(0x7000); - } - for (uint32_t i = 0; i < 4; i++) { - params->sse_int16.exp_scale[i] = 0x1.0p-112f; - } - for (uint32_t i = 0; i < 8; i++) { - params->sse_int16.magic_mask[i] = UINT16_C(0x3F00); - } - for (uint32_t i = 0; i < 4; i++) { - params->sse_int16.magic_bias[i] = 0.5f; - } - for (uint32_t i = 0; i < 8; i++) { - params->sse_int16.denorm_cutoff[i] = INT16_C(0x0400); - } - return sizeof(params->sse_int16); -} - -size_t xnn_init_f16_f32_cvt_sse_int32_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 4; i++) { - params->sse_int32.sign_mask[i] = UINT32_C(0x80000000); - params->sse_int32.exp_offset[i] = UINT32_C(0x70000000); - params->sse_int32.exp_scale[i] = 0x1.0p-112f; - params->sse_int32.magic_bias[i] = UINT32_C(0x3F000000); - params->sse_int32.denorm_cutoff[i] = INT32_C(0x04000000); - } - return sizeof(params->sse_int32); -} -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -size_t xnn_init_f16_f32_cvt_wasmsimd_int16_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 4; i++) { - params->wasmsimd_int16.sign_mask[i] = UINT16_C(0x8000); - params->wasmsimd_int16.exp_offset[i] = UINT16_C(0x7000); - } - for (uint32_t i = 0; i < 
2; i++) { - params->wasmsimd_int16.exp_scale[i] = 0x1.0p-112f; - } - for (uint32_t i = 0; i < 4; i++) { - params->wasmsimd_int16.magic_mask[i] = UINT16_C(0x3F00); - } - for (uint32_t i = 0; i < 2; i++) { - params->wasmsimd_int16.magic_bias[i] = 0.5f; - } - for (uint32_t i = 0; i < 4; i++) { - params->wasmsimd_int16.denorm_cutoff[i] = INT16_C(0x0400); - } - return sizeof(params->wasmsimd_int16); -} - -size_t xnn_init_f16_f32_cvt_wasmsimd_int32_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 2; i++) { - params->wasmsimd_int32.sign_mask[i] = UINT32_C(0x80000000); - params->wasmsimd_int32.exp_offset[i] = UINT32_C(0x70000000); - params->wasmsimd_int32.exp_scale[i] = 0x1.0p-112f; - params->wasmsimd_int32.magic_bias[i] = UINT32_C(0x3F000000); - params->wasmsimd_int32.denorm_cutoff[i] = INT32_C(0x04000000); - } - return sizeof(params->wasmsimd_int32); -} -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - size_t xnn_init_f32_f16_cvt_scalar_bitcast_params( union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)]) { diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c index c832df38c2ba..98dd756fc97c 100644 --- a/src/operators/unary-elementwise-nc.c +++ b/src/operators/unary-elementwise-nc.c @@ -257,9 +257,9 @@ enum xnn_status xnn_create_abs_nc_f16( { const struct xnn_unary_elementwise_config* f16_abs_config = xnn_init_f16_abs_config(); - union xnn_f16_abs_params params; - if XNN_LIKELY(f16_abs_config != NULL && f16_abs_config->init.f16_abs != NULL) { - f16_abs_config->init.f16_abs(&params); + union xnn_f16_default_params params; + if XNN_LIKELY(f16_abs_config != NULL && f16_abs_config->init.f16_default != NULL) { + f16_abs_config->init.f16_default(&params); } return create_unary_elementwise_nc( @@ -496,14 +496,9 @@ enum xnn_status xnn_create_convert_nc_f16_f32( { const struct xnn_unary_elementwise_config* f16_to_f32_cvt_config = xnn_init_f16_to_f32_cvt_config(); - union xnn_f16_f32_cvt_params params; - if (f16_to_f32_cvt_config != NULL && f16_to_f32_cvt_config->init.f16_f32_cvt != NULL) { - f16_to_f32_cvt_config->init.f16_f32_cvt(&params); - } - return create_unary_elementwise_nc( flags, f16_to_f32_cvt_config, /*rminmax_config=*/NULL, - &params, sizeof(params), + /*params=*/NULL, /*params_size=*/0, xnn_operator_type_convert_nc_f16_f32, convert_op_out); } @@ -1280,9 +1275,9 @@ enum xnn_status xnn_create_negate_nc_f16( { const struct xnn_unary_elementwise_config* f16_neg_config = xnn_init_f16_neg_config(); - union xnn_f16_neg_params params; - if XNN_LIKELY(f16_neg_config != NULL && f16_neg_config->init.f16_neg != NULL) { - f16_neg_config->init.f16_neg(&params); + union xnn_f16_default_params params; + if XNN_LIKELY(f16_neg_config != NULL && f16_neg_config->init.f16_default != NULL) { + f16_neg_config->init.f16_default(&params); } return create_unary_elementwise_nc( @@ -1499,7 +1494,7 @@ enum xnn_status xnn_reshape_abs_nc_f16( output_stride, /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &abs_op->params.f16_abs, sizeof(abs_op->params.f16_abs), + &abs_op->params.f16_default, sizeof(abs_op->params.f16_default), threadpool); } @@ -1689,7 +1684,7 @@ enum xnn_status xnn_reshape_convert_nc_f16_f32( channels, input_stride, output_stride, /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &convert_op->params.f16_f32_cvt, sizeof(convert_op->params.f16_f32_cvt), + /*params=*/NULL, /*params_size=*/0, threadpool); } @@ -2327,7 +2322,7 @@ enum xnn_status 
xnn_reshape_negate_nc_f16( channels, input_stride, output_stride, /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, /*log2_output_size=*/XNN_LOG2_SIZEOF_HALF, - &negate_op->params.f16_neg, sizeof(negate_op->params.f16_neg), + &negate_op->params.f16_default, sizeof(negate_op->params.f16_default), threadpool); } @@ -3373,16 +3368,11 @@ enum xnn_status xnn_run_convert_nc_f16_f32( { const struct xnn_unary_elementwise_config* f16_to_f32_cvt_config = xnn_init_f16_to_f32_cvt_config(); - union xnn_f16_f32_cvt_params params; - if XNN_LIKELY(f16_to_f32_cvt_config != NULL && f16_to_f32_cvt_config->init.f16_f32_cvt != NULL) { - f16_to_f32_cvt_config->init.f16_f32_cvt(&params); - } - return run_unary_elementwise_nc( xnn_operator_type_convert_nc_f16_f32, channels, input_stride, output_stride, batch_size, input, output, - f16_to_f32_cvt_config, &params, sizeof(params), + f16_to_f32_cvt_config, /*params=*/NULL, /*params_size=*/0, /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, flags, diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h index 5f14ba388bca..465bb8c9e77d 100644 --- a/src/xnnpack/compute.h +++ b/src/xnnpack/compute.h @@ -1351,13 +1351,11 @@ struct univector_strided_context { size_t y_stride; xnn_vunary_ukernel_fn ukernel; union { - union xnn_f16_abs_params f16_abs; union xnn_f16_default_params f16_default; - union xnn_f16_f32_cvt_params f16_f32_cvt; union xnn_f16_hswish_params f16_hswish; union xnn_f16_lrelu_params f16_lrelu; union xnn_f16_minmax_params f16_minmax; - union xnn_f16_neg_params f16_neg; + union xnn_f16_default_params f16_neg; union xnn_f16_sigmoid_params f16_sigmoid; union xnn_f16_tanh_params f16_tanh; union xnn_f32_default_params f32_default; @@ -1400,13 +1398,10 @@ struct univector_contiguous_context { uint16_t log2_ysize; xnn_vunary_ukernel_fn ukernel; union { - union xnn_f16_abs_params f16_abs; union xnn_f16_default_params f16_default; - union xnn_f16_f32_cvt_params f16_f32_cvt; union xnn_f16_hswish_params f16_hswish; union xnn_f16_lrelu_params f16_lrelu; union xnn_f16_minmax_params f16_minmax; - union xnn_f16_neg_params f16_neg; union xnn_f16_sigmoid_params f16_sigmoid; union xnn_f32_default_params f32_default; union xnn_f32_elu_params f32_elu; diff --git a/src/xnnpack/config-types.h b/src/xnnpack/config-types.h index ffac8574c0c0..5f3679649635 100644 --- a/src/xnnpack/config-types.h +++ b/src/xnnpack/config-types.h @@ -87,13 +87,11 @@ struct xnn_binary_elementwise_config { struct xnn_unary_elementwise_config { xnn_vunary_ukernel_fn ukernel; union { - xnn_init_f16_f32_cvt_params_fn f16_f32_cvt; xnn_init_f16_qs8_cvt_params_fn f16_qs8_cvt; - xnn_init_f16_abs_params_fn f16_abs; + xnn_init_f16_default_params_fn f16_default; xnn_init_f16_elu_params_fn f16_elu; xnn_init_f16_hswish_params_fn f16_hswish; xnn_init_f16_lrelu_params_fn f16_lrelu; - xnn_init_f16_neg_params_fn f16_neg; xnn_init_f16_minmax_params_fn f16_minmax; xnn_init_f16_rsqrt_params_fn f16_rsqrt; xnn_init_f16_sigmoid_params_fn f16_sigmoid; diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 13057af3d473..9e6190c80c7f 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -1764,13 +1764,13 @@ typedef void (*xnn_bf16_vabs_ukernel_fn)( size_t batch, const void* input, void* output, - const union xnn_bf16_abs_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + const union xnn_bf16_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); typedef void (*xnn_f16_vabs_ukernel_fn)( size_t batch, const void* input, void* output, - const union xnn_f16_abs_params 
params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + const union xnn_f16_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); typedef void (*xnn_f32_vabs_ukernel_fn)( size_t batch, @@ -1830,7 +1830,7 @@ typedef void (*xnn_f16_f32_vcvt_ukernel_fn)( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + const void* params); typedef void (*xnn_f16_qs8_vcvt_ukernel_fn)( size_t batch, @@ -1972,7 +1972,7 @@ typedef void (*xnn_f16_vneg_ukernel_fn)( size_t batch, const void* input, void* output, - const union xnn_f16_neg_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + const union xnn_f16_default_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); typedef void (*xnn_f32_vneg_ukernel_fn)( size_t batch, @@ -2518,9 +2518,6 @@ typedef void (*xnn_f32_vscaleextexp_ukernel_fn)( /***************** Microkernel parameter initializer pointers ****************/ -typedef size_t (*xnn_init_f16_f32_cvt_params_fn)( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]); - typedef size_t (*xnn_init_f16_qs8_cvt_params_fn)( union xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], uint16_t scale, @@ -2670,11 +2667,8 @@ typedef size_t (*xnn_init_qu8_mul_minmax_params_fn)( uint8_t output_min, uint8_t output_max); -typedef size_t (*xnn_init_bf16_abs_params_fn)( - union xnn_bf16_abs_params params[XNN_MIN_ELEMENTS(1)]); - -typedef size_t (*xnn_init_f16_abs_params_fn)( - union xnn_f16_abs_params params[XNN_MIN_ELEMENTS(1)]); +typedef size_t (*xnn_init_bf16_default_params_fn)( + union xnn_bf16_default_params params[XNN_MIN_ELEMENTS(1)]); typedef size_t (*xnn_init_f16_default_params_fn)( union xnn_f16_default_params params[XNN_MIN_ELEMENTS(1)]); @@ -2804,9 +2798,6 @@ typedef size_t (*xnn_init_u8_minmax_params_fn)( uint8_t min, uint8_t max); -typedef size_t (*xnn_init_f16_neg_params_fn)( - union xnn_f16_neg_params params[XNN_MIN_ELEMENTS(1)]); - typedef size_t (*xnn_init_f16_rnd_params_fn)( union xnn_f16_rnd_params params[XNN_MIN_ELEMENTS(1)]); diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index c9c48af45639..649c09a56ed9 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -648,32 +648,22 @@ DECLARE_INIT_F32_TANH_PARAMS_FUNCTION(xnn_init_f32_tanh_scalar_expm1minus_rr1_p6 DECLARE_INIT_F32_TANH_PARAMS_FUNCTION(xnn_init_f32_tanh_neon_expm1minus_rr1_p6h5_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#define DECLARE_INIT_BF16_ABS_PARAMS_FUNCTION(fn_name) \ +#define DECLARE_INIT_BF16_DEFAULT_PARAMS_FUNCTION(fn_name) \ XNN_INTERNAL size_t fn_name( \ - union xnn_bf16_abs_params params[XNN_MIN_ELEMENTS(1)]); + union xnn_bf16_default_params params[XNN_MIN_ELEMENTS(1)]); #if XNN_ARCH_ARM || XNN_ARCH_ARM64 - DECLARE_INIT_BF16_ABS_PARAMS_FUNCTION(xnn_init_bf16_abs_neon_params) + DECLARE_INIT_BF16_DEFAULT_PARAMS_FUNCTION(xnn_init_bf16_abs_neon_params) #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 #define DECLARE_INIT_F16_ABS_PARAMS_FUNCTION(fn_name) \ XNN_INTERNAL size_t fn_name( \ - union xnn_f16_abs_params params[XNN_MIN_ELEMENTS(1)]); + union xnn_f16_default_params params[XNN_MIN_ELEMENTS(1)]); #if XNN_ARCH_X86 || XNN_ARCH_X86_64 DECLARE_INIT_F16_ABS_PARAMS_FUNCTION(xnn_init_f16_abs_sse_params) #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#define DECLARE_INIT_F16_NEG_PARAMS_FUNCTION(fn_name) \ - XNN_INTERNAL size_t fn_name( \ - union xnn_f16_neg_params params[XNN_MIN_ELEMENTS(1)]); - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_F16_NEG_PARAMS_FUNCTION(xnn_init_f16_neg_sse_params) -#endif // 
XNN_ARCH_X86 || XNN_ARCH_X86_64 - - #define DECLARE_INIT_F32_RND_PARAMS_FUNCTION(fn_name) \ XNN_INTERNAL size_t fn_name( \ union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)]); @@ -1086,24 +1076,6 @@ DECLARE_INIT_QU8_MUL_MINMAX_PARAMS_FUNCTION(xnn_init_qu8_mul_minmax_fp32_scalar_ #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -#define DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(fn_name) \ - XNN_INTERNAL size_t fn_name( \ - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]); - -DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_scalar_params) -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_neon_params) -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_sse_int16_params) - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_sse_int32_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_wasmsimd_int16_params) - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_wasmsimd_int32_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - #define DECLARE_INIT_F32_F16_CVT_PARAMS_FUNCTION(fn_name) \ XNN_INTERNAL size_t fn_name( \ union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)]); diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index 8ab86e02fe73..c144ddf78c4b 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -17,6 +17,10 @@ union xnn_f16_default_params { char _; // Dummy member variable to comply with the C standard }; +union xnn_bf16_default_params { + char _; // Dummy member variable to comply with the C standard +}; + union xnn_f32_default_params { char _; // Dummy member variable to comply with the C standard #if XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1263,79 +1267,8 @@ union xnn_qu8_avgpool_minmax_params { }; -// Abs: used by VABS microkernels. - -union xnn_bf16_abs_params { - char _; // Dummy member variable to comply with the C standard -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - struct { - XNN_ALIGN(16) uint16_t nonsign_mask[8]; - } neon; -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -}; - -union xnn_f16_abs_params { - char _; // Dummy member variable to comply with the C standard -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - struct { - XNN_ALIGN(16) uint16_t nonsign_mask[8]; - } sse; -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -}; - - // Cvt (Convert): used by VCVT microkernels. 
-union xnn_f16_f32_cvt_params { - struct { - uint32_t sign_mask; - uint32_t exp_offset; - float exp_scale; - uint32_t magic_mask; - float magic_bias; - uint32_t denorm_cutoff; - } scalar; -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - struct { - float exp_scale; - } neon; -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - struct { - XNN_ALIGN(16) uint16_t sign_mask[8]; - XNN_ALIGN(16) uint16_t exp_offset[8]; - XNN_ALIGN(16) float exp_scale[4]; - XNN_ALIGN(16) uint16_t magic_mask[8]; - XNN_ALIGN(16) float magic_bias[4]; - XNN_ALIGN(16) int16_t denorm_cutoff[8]; - } sse_int16; - struct { - XNN_ALIGN(16) uint32_t sign_mask[4]; - XNN_ALIGN(16) uint32_t exp_offset[4]; - XNN_ALIGN(16) float exp_scale[4]; - XNN_ALIGN(16) uint32_t magic_bias[4]; - XNN_ALIGN(16) int32_t denorm_cutoff[4]; - } sse_int32; -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - struct { - XNN_ALIGN(8) uint16_t sign_mask[4]; - XNN_ALIGN(8) uint16_t exp_offset[4]; - XNN_ALIGN(8) float exp_scale[2]; - XNN_ALIGN(8) uint16_t magic_mask[4]; - XNN_ALIGN(8) float magic_bias[2]; - XNN_ALIGN(8) int16_t denorm_cutoff[4]; - } wasmsimd_int16; - struct { - XNN_ALIGN(8) uint32_t sign_mask[2]; - XNN_ALIGN(8) uint32_t exp_offset[2]; - XNN_ALIGN(8) float exp_scale[2]; - XNN_ALIGN(8) uint32_t magic_bias[2]; - XNN_ALIGN(8) int32_t denorm_cutoff[2]; - } wasmsimd_int32; -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -}; - union xnn_f32_f16_cvt_params { struct { uint32_t nonsign_mask; @@ -2610,19 +2543,6 @@ union xnn_qu8_lrelu_params { #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD }; - -// Neg: used by VNEG microkernels. - -union xnn_f16_neg_params { - char _; // Dummy member variable to comply with the C standard -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - struct { - XNN_ALIGN(16) uint16_t sign_mask[8]; - } sse; -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -}; - - // Rnd (Round): used by VRNDNE/VRNDU/VRNDD/VRNDZ microkernels. 
union xnn_f16_rnd_params { diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index f2041f12bd5f..f4cf5923432c 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -225,13 +225,10 @@ struct xnn_operator { uint32_t flags; union { - union xnn_f16_abs_params f16_abs; union xnn_f16_default_params f16_default; - union xnn_f16_f32_cvt_params f16_f32_cvt; union xnn_f16_hswish_params f16_hswish; union xnn_f16_elu_params f16_elu; union xnn_f16_lrelu_params f16_lrelu; - union xnn_f16_neg_params f16_neg; union xnn_f16_sigmoid_params f16_sigmoid; union xnn_f16_tanh_params f16_tanh; union xnn_f32_default_params f32_default; diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h index 4e66f1cda6dd..f526983c3eae 100644 --- a/src/xnnpack/vcvt.h +++ b/src/xnnpack/vcvt.h @@ -21,7 +21,7 @@ extern "C" { size_t n, \ const void* input, \ float* output, \ - const union xnn_f16_f32_cvt_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + const void* params); DECLARE_F16_F32_VCVT_UKERNEL_FUNCTION(xnn_f16_f32_vcvt_ukernel__neon_int16_u8) DECLARE_F16_F32_VCVT_UKERNEL_FUNCTION(xnn_f16_f32_vcvt_ukernel__neon_int16_u16) diff --git a/src/xnnpack/vunary.h b/src/xnnpack/vunary.h index c778ad76e3db..b328e4f7b716 100644 --- a/src/xnnpack/vunary.h +++ b/src/xnnpack/vunary.h @@ -38,13 +38,13 @@ extern "C" { const union params_union params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); -#define DECLARE_BF16_VABS_UKERNEL_FUNCTION(fn_name) DECLARE_BF16_UKERNEL_FUNCTION(fn_name, xnn_bf16_abs_params); +#define DECLARE_BF16_VABS_UKERNEL_FUNCTION(fn_name) DECLARE_BF16_UKERNEL_FUNCTION(fn_name, xnn_bf16_default_params); DECLARE_BF16_VABS_UKERNEL_FUNCTION(xnn_bf16_vabs_ukernel__neonbf16_u8) DECLARE_BF16_VABS_UKERNEL_FUNCTION(xnn_bf16_vabs_ukernel__neonbf16_u16) DECLARE_BF16_VABS_UKERNEL_FUNCTION(xnn_bf16_vabs_ukernel__neonbf16_u24) -#define DECLARE_F16_VABS_UKERNEL_FUNCTION(fn_name) DECLARE_F16_UKERNEL_FUNCTION(fn_name, xnn_f16_abs_params); +#define DECLARE_F16_VABS_UKERNEL_FUNCTION(fn_name) DECLARE_F16_UKERNEL_FUNCTION(fn_name, xnn_f16_default_params); DECLARE_F16_VABS_UKERNEL_FUNCTION(xnn_f16_vabs_ukernel__neonfp16arith_u8) DECLARE_F16_VABS_UKERNEL_FUNCTION(xnn_f16_vabs_ukernel__neonfp16arith_u16) @@ -403,7 +403,7 @@ DECLARE_F16_VHSWISH_UKERNEL_FUNCTION(xnn_f16_vhswish_ukernel__f16c_u8) DECLARE_F16_VHSWISH_UKERNEL_FUNCTION(xnn_f16_vhswish_ukernel__f16c_u16) -#define DECLARE_F16_VNEG_UKERNEL_FUNCTION(fn_name) DECLARE_F16_UKERNEL_FUNCTION(fn_name, xnn_f16_neg_params); +#define DECLARE_F16_VNEG_UKERNEL_FUNCTION(fn_name) DECLARE_F16_UKERNEL_FUNCTION(fn_name, xnn_f16_default_params); DECLARE_F16_VNEG_UKERNEL_FUNCTION(xnn_f16_vneg_ukernel__neonfp16arith_u8) DECLARE_F16_VNEG_UKERNEL_FUNCTION(xnn_f16_vneg_ukernel__neonfp16arith_u16) diff --git a/test/bf16-vabs.cc b/test/bf16-vabs.cc index db2007dbc42f..7b872aad406a 100644 --- a/test/bf16-vabs.cc +++ b/test/bf16-vabs.cc @@ -30,7 +30,7 @@ TEST_REQUIRES_ARM_NEON_BF16; VUnaryMicrokernelTester() .batch_size(8) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8); } TEST(BF16_VABS__NEONBF16_U8, batch_div_8) { @@ -39,7 +39,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8); } } @@ -49,7 +49,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() 
.batch_size(batch_size) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8); } } @@ -59,7 +59,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8); } } @@ -70,7 +70,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u8); } } #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -81,7 +81,7 @@ TEST_REQUIRES_ARM_NEON_BF16; VUnaryMicrokernelTester() .batch_size(16) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16); } TEST(BF16_VABS__NEONBF16_U16, batch_div_16) { @@ -90,7 +90,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16); } } @@ -100,7 +100,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16); } } @@ -110,7 +110,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16); } } @@ -121,7 +121,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u16); } } #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) @@ -132,7 +132,7 @@ TEST_REQUIRES_ARM_NEON_BF16; VUnaryMicrokernelTester() .batch_size(24) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24); } TEST(BF16_VABS__NEONBF16_U24, batch_div_24) { @@ -141,7 +141,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24); } } @@ -151,7 +151,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24); } } @@ -161,7 +161,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24); } } @@ -172,7 +172,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24, xnn_init_bf16_abs_neon_params); + .TestAbs(xnn_bf16_vabs_ukernel__neonbf16_u24); } } #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) diff 
--git a/test/bf16-vabs.yaml b/test/bf16-vabs.yaml index dd6a60b44474..b9a20748f553 100644 --- a/test/bf16-vabs.yaml +++ b/test/bf16-vabs.yaml @@ -5,8 +5,5 @@ # ARM NEON+FP16ARITH - name: xnn_bf16_vabs_ukernel__neonbf16_u8 - init: xnn_init_bf16_abs_neon_params - name: xnn_bf16_vabs_ukernel__neonbf16_u16 - init: xnn_init_bf16_abs_neon_params - name: xnn_bf16_vabs_ukernel__neonbf16_u24 - init: xnn_init_bf16_abs_neon_params diff --git a/test/f16-f32-vcvt.cc b/test/f16-f32-vcvt.cc index 4935c9a82d64..94d9826ccd00 100644 --- a/test/f16-f32-vcvt.cc +++ b/test/f16-f32-vcvt.cc @@ -22,7 +22,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8); } TEST(F16_F32_VCVT__NEON_INT16_U8, batch_div_8) { @@ -30,7 +30,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8); } } @@ -39,7 +39,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8); } } @@ -48,7 +48,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -59,7 +59,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16); } TEST(F16_F32_VCVT__NEON_INT16_U16, batch_div_16) { @@ -67,7 +67,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16); } } @@ -76,7 +76,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16); } } @@ -85,7 +85,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -96,7 +96,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24); } TEST(F16_F32_VCVT__NEON_INT16_U24, batch_div_24) { @@ -104,7 +104,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24); } } @@ -113,7 +113,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24, 
xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24); } } @@ -122,7 +122,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -133,7 +133,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32); } TEST(F16_F32_VCVT__NEON_INT16_U32, batch_div_32) { @@ -141,7 +141,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32); } } @@ -150,7 +150,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32); } } @@ -159,7 +159,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -170,7 +170,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8); } TEST(F16_F32_VCVT__NEON_INT32_U8, batch_div_8) { @@ -178,7 +178,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8); } } @@ -187,7 +187,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8); } } @@ -196,7 +196,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -207,7 +207,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16); } TEST(F16_F32_VCVT__NEON_INT32_U16, batch_div_16) { @@ -215,7 +215,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16); } } @@ -224,7 +224,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16); } } @@ -233,7 +233,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { 
VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -244,7 +244,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24); } TEST(F16_F32_VCVT__NEON_INT32_U24, batch_div_24) { @@ -252,7 +252,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24); } } @@ -261,7 +261,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24); } } @@ -270,7 +270,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -281,7 +281,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32); } TEST(F16_F32_VCVT__NEON_INT32_U32, batch_div_32) { @@ -289,7 +289,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32); } } @@ -298,7 +298,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32); } } @@ -307,7 +307,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -392,7 +392,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8); } TEST(F16_F32_VCVT__SSE2_INT16_U8, batch_div_8) { @@ -400,7 +400,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8); } } @@ -409,7 +409,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8); } } @@ -418,7 +418,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + 
.Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -429,7 +429,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16); } TEST(F16_F32_VCVT__SSE2_INT16_U16, batch_div_16) { @@ -437,7 +437,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16); } } @@ -446,7 +446,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16); } } @@ -455,7 +455,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -466,7 +466,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24); } TEST(F16_F32_VCVT__SSE2_INT16_U24, batch_div_24) { @@ -474,7 +474,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24); } } @@ -483,7 +483,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24); } } @@ -492,7 +492,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -503,7 +503,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32); } TEST(F16_F32_VCVT__SSE2_INT16_U32, batch_div_32) { @@ -511,7 +511,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32); } } @@ -520,7 +520,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32); } } @@ -529,7 +529,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -540,7 
+540,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8); } TEST(F16_F32_VCVT__SSE2_INT32_U8, batch_div_8) { @@ -548,7 +548,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8); } } @@ -557,7 +557,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8); } } @@ -566,7 +566,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -577,7 +577,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16); } TEST(F16_F32_VCVT__SSE2_INT32_U16, batch_div_16) { @@ -585,7 +585,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16); } } @@ -594,7 +594,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16); } } @@ -603,7 +603,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -614,7 +614,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24); } TEST(F16_F32_VCVT__SSE2_INT32_U24, batch_div_24) { @@ -622,7 +622,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24); } } @@ -631,7 +631,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24); } } @@ -640,7 +640,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -651,7 +651,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(32) - 
.Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32); } TEST(F16_F32_VCVT__SSE2_INT32_U32, batch_div_32) { @@ -659,7 +659,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32); } } @@ -668,7 +668,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32); } } @@ -677,7 +677,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -688,7 +688,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8); } TEST(F16_F32_VCVT__SSE41_INT16_U8, batch_div_8) { @@ -696,7 +696,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8); } } @@ -705,7 +705,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8); } } @@ -714,7 +714,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -725,7 +725,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16); } TEST(F16_F32_VCVT__SSE41_INT16_U16, batch_div_16) { @@ -733,7 +733,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16); } } @@ -742,7 +742,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16); } } @@ -751,7 +751,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -762,7 +762,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + 
.Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24); } TEST(F16_F32_VCVT__SSE41_INT16_U24, batch_div_24) { @@ -770,7 +770,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24); } } @@ -779,7 +779,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24); } } @@ -788,7 +788,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -799,7 +799,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32); } TEST(F16_F32_VCVT__SSE41_INT16_U32, batch_div_32) { @@ -807,7 +807,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32); } } @@ -816,7 +816,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32); } } @@ -825,7 +825,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -836,7 +836,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8); } TEST(F16_F32_VCVT__SSE41_INT32_U8, batch_div_8) { @@ -844,7 +844,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8); } } @@ -853,7 +853,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8); } } @@ -862,7 +862,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -873,7 +873,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16); } TEST(F16_F32_VCVT__SSE41_INT32_U16, 
batch_div_16) { @@ -881,7 +881,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16); } } @@ -890,7 +890,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16); } } @@ -899,7 +899,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -910,7 +910,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24); } TEST(F16_F32_VCVT__SSE41_INT32_U24, batch_div_24) { @@ -918,7 +918,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24); } } @@ -927,7 +927,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24); } } @@ -936,7 +936,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -947,7 +947,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32); } TEST(F16_F32_VCVT__SSE41_INT32_U32, batch_div_32) { @@ -955,7 +955,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32); } } @@ -964,7 +964,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32); } } @@ -973,7 +973,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -984,7 +984,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8); } TEST(F16_F32_VCVT__AVX_INT16_U8, batch_div_8) { @@ -992,7 +992,7 @@ for (size_t batch_size = 16; batch_size < 80; 
batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8); } } @@ -1001,7 +1001,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8); } } @@ -1010,7 +1010,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1021,7 +1021,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16); } TEST(F16_F32_VCVT__AVX_INT16_U16, batch_div_16) { @@ -1029,7 +1029,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16); } } @@ -1038,7 +1038,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16); } } @@ -1047,7 +1047,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1058,7 +1058,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24); } TEST(F16_F32_VCVT__AVX_INT16_U24, batch_div_24) { @@ -1066,7 +1066,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24); } } @@ -1075,7 +1075,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24); } } @@ -1084,7 +1084,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1095,7 +1095,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32); } TEST(F16_F32_VCVT__AVX_INT16_U32, batch_div_32) { @@ -1103,7 +1103,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32, 
xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32); } } @@ -1112,7 +1112,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32); } } @@ -1121,7 +1121,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1132,7 +1132,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8); } TEST(F16_F32_VCVT__AVX_INT32_U8, batch_div_8) { @@ -1140,7 +1140,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8); } } @@ -1149,7 +1149,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8); } } @@ -1158,7 +1158,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1169,7 +1169,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16); } TEST(F16_F32_VCVT__AVX_INT32_U16, batch_div_16) { @@ -1177,7 +1177,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16); } } @@ -1186,7 +1186,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16); } } @@ -1195,7 +1195,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1206,7 +1206,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24); } TEST(F16_F32_VCVT__AVX_INT32_U24, batch_div_24) { @@ -1214,7 +1214,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24); } } @@ -1223,7 +1223,7 @@ for (size_t 
batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24); } } @@ -1232,7 +1232,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1243,7 +1243,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32); } TEST(F16_F32_VCVT__AVX_INT32_U32, batch_div_32) { @@ -1251,7 +1251,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32); } } @@ -1260,7 +1260,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32); } } @@ -1269,7 +1269,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1427,14 +1427,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT16_U8, batch_eq_8) { VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8); } TEST(F16_F32_VCVT__WASMSIMD_INT16_U8, batch_div_8) { for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8); } } @@ -1442,7 +1442,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8); } } @@ -1450,7 +1450,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1460,14 +1460,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT16_U16, batch_eq_16) { VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16); } TEST(F16_F32_VCVT__WASMSIMD_INT16_U16, batch_div_16) { for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16); } } @@ -1475,7 +1475,7 @@ for 
(size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16); } } @@ -1483,7 +1483,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1493,14 +1493,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT16_U24, batch_eq_24) { VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24); } TEST(F16_F32_VCVT__WASMSIMD_INT16_U24, batch_div_24) { for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24); } } @@ -1508,7 +1508,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24); } } @@ -1516,7 +1516,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1526,14 +1526,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT16_U32, batch_eq_32) { VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32); } TEST(F16_F32_VCVT__WASMSIMD_INT16_U32, batch_div_32) { for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32); } } @@ -1541,7 +1541,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32); } } @@ -1549,7 +1549,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1559,14 +1559,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT32_U8, batch_eq_8) { VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8); } TEST(F16_F32_VCVT__WASMSIMD_INT32_U8, batch_div_8) { for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - 
.Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8); } } @@ -1574,7 +1574,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8); } } @@ -1582,7 +1582,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1592,14 +1592,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT32_U16, batch_eq_16) { VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16); } TEST(F16_F32_VCVT__WASMSIMD_INT32_U16, batch_div_16) { for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16); } } @@ -1607,7 +1607,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16); } } @@ -1615,7 +1615,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1625,14 +1625,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT32_U24, batch_eq_24) { VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24); } TEST(F16_F32_VCVT__WASMSIMD_INT32_U24, batch_div_24) { for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24); } } @@ -1640,7 +1640,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24); } } @@ -1648,7 +1648,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1658,14 +1658,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT32_U32, batch_eq_32) { VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32); } 
TEST(F16_F32_VCVT__WASMSIMD_INT32_U32, batch_div_32) { for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32); } } @@ -1673,7 +1673,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32); } } @@ -1681,7 +1681,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1691,14 +1691,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U8, batch_eq_8) { VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U8, batch_div_8) { for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8); } } @@ -1706,7 +1706,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8); } } @@ -1714,7 +1714,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1724,14 +1724,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U16, batch_eq_16) { VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U16, batch_div_16) { for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16); } } @@ -1739,7 +1739,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16); } } @@ -1747,7 +1747,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1757,14 +1757,14 @@ 
TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U24, batch_eq_24) { VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U24, batch_div_24) { for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24); } } @@ -1772,7 +1772,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24); } } @@ -1780,7 +1780,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1790,14 +1790,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U32, batch_eq_32) { VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U32, batch_div_32) { for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32); } } @@ -1805,7 +1805,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32); } } @@ -1813,7 +1813,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1823,14 +1823,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U8, batch_eq_8) { VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U8, batch_div_8) { for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8); } } @@ -1838,7 +1838,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8); } } @@ -1846,7 +1846,7 @@ for (size_t batch_size = 9; batch_size < 16; 
batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1856,14 +1856,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U16, batch_eq_16) { VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U16, batch_div_16) { for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16); } } @@ -1871,7 +1871,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16); } } @@ -1879,7 +1879,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1889,14 +1889,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U24, batch_eq_24) { VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U24, batch_div_24) { for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24); } } @@ -1904,7 +1904,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24); } } @@ -1912,7 +1912,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1922,14 +1922,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U32, batch_eq_32) { VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U32, batch_div_32) { for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32); } } @@ -1937,7 +1937,7 @@ for (size_t batch_size = 1; batch_size < 32; 
batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32); } } @@ -1945,7 +1945,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1954,14 +1954,14 @@ TEST(F16_F32_VCVT__SCALAR_U1, batch_eq_1) { VCvtMicrokernelTester() .batch_size(1) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u1, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u1); } TEST(F16_F32_VCVT__SCALAR_U1, batch_gt_1) { for (size_t batch_size = 2; batch_size < 10; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u1, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u1); } } @@ -1969,14 +1969,14 @@ TEST(F16_F32_VCVT__SCALAR_U1, batch_gt_1) { TEST(F16_F32_VCVT__SCALAR_U2, batch_eq_2) { VCvtMicrokernelTester() .batch_size(2) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2); } TEST(F16_F32_VCVT__SCALAR_U2, batch_div_2) { for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2); } } @@ -1984,7 +1984,7 @@ TEST(F16_F32_VCVT__SCALAR_U2, batch_lt_2) { for (size_t batch_size = 1; batch_size < 2; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2); } } @@ -1992,7 +1992,7 @@ TEST(F16_F32_VCVT__SCALAR_U2, batch_gt_2) { for (size_t batch_size = 3; batch_size < 4; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2); } } @@ -2000,14 +2000,14 @@ TEST(F16_F32_VCVT__SCALAR_U2, batch_gt_2) { TEST(F16_F32_VCVT__SCALAR_U3, batch_eq_3) { VCvtMicrokernelTester() .batch_size(3) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3); } TEST(F16_F32_VCVT__SCALAR_U3, batch_div_3) { for (size_t batch_size = 6; batch_size < 30; batch_size += 3) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3); } } @@ -2015,7 +2015,7 @@ TEST(F16_F32_VCVT__SCALAR_U3, batch_lt_3) { for (size_t batch_size = 1; batch_size < 3; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3); } } @@ -2023,7 +2023,7 @@ TEST(F16_F32_VCVT__SCALAR_U3, batch_gt_3) { for (size_t batch_size = 4; batch_size < 6; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3); } } @@ -2031,14 +2031,14 @@ TEST(F16_F32_VCVT__SCALAR_U3, batch_gt_3) { 
TEST(F16_F32_VCVT__SCALAR_U4, batch_eq_4) { VCvtMicrokernelTester() .batch_size(4) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4); } TEST(F16_F32_VCVT__SCALAR_U4, batch_div_4) { for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4); } } @@ -2046,7 +2046,7 @@ TEST(F16_F32_VCVT__SCALAR_U4, batch_lt_4) { for (size_t batch_size = 1; batch_size < 4; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4); } } @@ -2054,6 +2054,6 @@ TEST(F16_F32_VCVT__SCALAR_U4, batch_gt_4) { for (size_t batch_size = 5; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4); } } diff --git a/test/f16-f32-vcvt.yaml b/test/f16-f32-vcvt.yaml index a714e99a4310..4d681fbcef5c 100644 --- a/test/f16-f32-vcvt.yaml +++ b/test/f16-f32-vcvt.yaml @@ -5,73 +5,41 @@ # ARM NEON+FP16ARITH - name: xnn_f16_f32_vcvt_ukernel__neon_int16_u8 - init: xnn_init_f16_f32_cvt_neon_params - name: xnn_f16_f32_vcvt_ukernel__neon_int16_u16 - init: xnn_init_f16_f32_cvt_neon_params - name: xnn_f16_f32_vcvt_ukernel__neon_int16_u24 - init: xnn_init_f16_f32_cvt_neon_params - name: xnn_f16_f32_vcvt_ukernel__neon_int16_u32 - init: xnn_init_f16_f32_cvt_neon_params - name: xnn_f16_f32_vcvt_ukernel__neon_int32_u8 - init: xnn_init_f16_f32_cvt_neon_params - name: xnn_f16_f32_vcvt_ukernel__neon_int32_u16 - init: xnn_init_f16_f32_cvt_neon_params - name: xnn_f16_f32_vcvt_ukernel__neon_int32_u24 - init: xnn_init_f16_f32_cvt_neon_params - name: xnn_f16_f32_vcvt_ukernel__neon_int32_u32 - init: xnn_init_f16_f32_cvt_neon_params - name: xnn_f16_f32_vcvt_ukernel__neonfp16_u8 - name: xnn_f16_f32_vcvt_ukernel__neonfp16_u16 # x86 SSE - name: xnn_f16_f32_vcvt_ukernel__sse2_int16_u8 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__sse2_int16_u16 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__sse2_int16_u24 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__sse2_int16_u32 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__sse2_int32_u8 - init: xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__sse2_int32_u16 - init: xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__sse2_int32_u24 - init: xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__sse2_int32_u32 - init: xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__sse41_int16_u8 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__sse41_int16_u16 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__sse41_int16_u24 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__sse41_int16_u32 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__sse41_int32_u8 - init: xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__sse41_int32_u16 - init: xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__sse41_int32_u24 - init: 
xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__sse41_int32_u32 - init: xnn_init_f16_f32_cvt_sse_int32_params # x86 AVX - name: xnn_f16_f32_vcvt_ukernel__avx_int16_u8 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__avx_int16_u16 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__avx_int16_u24 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__avx_int16_u32 - init: xnn_init_f16_f32_cvt_sse_int16_params - name: xnn_f16_f32_vcvt_ukernel__avx_int32_u8 - init: xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__avx_int32_u16 - init: xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__avx_int32_u24 - init: xnn_init_f16_f32_cvt_sse_int32_params - name: xnn_f16_f32_vcvt_ukernel__avx_int32_u32 - init: xnn_init_f16_f32_cvt_sse_int32_params # x86 F16C - name: xnn_f16_f32_vcvt_ukernel__f16c_u8 - name: xnn_f16_f32_vcvt_ukernel__f16c_u16 @@ -80,44 +48,24 @@ - name: xnn_f16_f32_vcvt_ukernel__avx512skx_u32 # WAsm SIMD - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8 - init: xnn_init_f16_f32_cvt_wasmsimd_int16_params - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16 - init: xnn_init_f16_f32_cvt_wasmsimd_int16_params - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24 - init: xnn_init_f16_f32_cvt_wasmsimd_int16_params - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32 - init: xnn_init_f16_f32_cvt_wasmsimd_int16_params - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8 - init: xnn_init_f16_f32_cvt_wasmsimd_int32_params - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16 - init: xnn_init_f16_f32_cvt_wasmsimd_int32_params - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24 - init: xnn_init_f16_f32_cvt_wasmsimd_int32_params - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32 - init: xnn_init_f16_f32_cvt_wasmsimd_int32_params # WAsm Relaxed SIMD - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8 - init: xnn_init_f16_f32_cvt_wasmsimd_int16_params - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16 - init: xnn_init_f16_f32_cvt_wasmsimd_int16_params - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24 - init: xnn_init_f16_f32_cvt_wasmsimd_int16_params - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32 - init: xnn_init_f16_f32_cvt_wasmsimd_int16_params - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8 - init: xnn_init_f16_f32_cvt_wasmsimd_int32_params - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16 - init: xnn_init_f16_f32_cvt_wasmsimd_int32_params - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24 - init: xnn_init_f16_f32_cvt_wasmsimd_int32_params - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32 - init: xnn_init_f16_f32_cvt_wasmsimd_int32_params # Scalar - name: xnn_f16_f32_vcvt_ukernel__scalar_u1 - init: xnn_init_f16_f32_cvt_scalar_params - name: xnn_f16_f32_vcvt_ukernel__scalar_u2 - init: xnn_init_f16_f32_cvt_scalar_params - name: xnn_f16_f32_vcvt_ukernel__scalar_u3 - init: xnn_init_f16_f32_cvt_scalar_params - name: xnn_f16_f32_vcvt_ukernel__scalar_u4 - init: xnn_init_f16_f32_cvt_scalar_params diff --git a/test/f16-vabs.cc b/test/f16-vabs.cc index 74203f2b8835..c8bcc99152d6 100644 --- a/test/f16-vabs.cc +++ b/test/f16-vabs.cc @@ -132,7 +132,7 @@ TEST_REQUIRES_X86_SSE2; VUnaryMicrokernelTester() .batch_size(8) - .TestAbs(xnn_f16_vabs_ukernel__sse2_u8, xnn_init_f16_abs_sse_params); + .TestAbs(xnn_f16_vabs_ukernel__sse2_u8); } TEST(F16_VABS__SSE2_U8, batch_div_8) { @@ -141,7 
diff --git a/test/f16-vabs.yaml b/test/f16-vabs.yaml
index 166c1edefe81..d5ac096fccdf 100644
--- a/test/f16-vabs.yaml
+++ b/test/f16-vabs.yaml
@@ -9,6 +9,4 @@
 # x86 SSE2
 - name: xnn_f16_vabs_ukernel__sse2_u8
-  init: xnn_init_f16_abs_sse_params
 - name: xnn_f16_vabs_ukernel__sse2_u16
-  init: xnn_init_f16_abs_sse_params
diff --git a/test/f16-vneg.cc b/test/f16-vneg.cc
index 14c7d4428985..262288cb24d2 100644
--- a/test/f16-vneg.cc
+++ b/test/f16-vneg.cc
@@ -132,7 +132,7 @@
   TEST_REQUIRES_X86_SSE2;
   VUnaryMicrokernelTester()
     .batch_size(8)
-    .TestNeg(xnn_f16_vneg_ukernel__sse2_u8, xnn_init_f16_neg_sse_params);
+    .TestNeg(xnn_f16_vneg_ukernel__sse2_u8);
 }

 TEST(F16_VNEG__SSE2_U8, batch_div_8) {
@@ -141,7 +141,7 @@
   for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) {
     VUnaryMicrokernelTester()
       .batch_size(batch_size)
-      .TestNeg(xnn_f16_vneg_ukernel__sse2_u8, xnn_init_f16_neg_sse_params);
+      .TestNeg(xnn_f16_vneg_ukernel__sse2_u8);
   }
 }

@@ -151,7 +151,7 @@
   for (size_t batch_size = 1; batch_size < batch_step; batch_size++) {
     VUnaryMicrokernelTester()
       .batch_size(batch_size)
-      .TestNeg(xnn_f16_vneg_ukernel__sse2_u8, xnn_init_f16_neg_sse_params);
+      .TestNeg(xnn_f16_vneg_ukernel__sse2_u8);
   }
 }

@@ -161,7 +161,7 @@
   for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) {
     VUnaryMicrokernelTester()
       .batch_size(batch_size)
-      .TestNeg(xnn_f16_vneg_ukernel__sse2_u8, xnn_init_f16_neg_sse_params);
+      .TestNeg(xnn_f16_vneg_ukernel__sse2_u8);
   }
 }

@@ -172,7 +172,7 @@
     VUnaryMicrokernelTester()
       .batch_size(batch_size)
       .inplace(true)
-      .TestNeg(xnn_f16_vneg_ukernel__sse2_u8, xnn_init_f16_neg_sse_params);
+      .TestNeg(xnn_f16_vneg_ukernel__sse2_u8);
   }
 }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

@@ -183,7 +183,7 @@
   TEST_REQUIRES_X86_SSE2;
   VUnaryMicrokernelTester()
     .batch_size(16)
-    .TestNeg(xnn_f16_vneg_ukernel__sse2_u16, xnn_init_f16_neg_sse_params);
+    .TestNeg(xnn_f16_vneg_ukernel__sse2_u16);
 }

 TEST(F16_VNEG__SSE2_U16, batch_div_16) {
@@ -192,7 +192,7 @@
   for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) {
     VUnaryMicrokernelTester()
       .batch_size(batch_size)
-      .TestNeg(xnn_f16_vneg_ukernel__sse2_u16, xnn_init_f16_neg_sse_params);
+      .TestNeg(xnn_f16_vneg_ukernel__sse2_u16);
   }
 }

@@ -202,7 +202,7 @@
   for (size_t batch_size = 1; batch_size < batch_step; batch_size++) {
     VUnaryMicrokernelTester()
       .batch_size(batch_size)
-      .TestNeg(xnn_f16_vneg_ukernel__sse2_u16, xnn_init_f16_neg_sse_params);
+      .TestNeg(xnn_f16_vneg_ukernel__sse2_u16);
   }
 }

@@ -212,7 +212,7 @@
   for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) {
     VUnaryMicrokernelTester()
       .batch_size(batch_size)
-      .TestNeg(xnn_f16_vneg_ukernel__sse2_u16, xnn_init_f16_neg_sse_params);
+      .TestNeg(xnn_f16_vneg_ukernel__sse2_u16);
   }
 }

@@ -223,7 +223,7 @@
     VUnaryMicrokernelTester()
       .batch_size(batch_size)
       .inplace(true)
-      .TestNeg(xnn_f16_vneg_ukernel__sse2_u16, xnn_init_f16_neg_sse_params);
+      .TestNeg(xnn_f16_vneg_ukernel__sse2_u16);
   }
 }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/test/f16-vneg.yaml b/test/f16-vneg.yaml
index 82b0c3e88535..0dbd68aa1206 100644
--- a/test/f16-vneg.yaml
+++ b/test/f16-vneg.yaml
@@ -9,6 +9,4 @@
 # x86 SSE2
 - name: xnn_f16_vneg_ukernel__sse2_u8
-  init: xnn_init_f16_neg_sse_params
 - name: xnn_f16_vneg_ukernel__sse2_u16
-  init: xnn_init_f16_neg_sse_params
diff --git a/test/vcvt-microkernel-tester.cc b/test/vcvt-microkernel-tester.cc
index 200ba3d1b260..5423249be5f9 100644
--- a/test/vcvt-microkernel-tester.cc
+++ b/test/vcvt-microkernel-tester.cc
@@ -28,8 +28,7 @@
 #include "replicable_random_device.h"

 void VCvtMicrokernelTester::Test(
-    xnn_f16_f32_vcvt_ukernel_fn vcvt,
-    xnn_init_f16_f32_cvt_params_fn init_params) const {
+    xnn_f16_f32_vcvt_ukernel_fn vcvt) const {
   xnnpack::ReplicableRandomDevice rng;
   std::uniform_real_distribution<float> f32dist(-100.0f, 100.0f);

@@ -41,13 +40,8 @@ void VCvtMicrokernelTester::Test(
       [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
   std::fill(output.begin(), output.end(), nanf(""));

-  union xnn_f16_f32_cvt_params params;
-  if (init_params != nullptr) {
-    init_params(&params);
-  }
-
   // Call optimized micro-kernel.
-  vcvt(batch_size() * sizeof(uint16_t), input.data(), output.data(), &params);
+  vcvt(batch_size() * sizeof(uint16_t), input.data(), output.data(), nullptr);

   // Verify results.
   for (size_t i = 0; i < batch_size(); i++) {
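The vunary tester changes that follow keep an optional init_params argument and route it through InitParamsWrapper. The underlying pattern, sketched here with a hypothetical helper (not the library's actual implementation), is to materialize a params struct only when an init function was supplied, and to hand the kernel a null pointer otherwise:

    // Hypothetical sketch of the optional-init pattern: 'Params' stands for a
    // kernel's params union and 'InitFn' for a pointer to its init function.
    template <typename Params, typename InitFn>
    const Params* maybe_init_params(Params& storage, InitFn init_fn) {
      if (init_fn == nullptr) {
        return nullptr;  // the kernel ignores its params argument
      }
      init_fn(&storage);  // fill the caller-provided storage
      return &storage;
    }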
diff --git a/test/vcvt-microkernel-tester.h b/test/vcvt-microkernel-tester.h
index 4d058bcae4da..e6d0562cac07 100644
--- a/test/vcvt-microkernel-tester.h
+++ b/test/vcvt-microkernel-tester.h
@@ -71,8 +71,7 @@ class VCvtMicrokernelTester {

   size_t iterations() const { return this->iterations_; }

-  void Test(xnn_f16_f32_vcvt_ukernel_fn vcvt,
-            xnn_init_f16_f32_cvt_params_fn init_params = nullptr) const;
+  void Test(xnn_f16_f32_vcvt_ukernel_fn vcvt) const;

   void Test(xnn_f32_f16_vcvt_ukernel_fn vcvt,
             xnn_init_f32_f16_cvt_params_fn init_params = nullptr) const;
diff --git a/test/vunary-microkernel-tester.cc b/test/vunary-microkernel-tester.cc
index 11ad6cc0b9ea..e1a75d4695d8 100644
--- a/test/vunary-microkernel-tester.cc
+++ b/test/vunary-microkernel-tester.cc
@@ -40,7 +40,7 @@ void VUnaryMicrokernelTester::Test(xnn_f32_vrelu_ukernel_fn vrelu) const {

 void VUnaryMicrokernelTester::TestAbs(
     xnn_bf16_vabs_ukernel_fn vabs,
-    xnn_init_bf16_abs_params_fn init_params) const {
+    xnn_init_bf16_default_params_fn init_params) const {
   TestBF16(
       vabs, InitParamsWrapper(init_params),
       [](float x) { return std::abs(x); }, TolExact16, -1.0f, 1.0f);
@@ -48,7 +48,7 @@ void VUnaryMicrokernelTester::TestAbs(

 void VUnaryMicrokernelTester::TestAbs(
     xnn_f16_vabs_ukernel_fn vabs,
-    xnn_init_f16_abs_params_fn init_params) const {
+    xnn_init_f16_default_params_fn init_params) const {
   TestFP16(
       vabs, InitParamsWrapper(init_params),
       [](float x) { return std::abs(x); }, TolExact16, -1.0f, 1.0f);
@@ -161,7 +161,7 @@ void VUnaryMicrokernelTester::Test(

 void VUnaryMicrokernelTester::TestNeg(
     xnn_f16_vneg_ukernel_fn vneg,
-    xnn_init_f16_neg_params_fn init_params) const {
+    xnn_init_f16_default_params_fn init_params) const {
   TestFP16(
       vneg, InitParamsWrapper(init_params),
       [](float x) { return -x; }, TolExact16, -1.0f, 1.0f);
diff --git a/test/vunary-microkernel-tester.h b/test/vunary-microkernel-tester.h
index 5f846eba35fb..492f43f57634 100644
--- a/test/vunary-microkernel-tester.h
+++ b/test/vunary-microkernel-tester.h
@@ -150,10 +150,10 @@ class VUnaryMicrokernelTester {
   void Test(xnn_f32_vrelu_ukernel_fn vrelu) const;

   void TestAbs(xnn_bf16_vabs_ukernel_fn vabs,
-               xnn_init_bf16_abs_params_fn init_params = nullptr) const;
+               xnn_init_bf16_default_params_fn init_params = nullptr) const;

   void TestAbs(xnn_f16_vabs_ukernel_fn vabs,
-               xnn_init_f16_abs_params_fn init_params = nullptr) const;
+               xnn_init_f16_default_params_fn init_params = nullptr) const;

   void TestAbs(xnn_f32_vabs_ukernel_fn vabs,
                xnn_init_f32_default_params_fn init_params = nullptr) const;
@@ -189,7 +189,7 @@ class VUnaryMicrokernelTester {
                xnn_init_f32_default_params_fn init_params = nullptr) const;

   void TestNeg(xnn_f16_vneg_ukernel_fn vneg,
-               xnn_init_f16_neg_params_fn init_params = nullptr) const;
+               xnn_init_f16_default_params_fn init_params = nullptr) const;

   void TestNeg(xnn_f32_vneg_ukernel_fn vneg,
                xnn_init_f32_default_params_fn init_params = nullptr) const;
diff --git a/tools/generate-vcvt-test.py b/tools/generate-vcvt-test.py
index 2a13178ce915..b29c652560d6 100755
--- a/tools/generate-vcvt-test.py
+++ b/tools/generate-vcvt-test.py
@@ -51,15 +51,19 @@ def split_ukernel_name(name):
 CVT_BENCHMARK_TEMPLATE = """\
 BENCHMARK_CAPTURE(${BENCHMARK_FN}, ${BENCHMARK_NAME},
-                  ${UKERNEL_NAME},
 $if INIT_FN and ISA_CHECK:
+                  ${UKERNEL_NAME},
                   ${INIT_FN},
                   benchmark::utils::${ISA_CHECK})
 $elif INIT_FN:
+                  ${UKERNEL_NAME},
                   ${INIT_FN})
 $elif ISA_CHECK:
+                  ${UKERNEL_NAME},
                   nullptr /* init params */,
                   benchmark::utils::${ISA_CHECK})
+$else:
+                  ${UKERNEL_NAME})
   ->Apply(benchmark::utils::UnaryElementwiseParameters<${INPUT_CTYPE}, ${OUTPUT_CTYPE}>)
   ->UseRealTime();
 """
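With the new $else branch, a kernel that needs neither an init function nor an ISA check now generates a two-argument BENCHMARK_CAPTURE. Illustrative output for one scalar kernel, assuming the generator's usual naming:

    BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_u4,
                      xnn_f16_f32_vcvt_ukernel__scalar_u4)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
      ->UseRealTime();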
diff --git a/tools/generate-vunary-benchmark.py b/tools/generate-vunary-benchmark.py
index 57036184c026..1d84c2efc08f 100755
--- a/tools/generate-vunary-benchmark.py
+++ b/tools/generate-vunary-benchmark.py
@@ -105,7 +105,7 @@ def split_ukernel_name(name: str) -> tuple[str, str, str]:
       /*range_min=*/${RANGE_MIN},
       /*range_max=*/${RANGE_MAX});
 }
-$elif OP_NAME == "sqr" or (DATATYPE == "f32" and OP_NAME in ("abs", "gelu", "log", "neg")):
+$elif OP_NAME in ("abs", "gelu", "log", "neg", "sqr"):
 void ${DATATYPE}_v${OP_NAME}(benchmark::State& state, xnn_${DATATYPE}_v${OP_NAME}_ukernel_fn ukernel,
              xnn_init_${DATATYPE}_default_params_fn init_params = nullptr,
              benchmark::utils::IsaCheckFunction isa_check = nullptr) {
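The widened $elif means f16 abs/neg benchmarks now use the same default-params wrapper as their f32 counterparts. An illustrative expansion for DATATYPE=f16 and OP_NAME=abs (only the generated signature is grounded in the hunk above; the body is elided there, so it is stubbed with a comment here):

    void f16_vabs(benchmark::State& state, xnn_f16_vabs_ukernel_fn ukernel,
                  xnn_init_f16_default_params_fn init_params = nullptr,
                  benchmark::utils::IsaCheckFunction isa_check = nullptr) {
      // ...dispatches to the shared unary-benchmark runner (not shown in this hunk).
    }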