From 71721ec14b2b7396ecbfe806b3e016b0930af15c Mon Sep 17 00:00:00 2001
From: Dillon Sharlet
Date: Wed, 31 Jul 2024 15:55:56 -0700
Subject: [PATCH] Remove f16_f32_vcvt_params struct

PiperOrigin-RevId: 658171197
---
 bench/f16-f32-vcvt.cc | 132 +++---
 bench/vcvt-benchmark.h | 8 +-
 src/amalgam/gen/avx.c | 21 +-
 src/amalgam/gen/avx512skx.c | 2 +-
 src/amalgam/gen/f16c.c | 2 +-
 src/amalgam/gen/neon.c | 18 +-
 src/amalgam/gen/neonfp16.c | 2 +-
 src/amalgam/gen/scalar.c | 28 +-
 src/amalgam/gen/sse2.c | 21 +-
 src/amalgam/gen/sse41.c | 21 +-
 src/amalgam/gen/wasmrelaxedsimd.c | 21 +-
 src/amalgam/gen/wasmsimd.c | 21 +-
 src/configs/unary-elementwise-config.c | 10 -
 src/f16-f32-vcvt/avx512skx.c.in | 2 +-
 src/f16-f32-vcvt/f16c.c.in | 2 +-
 .../gen/f16-f32-vcvt-avx-int16-u16.c | 21 +-
 .../gen/f16-f32-vcvt-avx-int16-u24.c | 21 +-
 .../gen/f16-f32-vcvt-avx-int16-u32.c | 21 +-
 .../gen/f16-f32-vcvt-avx-int16-u8.c | 21 +-
 .../gen/f16-f32-vcvt-avx-int32-u16.c | 18 +-
 .../gen/f16-f32-vcvt-avx-int32-u24.c | 18 +-
 .../gen/f16-f32-vcvt-avx-int32-u32.c | 18 +-
 .../gen/f16-f32-vcvt-avx-int32-u8.c | 18 +-
 .../gen/f16-f32-vcvt-avx512skx-u16.c | 2 +-
 .../gen/f16-f32-vcvt-avx512skx-u32.c | 2 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c | 2 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c | 2 +-
 .../gen/f16-f32-vcvt-neon-int16-u16.c | 18 +-
 .../gen/f16-f32-vcvt-neon-int16-u24.c | 18 +-
 .../gen/f16-f32-vcvt-neon-int16-u32.c | 18 +-
 .../gen/f16-f32-vcvt-neon-int16-u8.c | 18 +-
 .../gen/f16-f32-vcvt-neon-int32-u16.c | 18 +-
 .../gen/f16-f32-vcvt-neon-int32-u24.c | 18 +-
 .../gen/f16-f32-vcvt-neon-int32-u32.c | 18 +-
 .../gen/f16-f32-vcvt-neon-int32-u8.c | 18 +-
 .../gen/f16-f32-vcvt-neonfp16-u16.c | 2 +-
 .../gen/f16-f32-vcvt-neonfp16-u8.c | 2 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c | 14 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c | 14 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c | 14 +-
 src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c | 14 +-
 .../gen/f16-f32-vcvt-sse2-int16-u16.c | 21 +-
 .../gen/f16-f32-vcvt-sse2-int16-u24.c | 21 +-
 .../gen/f16-f32-vcvt-sse2-int16-u32.c | 21 +-
 .../gen/f16-f32-vcvt-sse2-int16-u8.c | 21 +-
 .../gen/f16-f32-vcvt-sse2-int32-u16.c | 18 +-
 .../gen/f16-f32-vcvt-sse2-int32-u24.c | 18 +-
 .../gen/f16-f32-vcvt-sse2-int32-u32.c | 18 +-
 .../gen/f16-f32-vcvt-sse2-int32-u8.c | 18 +-
 .../gen/f16-f32-vcvt-sse41-int16-u16.c | 21 +-
 .../gen/f16-f32-vcvt-sse41-int16-u24.c | 21 +-
 .../gen/f16-f32-vcvt-sse41-int16-u32.c | 21 +-
 .../gen/f16-f32-vcvt-sse41-int16-u8.c | 21 +-
 .../gen/f16-f32-vcvt-sse41-int32-u16.c | 18 +-
 .../gen/f16-f32-vcvt-sse41-int32-u24.c | 18 +-
 .../gen/f16-f32-vcvt-sse41-int32-u32.c | 18 +-
 .../gen/f16-f32-vcvt-sse41-int32-u8.c | 18 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c | 21 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c | 21 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c | 21 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c | 21 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c | 18 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c | 18 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c | 18 +-
 .../f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c | 18 +-
 .../gen/f16-f32-vcvt-wasmsimd-int16-u16.c | 21 +-
 .../gen/f16-f32-vcvt-wasmsimd-int16-u24.c | 21 +-
 .../gen/f16-f32-vcvt-wasmsimd-int16-u32.c | 21 +-
 .../gen/f16-f32-vcvt-wasmsimd-int16-u8.c | 21 +-
 .../gen/f16-f32-vcvt-wasmsimd-int32-u16.c | 18 +-
 .../gen/f16-f32-vcvt-wasmsimd-int32-u24.c | 18 +-
 .../gen/f16-f32-vcvt-wasmsimd-int32-u32.c | 18 +-
 .../gen/f16-f32-vcvt-wasmsimd-int32-u8.c | 18 +-
 src/f16-f32-vcvt/neon-int16.c.in | 18 +-
 src/f16-f32-vcvt/neon-int32.c.in | 18 +-
 src/f16-f32-vcvt/neonfp16.c.in | 2 +-
 src/f16-f32-vcvt/scalar.c.in | 14 +-
 src/f16-f32-vcvt/sse-int16.c.in | 21 +-
 src/f16-f32-vcvt/sse-int32.c.in | 18 +-
 src/f16-f32-vcvt/wasmsimd-int16.c.in | 21 +-
 src/f16-f32-vcvt/wasmsimd-int32.c.in | 18 +-
 src/microparams-init.c | 95 ----
 src/operators/unary-elementwise-nc.c | 16 +-
 src/xnnpack/common.h | 15 +
 src/xnnpack/compute.h | 2 -
 src/xnnpack/config-types.h | 1 -
 src/xnnpack/microfnptr.h | 5 +-
 src/xnnpack/microparams-init.h | 18 -
 src/xnnpack/microparams.h | 50 ---
 src/xnnpack/operator.h | 1 -
 src/xnnpack/vcvt.h | 2 +-
 test/f16-f32-vcvt.cc | 412 +++++++++---------
 test/f16-f32-vcvt.yaml | 52 ---
 test/vcvt-microkernel-tester.cc | 10 +-
 test/vcvt-microkernel-tester.h | 3 +-
 tools/generate-vcvt-test.py | 6 +-
 96 files changed, 1123 insertions(+), 998 deletions(-)

diff --git a/bench/f16-f32-vcvt.cc b/bench/f16-f32-vcvt.cc
index 8eafa104d351..d4a26bcc8072 100644
--- a/bench/f16-f32-vcvt.cc
+++ b/bench/f16-f32-vcvt.cc
@@ -20,8 +20,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_u8,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
@@ -29,8 +28,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_u16,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
@@ -38,8 +36,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_u24,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
@@ -47,8 +44,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int16_u32,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32,
-                  xnn_init_f16_f32_cvt_wasmsimd_int16_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
@@ -56,8 +52,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_u8,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
@@ -65,8 +60,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_u16,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
@@ -74,8 +68,7 @@
 #if XNN_ARCH_WASMRELAXEDSIMD
 BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_u24,
-                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24,
-                  xnn_init_f16_f32_cvt_wasmsimd_int32_params)
+                  xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24)
   ->Apply(benchmark::utils::UnaryElementwiseParameters)
   ->UseRealTime();
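// With xnn_f16_f32_cvt_params removed, every f16->f32 ukernel keeps the
// generic four-argument signature and simply ignores its trailing `params`
// pointer, which is why the registrations here no longer name an
// init-params function. A minimal direct call (a sketch; `n` is a
// hypothetical element count, the kernel name is from this patch) would be:
//
//   std::vector<uint16_t> x(n);  // IEEE fp16 bit patterns
//   std::vector<float> y(n);
//   xnn_f16_f32_vcvt_ukernel__scalar_u4(
//       n * sizeof(uint16_t),    // batch is counted in input bytes
//       x.data(), y.data(), /*params=*/nullptr);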
#endif // XNN_ARCH_WASMRELAXEDSIMD @@ -83,8 +76,7 @@ #if XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f16_f32_vcvt, wasmrelaxedsimd_int32_u32, - xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, - xnn_init_f16_f32_cvt_wasmsimd_int32_params) + xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -92,8 +84,7 @@ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_u8, - xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, - xnn_init_f16_f32_cvt_wasmsimd_int16_params) + xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -101,8 +92,7 @@ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_u16, - xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16, - xnn_init_f16_f32_cvt_wasmsimd_int16_params) + xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -110,8 +100,7 @@ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_u24, - xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24, - xnn_init_f16_f32_cvt_wasmsimd_int16_params) + xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -119,8 +108,7 @@ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int16_u32, - xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32, - xnn_init_f16_f32_cvt_wasmsimd_int16_params) + xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -128,8 +116,7 @@ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_u8, - xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8, - xnn_init_f16_f32_cvt_wasmsimd_int32_params) + xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -137,8 +124,7 @@ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_u16, - xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16, - xnn_init_f16_f32_cvt_wasmsimd_int32_params) + xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -146,8 +132,7 @@ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_u24, - xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24, - xnn_init_f16_f32_cvt_wasmsimd_int32_params) + xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -155,8 +140,7 @@ #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD BENCHMARK_CAPTURE(f16_f32_vcvt, wasmsimd_int32_u32, - xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32, - xnn_init_f16_f32_cvt_wasmsimd_int32_params) + xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -165,7 +149,7 @@ #if 
XNN_ARCH_ARM || XNN_ARCH_ARM64 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_u8, xnn_f16_f32_vcvt_ukernel__neon_int16_u8, - xnn_init_f16_f32_cvt_neon_params, + nullptr /* init params */, benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -175,7 +159,7 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_u16, xnn_f16_f32_vcvt_ukernel__neon_int16_u16, - xnn_init_f16_f32_cvt_neon_params, + nullptr /* init params */, benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -185,7 +169,7 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_u24, xnn_f16_f32_vcvt_ukernel__neon_int16_u24, - xnn_init_f16_f32_cvt_neon_params, + nullptr /* init params */, benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -195,7 +179,7 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int16_u32, xnn_f16_f32_vcvt_ukernel__neon_int16_u32, - xnn_init_f16_f32_cvt_neon_params, + nullptr /* init params */, benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -205,7 +189,7 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_u8, xnn_f16_f32_vcvt_ukernel__neon_int32_u8, - xnn_init_f16_f32_cvt_neon_params, + nullptr /* init params */, benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -215,7 +199,7 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_u16, xnn_f16_f32_vcvt_ukernel__neon_int32_u16, - xnn_init_f16_f32_cvt_neon_params, + nullptr /* init params */, benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -225,7 +209,7 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_u24, xnn_f16_f32_vcvt_ukernel__neon_int32_u24, - xnn_init_f16_f32_cvt_neon_params, + nullptr /* init params */, benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -235,7 +219,7 @@ #if XNN_ARCH_ARM || XNN_ARCH_ARM64 BENCHMARK_CAPTURE(f16_f32_vcvt, neon_int32_u32, xnn_f16_f32_vcvt_ukernel__neon_int32_u32, - xnn_init_f16_f32_cvt_neon_params, + nullptr /* init params */, benchmark::utils::CheckNEON) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -265,7 +249,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_u8, xnn_f16_f32_vcvt_ukernel__avx_int16_u8, - xnn_init_f16_f32_cvt_sse_int16_params, + nullptr /* init params */, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -275,7 +259,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_u16, xnn_f16_f32_vcvt_ukernel__avx_int16_u16, - xnn_init_f16_f32_cvt_sse_int16_params, + nullptr /* init params */, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -285,7 +269,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_u24, xnn_f16_f32_vcvt_ukernel__avx_int16_u24, - xnn_init_f16_f32_cvt_sse_int16_params, + nullptr /* init params */, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -295,7 +279,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int16_u32, xnn_f16_f32_vcvt_ukernel__avx_int16_u32, - 
xnn_init_f16_f32_cvt_sse_int16_params, + nullptr /* init params */, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -305,7 +289,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_u8, xnn_f16_f32_vcvt_ukernel__avx_int32_u8, - xnn_init_f16_f32_cvt_sse_int32_params, + nullptr /* init params */, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -315,7 +299,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_u16, xnn_f16_f32_vcvt_ukernel__avx_int32_u16, - xnn_init_f16_f32_cvt_sse_int32_params, + nullptr /* init params */, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -325,7 +309,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_u24, xnn_f16_f32_vcvt_ukernel__avx_int32_u24, - xnn_init_f16_f32_cvt_sse_int32_params, + nullptr /* init params */, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -335,7 +319,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, avx_int32_u32, xnn_f16_f32_vcvt_ukernel__avx_int32_u32, - xnn_init_f16_f32_cvt_sse_int32_params, + nullptr /* init params */, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -345,7 +329,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_u8, xnn_f16_f32_vcvt_ukernel__sse41_int16_u8, - xnn_init_f16_f32_cvt_sse_int16_params, + nullptr /* init params */, benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -355,7 +339,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_u16, xnn_f16_f32_vcvt_ukernel__sse41_int16_u16, - xnn_init_f16_f32_cvt_sse_int16_params, + nullptr /* init params */, benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -365,7 +349,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_u24, xnn_f16_f32_vcvt_ukernel__sse41_int16_u24, - xnn_init_f16_f32_cvt_sse_int16_params, + nullptr /* init params */, benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -375,7 +359,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int16_u32, xnn_f16_f32_vcvt_ukernel__sse41_int16_u32, - xnn_init_f16_f32_cvt_sse_int16_params, + nullptr /* init params */, benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -385,7 +369,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_u8, xnn_f16_f32_vcvt_ukernel__sse41_int32_u8, - xnn_init_f16_f32_cvt_sse_int32_params, + nullptr /* init params */, benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -395,7 +379,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_u16, xnn_f16_f32_vcvt_ukernel__sse41_int32_u16, - xnn_init_f16_f32_cvt_sse_int32_params, + nullptr /* init params */, benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -405,7 +389,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_u24, xnn_f16_f32_vcvt_ukernel__sse41_int32_u24, - xnn_init_f16_f32_cvt_sse_int32_params, + nullptr /* init params */, 
benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -415,7 +399,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse41_int32_u32, xnn_f16_f32_vcvt_ukernel__sse41_int32_u32, - xnn_init_f16_f32_cvt_sse_int32_params, + nullptr /* init params */, benchmark::utils::CheckSSE41) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); @@ -424,8 +408,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_u8, - xnn_f16_f32_vcvt_ukernel__sse2_int16_u8, - xnn_init_f16_f32_cvt_sse_int16_params) + xnn_f16_f32_vcvt_ukernel__sse2_int16_u8) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -433,8 +416,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_u16, - xnn_f16_f32_vcvt_ukernel__sse2_int16_u16, - xnn_init_f16_f32_cvt_sse_int16_params) + xnn_f16_f32_vcvt_ukernel__sse2_int16_u16) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -442,8 +424,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_u24, - xnn_f16_f32_vcvt_ukernel__sse2_int16_u24, - xnn_init_f16_f32_cvt_sse_int16_params) + xnn_f16_f32_vcvt_ukernel__sse2_int16_u24) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -451,8 +432,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int16_u32, - xnn_f16_f32_vcvt_ukernel__sse2_int16_u32, - xnn_init_f16_f32_cvt_sse_int16_params) + xnn_f16_f32_vcvt_ukernel__sse2_int16_u32) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -460,8 +440,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_u8, - xnn_f16_f32_vcvt_ukernel__sse2_int32_u8, - xnn_init_f16_f32_cvt_sse_int32_params) + xnn_f16_f32_vcvt_ukernel__sse2_int32_u8) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -469,8 +448,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_u16, - xnn_f16_f32_vcvt_ukernel__sse2_int32_u16, - xnn_init_f16_f32_cvt_sse_int32_params) + xnn_f16_f32_vcvt_ukernel__sse2_int32_u16) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -478,8 +456,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_u24, - xnn_f16_f32_vcvt_ukernel__sse2_int32_u24, - xnn_init_f16_f32_cvt_sse_int32_params) + xnn_f16_f32_vcvt_ukernel__sse2_int32_u24) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -487,8 +464,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32_vcvt, sse2_int32_u32, - xnn_f16_f32_vcvt_ukernel__sse2_int32_u32, - xnn_init_f16_f32_cvt_sse_int32_params) + xnn_f16_f32_vcvt_ukernel__sse2_int32_u32) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -535,26 +511,22 @@ BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_u1, - xnn_f16_f32_vcvt_ukernel__scalar_u1, - xnn_init_f16_f32_cvt_scalar_params) + xnn_f16_f32_vcvt_ukernel__scalar_u1) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_u2, - xnn_f16_f32_vcvt_ukernel__scalar_u2, - xnn_init_f16_f32_cvt_scalar_params) 
+ xnn_f16_f32_vcvt_ukernel__scalar_u2) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_u3, - xnn_f16_f32_vcvt_ukernel__scalar_u3, - xnn_init_f16_f32_cvt_scalar_params) + xnn_f16_f32_vcvt_ukernel__scalar_u3) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f16_f32_vcvt, scalar_u4, - xnn_f16_f32_vcvt_ukernel__scalar_u4, - xnn_init_f16_f32_cvt_scalar_params) + xnn_f16_f32_vcvt_ukernel__scalar_u4) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); diff --git a/bench/vcvt-benchmark.h b/bench/vcvt-benchmark.h index 4813678b7fff..b7512dadd215 100644 --- a/bench/vcvt-benchmark.h +++ b/bench/vcvt-benchmark.h @@ -26,7 +26,7 @@ namespace { static void f16_f32_vcvt( benchmark::State& state, xnn_f16_f32_vcvt_ukernel_fn cvt, - xnn_init_f16_f32_cvt_params_fn init_params = nullptr, + void* /*init_params*/ = nullptr, benchmark::utils::IsaCheckFunction isa_check = nullptr) { if (isa_check && !isa_check(state)) { @@ -45,12 +45,8 @@ static void f16_f32_vcvt( std::generate(x.begin(), x.end(), std::ref(f16rng)); std::fill(y.begin(), y.end(), std::nanf("")); - xnn_f16_f32_cvt_params params; - if (init_params != nullptr) { - init_params(¶ms); - } for (auto _ : state) { - cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), ¶ms); + cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), nullptr); } const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); diff --git a/src/amalgam/gen/avx.c b/src/amalgam/gen/avx.c index 268af024e487..986027c754a4 100644 --- a/src/amalgam/gen/avx.c +++ b/src/amalgam/gen/avx.c @@ -36,19 +36,26 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/amalgam/gen/avx512skx.c b/src/amalgam/gen/avx512skx.c index d626895d209e..1676a44f04b1 100644 --- a/src/amalgam/gen/avx512skx.c +++ b/src/amalgam/gen/avx512skx.c @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params 
params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/amalgam/gen/f16c.c b/src/amalgam/gen/f16c.c index 43ba84a7c923..1fc0a3630f44 100644 --- a/src/amalgam/gen/f16c.c +++ b/src/amalgam/gen/f16c.c @@ -541,7 +541,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/amalgam/gen/neon.c b/src/amalgam/gen/neon.c index 98ce767f77a0..a163a8b3b8b5 100644 --- a/src/amalgam/gen/neon.c +++ b/src/amalgam/gen/neon.c @@ -55,18 +55,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); - const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(¶ms->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); + const uint16x8_t vsign_mask = vmovq_n_u16(UINT16_C(0x8000)); + const uint16x8_t vexp_offset = vmovq_n_u16(UINT16_C(0x7000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint16x8_t vdenorm_cutoff = vmovq_n_u16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/amalgam/gen/neonfp16.c b/src/amalgam/gen/neonfp16.c index e0e82570d78d..96adf1682c94 100644 --- a/src/amalgam/gen/neonfp16.c +++ b/src/amalgam/gen/neonfp16.c @@ -18,7 +18,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/amalgam/gen/scalar.c b/src/amalgam/gen/scalar.c index 2d2080951453..27603008526e 100644 --- a/src/amalgam/gen/scalar.c +++ b/src/amalgam/gen/scalar.c @@ -57,19 +57,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u1( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; 
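/*
 * What the inlined scalar constants implement (a self-contained sketch, not
 * code from this patch): each IEEE half is converted with integer ops plus
 * two float ops. Normalized inputs have their 5-bit exponent rebased from
 * bias 15 to bias 127 by adding exp_offset (which adds 224 to the float
 * exponent field) and then multiplying by 0x1.0p-112f (224 - 112 = 112 =
 * 127 - 15). Denormal inputs are rebuilt with the magic-number trick: OR
 * the mantissa into the bit pattern of 0.5f, then subtract 0.5f. The helper
 * below is hypothetical and mirrors the loop body of the scalar-u1 kernel.
 */
#include <stdint.h>
#include <string.h>

static float fp32_from_bits(uint32_t w) {
  float f;
  memcpy(&f, &w, sizeof(f));
  return f;
}

static uint32_t fp32_to_bits(float f) {
  uint32_t w;
  memcpy(&w, &f, sizeof(w));
  return w;
}

static uint32_t half_to_float_bits(uint16_t h) {
  const uint32_t w = (uint32_t) h << 16;
  const uint32_t sign = w & 0x80000000;  // vsign_mask
  const uint32_t two_w = w + w;          // shift out the sign bit
  // Normalized/inf/NaN path: two_w >> 4 lands the half exponent at float
  // exponent bits 27..23; + 0x70000000 and * 2^-112 rebase the bias.
  const uint32_t norm = fp32_to_bits(
      fp32_from_bits((two_w >> 4) + 0x70000000) * 0x1.0p-112f);
  // Denormal path: two_w >> 17 puts the 10-bit mantissa at bits 9..0;
  // OR with 0x3F000000 (vmagic_mask) yields 0.5f + m * 2^-24, and
  // subtracting 0.5f (vmagic_bias) leaves m * 2^-24, the denormal value.
  const uint32_t denorm = fp32_to_bits(
      fp32_from_bits((two_w >> 17) | 0x3F000000) - 0.5f);
  // two_w < 0x08000000 (vdenorm_cutoff) iff the half exponent field is 0.
  return sign | (two_w < 0x08000000 ? denorm : norm);
}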
const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; @@ -93,19 +93,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u4( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/amalgam/gen/sse2.c b/src/amalgam/gen/sse2.c index 61c76d834749..0f660f4cdc3c 100644 --- a/src/amalgam/gen/sse2.c +++ b/src/amalgam/gen/sse2.c @@ -48,19 +48,26 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/amalgam/gen/sse41.c b/src/amalgam/gen/sse41.c index fa0ad329bb26..a0fdb815a185 100644 --- a/src/amalgam/gen/sse41.c +++ b/src/amalgam/gen/sse41.c @@ -35,19 +35,26 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = 
_mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/amalgam/gen/wasmrelaxedsimd.c b/src/amalgam/gen/wasmrelaxedsimd.c index 3c5dbb7c16a9..b4f49c7f4ecf 100644 --- a/src/amalgam/gen/wasmrelaxedsimd.c +++ b/src/amalgam/gen/wasmrelaxedsimd.c @@ -31,19 +31,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/amalgam/gen/wasmsimd.c b/src/amalgam/gen/wasmsimd.c index 54e304b81f28..522c6cbd7b05 100644 --- a/src/amalgam/gen/wasmsimd.c +++ b/src/amalgam/gen/wasmsimd.c @@ -55,19 +55,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = 
wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 97219df3be62..381e3ebae5b8 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -520,12 +520,10 @@ static void init_f16_to_f32_cvt_config(void) { f16_to_f32_cvt_config.element_tile = 16; } else { f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__neon_int16_u16; - f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params; f16_to_f32_cvt_config.element_tile = 16; } } else if (!XNN_PLATFORM_MOBILE) { f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u4; - f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params; f16_to_f32_cvt_config.element_tile = 4; } #elif XNN_ARCH_ARM64 @@ -542,38 +540,30 @@ static void init_f16_to_f32_cvt_config(void) { f16_to_f32_cvt_config.element_tile = 16; } else if (hardware_config->use_x86_avx) { f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__avx_int16_u16; - f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params; f16_to_f32_cvt_config.element_tile = 16; } else if (hardware_config->use_x86_sse4_1) { f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__sse41_int16_u16; - f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params; f16_to_f32_cvt_config.element_tile = 16; } else { f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__sse2_int16_u32; - f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params; f16_to_f32_cvt_config.element_tile = 32; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD #if XNN_ARCH_WASMRELAXEDSIMD f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16; - f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params; f16_to_f32_cvt_config.element_tile = 16; #else f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16; - f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params; f16_to_f32_cvt_config.element_tile = 16; #endif #elif XNN_ARCH_WASM f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u1; - 
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params; f16_to_f32_cvt_config.element_tile = 1; #elif XNN_ARCH_RISCV f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u4; - f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params; f16_to_f32_cvt_config.element_tile = 4; #else f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u4; - f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params; f16_to_f32_cvt_config.element_tile = 4; #endif } diff --git a/src/f16-f32-vcvt/avx512skx.c.in b/src/f16-f32-vcvt/avx512skx.c.in index 7dad8050b752..e39daf5fd0cd 100644 --- a/src/f16-f32-vcvt/avx512skx.c.in +++ b/src/f16-f32-vcvt/avx512skx.c.in @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/f16c.c.in b/src/f16-f32-vcvt/f16c.c.in index 6375c5f46c97..2ffade5606f7 100644 --- a/src/f16-f32-vcvt/f16c.c.in +++ b/src/f16-f32-vcvt/f16c.c.in @@ -18,7 +18,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u16.c index a37cb4b733af..a60709c6792a 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u16.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u24.c index 561d6da2d7cb..376b5b1d862a 100644 --- 
a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u24.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u32.c index 6ac769e6870e..0cde5c308847 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u32.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git 
a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u8.c index e7e6c584659f..0d4d73f3d634 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int16-u8.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u16.c index 9d48308f9ee4..4fdb0da03fc8 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u16.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__avx_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git 
a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u24.c index 889993fce68b..f5dd6aae36db 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u24.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__avx_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u32.c index 306b12c3696d..7aaa618b22ac 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u32.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__avx_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u8.c index a99b3ef5e4e8..a7a6cedc6b92 100644 --- 
a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx-int32-u8.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__avx_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c index aa5238944701..a05298e5d170 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c @@ -20,7 +20,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u32.c index 0fa2a24f0208..a38276047c08 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u32.c @@ -20,7 +20,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c index fecce772f307..0910270851fb 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c index 93e3316b41d0..a0de07669a06 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u8.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict 
XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c index d384a26d592f..8332ba6b564f 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u16.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); - const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(¶ms->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); + const uint16x8_t vsign_mask = vmovq_n_u16(UINT16_C(0x8000)); + const uint16x8_t vexp_offset = vmovq_n_u16(UINT16_C(0x7000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint16x8_t vdenorm_cutoff = vmovq_n_u16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c index 91a53e3c2634..3c0c8237cbe1 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u24.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); - const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(¶ms->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); + const uint16x8_t vsign_mask = vmovq_n_u16(UINT16_C(0x8000)); + const uint16x8_t vexp_offset = vmovq_n_u16(UINT16_C(0x7000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint16x8_t vdenorm_cutoff = vmovq_n_u16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c index 9fece0562a86..5566d17f29e8 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u32.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u32( 
size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); - const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); + const uint16x8_t vsign_mask = vmovq_n_u16(UINT16_C(0x8000)); + const uint16x8_t vexp_offset = vmovq_n_u16(UINT16_C(0x7000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint16x8_t vdenorm_cutoff = vmovq_n_u16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c index 295ede8d5186..26bcbbf315f6 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int16-u8.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); - const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); + const uint16x8_t vsign_mask = vmovq_n_u16(UINT16_C(0x8000)); + const uint16x8_t vexp_offset = vmovq_n_u16(UINT16_C(0x7000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint16x8_t vdenorm_cutoff = vmovq_n_u16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c index 5daba5e9dc09..a1ffee01a546 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u16.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); - const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); - const
uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); + const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000)); + const uint32x4_t vexp_offset = vmovq_n_u32(UINT32_C(0x70000000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint32x4_t vdenorm_cutoff = vmovq_n_u32(UINT32_C(0x04000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c index 5cc267a8976e..9a21d72440bd 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u24.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); - const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); + const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000)); + const uint32x4_t vexp_offset = vmovq_n_u32(UINT32_C(0x70000000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint32x4_t vdenorm_cutoff = vmovq_n_u32(UINT32_C(0x04000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c index ace982a95239..6ec3d269cfbb 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u32.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); - const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); + const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000)); + const uint32x4_t vexp_offset = vmovq_n_u32(UINT32_C(0x70000000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint32x4_t vdenorm_cutoff =
vmovq_n_u32(UINT32_C(0x04000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u8.c index 37d277e74f95..0cf34601f63c 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neon-int32-u8.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); - const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); + const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000)); + const uint32x4_t vexp_offset = vmovq_n_u32(UINT32_C(0x70000000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint32x4_t vdenorm_cutoff = vmovq_n_u32(UINT32_C(0x04000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u16.c index a2c83b744753..34e13651cf39 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u16.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u8.c index 7c4c34a4a32f..2529624c243c 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-neonfp16-u8.c @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c index b167df241c62..25b8e890c9e9 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u1.c @@ -18,19 +18,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u1( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input !=
NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c index 672ee8bac2cc..ccc58a275857 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u2.c @@ -18,19 +18,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u2( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c index 6a4a68ae0270..d07550c6ddd4 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u3.c @@ -18,19 +18,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u3( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c index bd72fb9d5475..d667076a4285 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-scalar-u4.c @@ -18,19 +18,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u4( size_t batch, const void* input, float* output, - const union 
xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u16.c index 8518a3941b71..c2f1dd74575f 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u16.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u24.c index 8618f310628d..314263dcc4b1 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u24.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = 
_mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u32.c index 38bdb678ef83..66ce56d79029 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u32.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u8.c index deef2ecef330..4fd8792e02b1 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int16-u8.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) 
params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u16.c index 685f637539ae..84abb4a09747 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u16.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u24.c index c1853e9a3ca7..1914c7d8c7c5 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u24.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) 
params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u32.c index 9fec9aa79644..568003a39889 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u32.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u8.c index 9b8737bec203..91b627870375 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse2-int32-u8.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = 
_mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c index 086f2c24c333..ec512cd838c2 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u16.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u24.c index 3c1df5bc4ae0..4891e7bf1f62 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u24.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 
vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u32.c index 611ffe3d93ef..f0f5759a8f7f 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u32.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u8.c index a19fd62ca64b..67443dd30fd6 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int16-u8.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const 
__m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u16.c index 54a732ef0c54..6c28f79b3cd3 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u16.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u24.c index 907be68c46f6..9b247f8c1eb5 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u24.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = 
_mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u32.c index d81a4a08d7af..6f278265bf9f 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u32.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u8.c index f357a1c4f4da..fc094f7af74c 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-sse41-int32-u8.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) 
params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c index 0c1c9dffdcd7..289a672a7997 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u16.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c index 8300a2a5629a..900d6c9b9371 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u24.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); 
assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c index 327a314af225..019665141801 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u32.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c index 67621cae33c2..77e978c60ad9 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c +++ 
b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int16-u8.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c index f5af2d4b0cb4..4490a137bcb4 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u16.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(UINT32_C(0x80000000)); + const v128_t vexp_offset = wasm_i32x4_const_splat(UINT32_C(0x70000000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git 
a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c index 3ed10c69f7ba..16be80497380 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u24.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(UINT32_C(0x80000000)); + const v128_t vexp_offset = wasm_i32x4_const_splat(UINT32_C(0x70000000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c index 9044b77c7639..8f343ff6582f 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u32.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(UINT32_C(0x80000000)); + const v128_t vexp_offset = wasm_i32x4_const_splat(UINT32_C(0x70000000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff 
--git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c index 3a4ebf19b121..03c77a121820 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmrelaxedsimd-int32-u8.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(UINT32_C(0x80000000)); + const v128_t vexp_offset = wasm_i32x4_const_splat(UINT32_C(0x70000000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u16.c index 1b83337cad79..dbf5dcdc1317 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u16.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + 
XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u24.c index ff44bcfb93e4..5d7dd5017003 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u24.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u32.c index c32049f94de8..356965898d27 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u32.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t 
vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u8.c index c4cd07bb56d8..be1a18622cbd 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int16-u8.c @@ -19,19 +19,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u16.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u16.c index 74b0d2b3dd4e..a7605827b27f 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u16.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u16.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(UINT32_C(0x80000000)); + const 
v128_t vexp_offset = wasm_i32x4_const_splat(UINT32_C(0x70000000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u24.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u24.c index 6040dd0af75d..d9cc5d5e2acd 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u24.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u24.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(UINT32_C(0x80000000)); + const v128_t vexp_offset = wasm_i32x4_const_splat(UINT32_C(0x70000000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 24 * sizeof(uint16_t); batch -= 24 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u32.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u32.c index 95e914da203b..7b8633dd8019 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u32.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u32.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(UINT32_C(0x80000000)); + const v128_t vexp_offset = wasm_i32x4_const_splat(UINT32_C(0x70000000)); + 
const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u8.c b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u8.c index aeff3ee38cc4..54b072a93d25 100644 --- a/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u8.c +++ b/src/f16-f32-vcvt/gen/f16-f32-vcvt-wasmsimd-int32-u8.c @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(UINT32_C(0x80000000)); + const v128_t vexp_offset = wasm_i32x4_const_splat(UINT32_C(0x70000000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) { diff --git a/src/f16-f32-vcvt/neon-int16.c.in b/src/f16-f32-vcvt/neon-int16.c.in index 711e9fa7d0c4..de1e3eba7c44 100644 --- a/src/f16-f32-vcvt/neon-int16.c.in +++ b/src/f16-f32-vcvt/neon-int16.c.in @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint16x8_t vsign_mask = vmovq_n_u16(0x8000); - const uint16x8_t vexp_offset = vmovq_n_u16(0x7000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400); + const uint16x8_t vsign_mask = vmovq_n_u16(UINT16_C(0x8000)); + const uint16x8_t vexp_offset = vmovq_n_u16(UINT16_C(0x7000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint16x8_t vdenorm_cutoff = vmovq_n_u16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); +
XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/f16-f32-vcvt/neon-int32.c.in b/src/f16-f32-vcvt/neon-int32.c.in index 159cc0dbf8e1..2ce4533f127b 100644 --- a/src/f16-f32-vcvt/neon-int32.c.in +++ b/src/f16-f32-vcvt/neon-int32.c.in @@ -19,18 +19,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int32_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32x4_t vsign_mask = vmovq_n_u32(0x80000000); - const uint32x4_t vexp_offset = vmovq_n_u32(0x70000000); - const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale); - const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000); - const uint32x4_t vdenorm_cutoff = vmovq_n_u32(0x04000000); + const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000)); + const uint32x4_t vexp_offset = vmovq_n_u32(UINT32_C(0x70000000)); + const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f); + const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000)); + const uint32x4_t vdenorm_cutoff = vmovq_n_u32(UINT32_C(0x04000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/f16-f32-vcvt/neonfp16.c.in b/src/f16-f32-vcvt/neonfp16.c.in index bef53842ac99..9ece4d211cf4 100644 --- a/src/f16-f32-vcvt/neonfp16.c.in +++ b/src/f16-f32-vcvt/neonfp16.c.in @@ -18,7 +18,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); diff --git a/src/f16-f32-vcvt/scalar.c.in b/src/f16-f32-vcvt/scalar.c.in index 00021adca45c..cf4d330b8f72 100644 --- a/src/f16-f32-vcvt/scalar.c.in +++ b/src/f16-f32-vcvt/scalar.c.in @@ -16,19 +16,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) + const void* params) { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const uint32_t vsign_mask = params->scalar.sign_mask; - const uint32_t vexp_offset = params->scalar.exp_offset; - const float vexp_scale = params->scalar.exp_scale; - const uint32_t vmagic_mask = params->scalar.magic_mask; - const float vmagic_bias = params->scalar.magic_bias; - const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff; + const uint32_t vsign_mask = 0x80000000; + const uint32_t vexp_offset = 0x70000000; + const float vexp_scale = 0x1.0p-112f; + const uint32_t vmagic_mask = 0x3F000000; + const float vmagic_bias = 0.5f; + const uint32_t vdenorm_cutoff = 0x08000000; const uint16_t* i = (const uint16_t*) input; uint32_t* o = (uint32_t*) output; diff --git a/src/f16-f32-vcvt/sse-int16.c.in b/src/f16-f32-vcvt/sse-int16.c.in index 830c12e874a6..5114214ab9fe 100644 --- a/src/f16-f32-vcvt/sse-int16.c.in +++ b/src/f16-f32-vcvt/sse-int16.c.in @@ -23,19 +23,26 @@ void xnn_f16_f32_vcvt_ukernel__${ISA}_int16_u${BATCH_TILE}(
size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale); - const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask); - const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000)); + const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00)); + const __m128 vmagic_bias = _mm_set1_ps(0.5f); + const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/f16-f32-vcvt/sse-int32.c.in b/src/f16-f32-vcvt/sse-int32.c.in index 7f969410c981..af07399e1508 100644 --- a/src/f16-f32-vcvt/sse-int32.c.in +++ b/src/f16-f32-vcvt/sse-int32.c.in @@ -23,18 +23,24 @@ void xnn_f16_f32_vcvt_ukernel__${ISA}_int32_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int32.sign_mask); - const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int32.exp_offset); - const __m128 vexp_scale = _mm_load_ps(params->sse_int32.exp_scale); - const __m128i vmagic_bias = _mm_load_si128((const __m128i*) params->sse_int32.magic_bias); - const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int32.denorm_cutoff); + const __m128i vsign_mask = _mm_set1_epi32(UINT32_C(0x80000000)); + const __m128i vexp_offset = _mm_set1_epi32(UINT32_C(0x70000000)); + const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f); + const __m128i vmagic_bias = _mm_set1_epi32(UINT32_C(0x3F000000)); + const __m128i vdenorm_cutoff = _mm_set1_epi32(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/f16-f32-vcvt/wasmsimd-int16.c.in b/src/f16-f32-vcvt/wasmsimd-int16.c.in index e1e7307a97ef..5b23ce13bd37 100644 --- a/src/f16-f32-vcvt/wasmsimd-int16.c.in +++ b/src/f16-f32-vcvt/wasmsimd-int16.c.in @@ -21,19 +21,26 @@ void xnn_f16_f32_vcvt_ukernel__${ISA}_int16_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % 
sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale); - const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff); + const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000)); + const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00)); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_mask); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/f16-f32-vcvt/wasmsimd-int32.c.in b/src/f16-f32-vcvt/wasmsimd-int32.c.in index d133a1e3c6df..32f52f49c4fc 100644 --- a/src/f16-f32-vcvt/wasmsimd-int32.c.in +++ b/src/f16-f32-vcvt/wasmsimd-int32.c.in @@ -21,18 +21,24 @@ void xnn_f16_f32_vcvt_ukernel__${ISA}_int32_u${BATCH_TILE}( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + const void* params) XNN_OOB_READS { assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); assert(output != NULL); - const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int32.sign_mask); - const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int32.exp_offset); - const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int32.exp_scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int32.magic_bias); - const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int32.denorm_cutoff); + const v128_t vsign_mask = wasm_i32x4_const_splat(UINT32_C(0x80000000)); + const v128_t vexp_offset = wasm_i32x4_const_splat(UINT32_C(0x70000000)); + const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f); + const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f); + const v128_t vdenorm_cutoff = wasm_i32x4_const_splat(UINT32_C(0x08000000)); + + XNN_FORCE_REALIZATION(vsign_mask); + XNN_FORCE_REALIZATION(vexp_offset); + XNN_FORCE_REALIZATION(vexp_scale); + XNN_FORCE_REALIZATION(vmagic_bias); + XNN_FORCE_REALIZATION(vdenorm_cutoff); const uint16_t* i = (const uint16_t*) input; $if BATCH_TILE > 8: diff --git a/src/microparams-init.c b/src/microparams-init.c index ca2062f73e21..4f2f08a2de6b 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c @@ -6803,101 +6803,6 @@ size_t xnn_init_qs8_mul_minmax_fp32_wasmsimd_params( } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -size_t xnn_init_f16_f32_cvt_scalar_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - params->scalar.sign_mask = UINT32_C(0x80000000); - params->scalar.exp_offset = UINT32_C(0x70000000); - params->scalar.exp_scale = 0x1.0p-112f; - params->scalar.magic_mask = UINT32_C(0x3F000000); - 
params->scalar.magic_bias = 0.5f; - params->scalar.denorm_cutoff = UINT32_C(0x08000000); - return sizeof(params->scalar); -} - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -size_t xnn_init_f16_f32_cvt_neon_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - params->neon.exp_scale = 0x1.0p-112f; - return sizeof(params->neon); -} -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f16_f32_cvt_sse_int16_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 8; i++) { - params->sse_int16.sign_mask[i] = UINT16_C(0x8000); - params->sse_int16.exp_offset[i] = UINT16_C(0x7000); - } - for (uint32_t i = 0; i < 4; i++) { - params->sse_int16.exp_scale[i] = 0x1.0p-112f; - } - for (uint32_t i = 0; i < 8; i++) { - params->sse_int16.magic_mask[i] = UINT16_C(0x3F00); - } - for (uint32_t i = 0; i < 4; i++) { - params->sse_int16.magic_bias[i] = 0.5f; - } - for (uint32_t i = 0; i < 8; i++) { - params->sse_int16.denorm_cutoff[i] = INT16_C(0x0400); - } - return sizeof(params->sse_int16); -} - -size_t xnn_init_f16_f32_cvt_sse_int32_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 4; i++) { - params->sse_int32.sign_mask[i] = UINT32_C(0x80000000); - params->sse_int32.exp_offset[i] = UINT32_C(0x70000000); - params->sse_int32.exp_scale[i] = 0x1.0p-112f; - params->sse_int32.magic_bias[i] = UINT32_C(0x3F000000); - params->sse_int32.denorm_cutoff[i] = INT32_C(0x04000000); - } - return sizeof(params->sse_int32); -} -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -size_t xnn_init_f16_f32_cvt_wasmsimd_int16_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 4; i++) { - params->wasmsimd_int16.sign_mask[i] = UINT16_C(0x8000); - params->wasmsimd_int16.exp_offset[i] = UINT16_C(0x7000); - } - for (uint32_t i = 0; i < 2; i++) { - params->wasmsimd_int16.exp_scale[i] = 0x1.0p-112f; - } - for (uint32_t i = 0; i < 4; i++) { - params->wasmsimd_int16.magic_mask[i] = UINT16_C(0x3F00); - } - for (uint32_t i = 0; i < 2; i++) { - params->wasmsimd_int16.magic_bias[i] = 0.5f; - } - for (uint32_t i = 0; i < 4; i++) { - params->wasmsimd_int16.denorm_cutoff[i] = INT16_C(0x0400); - } - return sizeof(params->wasmsimd_int16); -} - -size_t xnn_init_f16_f32_cvt_wasmsimd_int32_params( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 2; i++) { - params->wasmsimd_int32.sign_mask[i] = UINT32_C(0x80000000); - params->wasmsimd_int32.exp_offset[i] = UINT32_C(0x70000000); - params->wasmsimd_int32.exp_scale[i] = 0x1.0p-112f; - params->wasmsimd_int32.magic_bias[i] = UINT32_C(0x3F000000); - params->wasmsimd_int32.denorm_cutoff[i] = INT32_C(0x04000000); - } - return sizeof(params->wasmsimd_int32); -} -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - size_t xnn_init_f32_f16_cvt_scalar_bitcast_params( union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)]) { diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c index 95f5be1f7910..98dd756fc97c 100644 --- a/src/operators/unary-elementwise-nc.c +++ b/src/operators/unary-elementwise-nc.c @@ -496,14 +496,9 @@ enum xnn_status xnn_create_convert_nc_f16_f32( { const struct xnn_unary_elementwise_config* f16_to_f32_cvt_config = xnn_init_f16_to_f32_cvt_config(); - union xnn_f16_f32_cvt_params params; - if (f16_to_f32_cvt_config != NULL && f16_to_f32_cvt_config->init.f16_f32_cvt != NULL) { - 
f16_to_f32_cvt_config->init.f16_f32_cvt(&params); - } - return create_unary_elementwise_nc( flags, f16_to_f32_cvt_config, /*rminmax_config=*/NULL, - &params, sizeof(params), + /*params=*/NULL, /*params_size=*/0, xnn_operator_type_convert_nc_f16_f32, convert_op_out); } @@ -1689,7 +1684,7 @@ enum xnn_status xnn_reshape_convert_nc_f16_f32( channels, input_stride, output_stride, /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, - &convert_op->params.f16_f32_cvt, sizeof(convert_op->params.f16_f32_cvt), + /*params=*/NULL, /*params_size=*/0, threadpool); } @@ -3373,16 +3368,11 @@ enum xnn_status xnn_run_convert_nc_f16_f32( { const struct xnn_unary_elementwise_config* f16_to_f32_cvt_config = xnn_init_f16_to_f32_cvt_config(); - union xnn_f16_f32_cvt_params params; - if XNN_LIKELY(f16_to_f32_cvt_config != NULL && f16_to_f32_cvt_config->init.f16_f32_cvt != NULL) { - f16_to_f32_cvt_config->init.f16_f32_cvt(&params); - } - return run_unary_elementwise_nc( xnn_operator_type_convert_nc_f16_f32, channels, input_stride, output_stride, batch_size, input, output, - f16_to_f32_cvt_config, &params, sizeof(params), + f16_to_f32_cvt_config, /*params=*/NULL, /*params_size=*/0, /*log2_input_size=*/XNN_LOG2_SIZEOF_HALF, /*log2_output_size=*/XNN_LOG2_SIZEOF_FLOAT, flags, diff --git a/src/xnnpack/common.h b/src/xnnpack/common.h index 93069fde067e..448bbd8a6369 100644 --- a/src/xnnpack/common.h +++ b/src/xnnpack/common.h @@ -381,6 +381,21 @@ #define XNN_MULTIPASS_EXTRA_BYTES 16 #endif +#if XNN_ARCH_ARM || XNN_ARCH_X86 || XNN_ARCH_X86_64 || XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + // These architectures are slow to broadcast; the compiler tries to move the + // broadcasts into loops, and when it runs out of registers, it redundantly + // re-performs the broadcast. Marking these values volatile prevents them + // from being moved into loops, so they spill as broadcast vectors instead. + #if defined(__GNUC__) + #define XNN_FORCE_REALIZATION(x) __asm volatile(""::"m"(x)); + #else + #define XNN_FORCE_REALIZATION(x) + #endif +#else + #define XNN_FORCE_REALIZATION(x) +#endif + +// TODO(dsharlet): Remove this in favor of XNN_FORCE_REALIZATION above.
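
To make the new idiom concrete, here is a minimal sketch of how XNN_FORCE_REALIZATION is meant to be used. The macro body mirrors the definition added above; the surrounding kernel is illustrative only (it is not one of the patched kernels) and assumes a GNU-compatible compiler targeting NEON:

#include <arm_neon.h>
#include <stddef.h>

// Empty asm with an "m" operand: the compiler must treat `x` as realized at
// this point, so the broadcast cannot be re-materialized inside the loop.
#define XNN_FORCE_REALIZATION(x) __asm__ volatile("" :: "m"(x))

void vscale_f32(size_t n, const float* input, float* output) {
  const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f);  // broadcast once
  XNN_FORCE_REALIZATION(vexp_scale);  // keep the realized vector out of the loop
  for (; n >= 4; n -= 4) {  // tail handling omitted for brevity
    vst1q_f32(output, vmulq_f32(vld1q_f32(input), vexp_scale));
    input += 4;
    output += 4;
  }
}

Compared with the deleted xnn_init_*_params tables, the constants no longer live in a params struct in memory; each kernel materializes them locally, and the asm barrier keeps the compiler from undoing that.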
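
For reference, since the per-ISA parameter tables are deleted in this patch and their magic constants now appear literally in every kernel, the scalar sketch below reconstructs the conversion those constants implement. Names follow scalar.c.in; fp32_bits and fp32_value are local stand-ins for the library's bit-cast helpers, not XNNPACK APIs:

#include <stdint.h>
#include <string.h>

static uint32_t fp32_bits(float f) { uint32_t u; memcpy(&u, &f, sizeof(u)); return u; }
static float fp32_value(uint32_t u) { float f; memcpy(&f, &u, sizeof(f)); return f; }

// Convert one IEEE half-precision value (raw bits) to float.
static float f16_to_f32(uint16_t h) {
  const uint32_t vw = (uint32_t) h << 16;
  const uint32_t vsign = vw & UINT32_C(0x80000000);  // sign_mask
  const uint32_t v2w = vw + vw;                      // shift the sign bit out
  // Normal inputs: exp_offset adds 224 to the exponent field, and multiplying
  // by exp_scale = 0x1.0p-112f removes 112, for a net rebias of
  // +112 = 127 - 15 (float bias minus half bias).
  const uint32_t vnorm =
      fp32_bits(fp32_value((v2w >> 4) + UINT32_C(0x70000000)) * 0x1.0p-112f);
  // Denormal inputs: OR the 10 mantissa bits into magic_mask (the bit pattern
  // of 0.5f), then subtract magic_bias = 0.5f, leaving mantissa * 2^-24.
  const uint32_t vdenorm =
      fp32_bits(fp32_value((v2w >> 17) | UINT32_C(0x3F000000)) - 0.5f);
  // Below denorm_cutoff the half's exponent field is zero, so the input is
  // denormal or zero and the magic-bias path applies.
  return fp32_value(vsign | (v2w < UINT32_C(0x08000000) ? vdenorm : vnorm));
}

The SIMD variants in this patch implement the same arithmetic lane-wise, which is why the identical constants recur in every hunk.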
#if XNN_ARCH_ARM || XNN_ARCH_X86 // These architectures are slow to broadcast, the compiler tries to move them // into loops, and when it runs out of registers, it will redundantly perform diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h index 8e567abec777..fd329752d027 100644 --- a/src/xnnpack/compute.h +++ b/src/xnnpack/compute.h @@ -1346,7 +1346,6 @@ struct univector_strided_context { xnn_vunary_ukernel_fn ukernel; union { union xnn_f16_default_params f16_default; - union xnn_f16_f32_cvt_params f16_f32_cvt; union xnn_f16_hswish_params f16_hswish; union xnn_f16_lrelu_params f16_lrelu; union xnn_f16_minmax_params f16_minmax; @@ -1394,7 +1393,6 @@ struct univector_contiguous_context { xnn_vunary_ukernel_fn ukernel; union { union xnn_f16_default_params f16_default; - union xnn_f16_f32_cvt_params f16_f32_cvt; union xnn_f16_hswish_params f16_hswish; union xnn_f16_lrelu_params f16_lrelu; union xnn_f16_minmax_params f16_minmax; diff --git a/src/xnnpack/config-types.h b/src/xnnpack/config-types.h index bee14e609b7c..bd2ecdee13de 100644 --- a/src/xnnpack/config-types.h +++ b/src/xnnpack/config-types.h @@ -80,7 +80,6 @@ struct xnn_binary_elementwise_config { struct xnn_unary_elementwise_config { xnn_vunary_ukernel_fn ukernel; union { - xnn_init_f16_f32_cvt_params_fn f16_f32_cvt; xnn_init_f16_qs8_cvt_params_fn f16_qs8_cvt; xnn_init_f16_default_params_fn f16_default; xnn_init_f16_elu_params_fn f16_elu; diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 17419f449eaf..549d0c3b72eb 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -1824,7 +1824,7 @@ typedef void (*xnn_f16_f32_vcvt_ukernel_fn)( size_t batch, const void* input, float* output, - const union xnn_f16_f32_cvt_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + const void* params); typedef void (*xnn_f16_qs8_vcvt_ukernel_fn)( size_t batch, @@ -2512,9 +2512,6 @@ typedef void (*xnn_f32_vscaleextexp_ukernel_fn)( /***************** Microkernel parameter initializer pointers ****************/ -typedef size_t (*xnn_init_f16_f32_cvt_params_fn)( - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]); - typedef size_t (*xnn_init_f16_qs8_cvt_params_fn)( union xnn_f16_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)], uint16_t scale, diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index 9920a32a48fe..519bd44bfc24 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -1044,24 +1044,6 @@ DECLARE_INIT_QU8_MUL_MINMAX_PARAMS_FUNCTION(xnn_init_qu8_mul_minmax_fp32_scalar_ #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -#define DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(fn_name) \ - XNN_INTERNAL size_t fn_name( \ - union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)]); - -DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_scalar_params) -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_neon_params) -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_sse_int16_params) - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_sse_int32_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_wasmsimd_int16_params) - DECLARE_INIT_F16_F32_CVT_PARAMS_FUNCTION(xnn_init_f16_f32_cvt_wasmsimd_int32_params) -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - #define 
DECLARE_INIT_F32_F16_CVT_PARAMS_FUNCTION(fn_name) \ XNN_INTERNAL size_t fn_name( \ union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)]); diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index 6b9cb9d0b72e..eb5f3eeb6c71 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -1259,56 +1259,6 @@ union xnn_qu8_avgpool_minmax_params { // Cvt (Convert): used by VCVT microkernels. -union xnn_f16_f32_cvt_params { - struct { - uint32_t sign_mask; - uint32_t exp_offset; - float exp_scale; - uint32_t magic_mask; - float magic_bias; - uint32_t denorm_cutoff; - } scalar; -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - struct { - float exp_scale; - } neon; -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - struct { - XNN_ALIGN(16) uint16_t sign_mask[8]; - XNN_ALIGN(16) uint16_t exp_offset[8]; - XNN_ALIGN(16) float exp_scale[4]; - XNN_ALIGN(16) uint16_t magic_mask[8]; - XNN_ALIGN(16) float magic_bias[4]; - XNN_ALIGN(16) int16_t denorm_cutoff[8]; - } sse_int16; - struct { - XNN_ALIGN(16) uint32_t sign_mask[4]; - XNN_ALIGN(16) uint32_t exp_offset[4]; - XNN_ALIGN(16) float exp_scale[4]; - XNN_ALIGN(16) uint32_t magic_bias[4]; - XNN_ALIGN(16) int32_t denorm_cutoff[4]; - } sse_int32; -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - struct { - XNN_ALIGN(8) uint16_t sign_mask[4]; - XNN_ALIGN(8) uint16_t exp_offset[4]; - XNN_ALIGN(8) float exp_scale[2]; - XNN_ALIGN(8) uint16_t magic_mask[4]; - XNN_ALIGN(8) float magic_bias[2]; - XNN_ALIGN(8) int16_t denorm_cutoff[4]; - } wasmsimd_int16; - struct { - XNN_ALIGN(8) uint32_t sign_mask[2]; - XNN_ALIGN(8) uint32_t exp_offset[2]; - XNN_ALIGN(8) float exp_scale[2]; - XNN_ALIGN(8) uint32_t magic_bias[2]; - XNN_ALIGN(8) int32_t denorm_cutoff[2]; - } wasmsimd_int32; -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -}; - union xnn_f32_f16_cvt_params { struct { uint32_t nonsign_mask; diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index f911d72c5dd6..3185c823fa46 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -213,7 +213,6 @@ struct xnn_operator { union { union xnn_f16_default_params f16_default; - union xnn_f16_f32_cvt_params f16_f32_cvt; union xnn_f16_hswish_params f16_hswish; union xnn_f16_elu_params f16_elu; union xnn_f16_lrelu_params f16_lrelu; diff --git a/src/xnnpack/vcvt.h b/src/xnnpack/vcvt.h index 4e66f1cda6dd..f526983c3eae 100644 --- a/src/xnnpack/vcvt.h +++ b/src/xnnpack/vcvt.h @@ -21,7 +21,7 @@ extern "C" { size_t n, \ const void* input, \ float* output, \ - const union xnn_f16_f32_cvt_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); + const void* params); DECLARE_F16_F32_VCVT_UKERNEL_FUNCTION(xnn_f16_f32_vcvt_ukernel__neon_int16_u8) DECLARE_F16_F32_VCVT_UKERNEL_FUNCTION(xnn_f16_f32_vcvt_ukernel__neon_int16_u16) diff --git a/test/f16-f32-vcvt.cc b/test/f16-f32-vcvt.cc index 4935c9a82d64..94d9826ccd00 100644 --- a/test/f16-f32-vcvt.cc +++ b/test/f16-f32-vcvt.cc @@ -22,7 +22,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8); } TEST(F16_F32_VCVT__NEON_INT16_U8, batch_div_8) { @@ -30,7 +30,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8); } } @@ -39,7 +39,7 @@ for 
(size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8); } } @@ -48,7 +48,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u8); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -59,7 +59,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16); } TEST(F16_F32_VCVT__NEON_INT16_U16, batch_div_16) { @@ -67,7 +67,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16); } } @@ -76,7 +76,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16); } } @@ -85,7 +85,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u16); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -96,7 +96,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24); } TEST(F16_F32_VCVT__NEON_INT16_U24, batch_div_24) { @@ -104,7 +104,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24); } } @@ -113,7 +113,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24); } } @@ -122,7 +122,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u24); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -133,7 +133,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32); } TEST(F16_F32_VCVT__NEON_INT16_U32, batch_div_32) { @@ -141,7 +141,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32); } } @@ -150,7 +150,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32, 
xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32); } } @@ -159,7 +159,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int16_u32); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -170,7 +170,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8); } TEST(F16_F32_VCVT__NEON_INT32_U8, batch_div_8) { @@ -178,7 +178,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8); } } @@ -187,7 +187,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8); } } @@ -196,7 +196,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u8); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -207,7 +207,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16); } TEST(F16_F32_VCVT__NEON_INT32_U16, batch_div_16) { @@ -215,7 +215,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16); } } @@ -224,7 +224,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16); } } @@ -233,7 +233,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u16); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -244,7 +244,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24); } TEST(F16_F32_VCVT__NEON_INT32_U24, batch_div_24) { @@ -252,7 +252,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24); } } @@ -261,7 +261,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24); } } @@ -270,7 +270,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { 
VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u24); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -281,7 +281,7 @@ TEST_REQUIRES_ARM_NEON; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32); } TEST(F16_F32_VCVT__NEON_INT32_U32, batch_div_32) { @@ -289,7 +289,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32); } } @@ -298,7 +298,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32); } } @@ -307,7 +307,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32, xnn_init_f16_f32_cvt_neon_params); + .Test(xnn_f16_f32_vcvt_ukernel__neon_int32_u32); } } #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 @@ -392,7 +392,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8); } TEST(F16_F32_VCVT__SSE2_INT16_U8, batch_div_8) { @@ -400,7 +400,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8); } } @@ -409,7 +409,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8); } } @@ -418,7 +418,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -429,7 +429,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16); } TEST(F16_F32_VCVT__SSE2_INT16_U16, batch_div_16) { @@ -437,7 +437,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16); } } @@ -446,7 +446,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16); } } @@ -455,7 +455,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + 
.Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -466,7 +466,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24); } TEST(F16_F32_VCVT__SSE2_INT16_U24, batch_div_24) { @@ -474,7 +474,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24); } } @@ -483,7 +483,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24); } } @@ -492,7 +492,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -503,7 +503,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32); } TEST(F16_F32_VCVT__SSE2_INT16_U32, batch_div_32) { @@ -511,7 +511,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32); } } @@ -520,7 +520,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32); } } @@ -529,7 +529,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int16_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -540,7 +540,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8); } TEST(F16_F32_VCVT__SSE2_INT32_U8, batch_div_8) { @@ -548,7 +548,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8); } } @@ -557,7 +557,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8); } } @@ -566,7 +566,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -577,7 +577,7 @@ 
TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16); } TEST(F16_F32_VCVT__SSE2_INT32_U16, batch_div_16) { @@ -585,7 +585,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16); } } @@ -594,7 +594,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16); } } @@ -603,7 +603,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -614,7 +614,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24); } TEST(F16_F32_VCVT__SSE2_INT32_U24, batch_div_24) { @@ -622,7 +622,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24); } } @@ -631,7 +631,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24); } } @@ -640,7 +640,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -651,7 +651,7 @@ TEST_REQUIRES_X86_SSE2; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32); } TEST(F16_F32_VCVT__SSE2_INT32_U32, batch_div_32) { @@ -659,7 +659,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32); } } @@ -668,7 +668,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32); } } @@ -677,7 +677,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse2_int32_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -688,7 +688,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(8) - 
.Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8); } TEST(F16_F32_VCVT__SSE41_INT16_U8, batch_div_8) { @@ -696,7 +696,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8); } } @@ -705,7 +705,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8); } } @@ -714,7 +714,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -725,7 +725,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16); } TEST(F16_F32_VCVT__SSE41_INT16_U16, batch_div_16) { @@ -733,7 +733,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16); } } @@ -742,7 +742,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16); } } @@ -751,7 +751,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -762,7 +762,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24); } TEST(F16_F32_VCVT__SSE41_INT16_U24, batch_div_24) { @@ -770,7 +770,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24); } } @@ -779,7 +779,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24); } } @@ -788,7 +788,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -799,7 +799,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + 
.Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32); } TEST(F16_F32_VCVT__SSE41_INT16_U32, batch_div_32) { @@ -807,7 +807,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32); } } @@ -816,7 +816,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32); } } @@ -825,7 +825,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int16_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -836,7 +836,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8); } TEST(F16_F32_VCVT__SSE41_INT32_U8, batch_div_8) { @@ -844,7 +844,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8); } } @@ -853,7 +853,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8); } } @@ -862,7 +862,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -873,7 +873,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16); } TEST(F16_F32_VCVT__SSE41_INT32_U16, batch_div_16) { @@ -881,7 +881,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16); } } @@ -890,7 +890,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16); } } @@ -899,7 +899,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -910,7 +910,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24); } TEST(F16_F32_VCVT__SSE41_INT32_U24, 
batch_div_24) { @@ -918,7 +918,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24); } } @@ -927,7 +927,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24); } } @@ -936,7 +936,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -947,7 +947,7 @@ TEST_REQUIRES_X86_SSE41; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32); } TEST(F16_F32_VCVT__SSE41_INT32_U32, batch_div_32) { @@ -955,7 +955,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32); } } @@ -964,7 +964,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32); } } @@ -973,7 +973,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__sse41_int32_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -984,7 +984,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8); } TEST(F16_F32_VCVT__AVX_INT16_U8, batch_div_8) { @@ -992,7 +992,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8); } } @@ -1001,7 +1001,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8); } } @@ -1010,7 +1010,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1021,7 +1021,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16); } TEST(F16_F32_VCVT__AVX_INT16_U16, batch_div_16) { @@ -1029,7 +1029,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { 
VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16); } } @@ -1038,7 +1038,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16); } } @@ -1047,7 +1047,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1058,7 +1058,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24); } TEST(F16_F32_VCVT__AVX_INT16_U24, batch_div_24) { @@ -1066,7 +1066,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24); } } @@ -1075,7 +1075,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24); } } @@ -1084,7 +1084,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1095,7 +1095,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32); } TEST(F16_F32_VCVT__AVX_INT16_U32, batch_div_32) { @@ -1103,7 +1103,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32); } } @@ -1112,7 +1112,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32); } } @@ -1121,7 +1121,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32, xnn_init_f16_f32_cvt_sse_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int16_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1132,7 +1132,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8); } TEST(F16_F32_VCVT__AVX_INT32_U8, batch_div_8) { @@ -1140,7 +1140,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8, 
xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8); } } @@ -1149,7 +1149,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8); } } @@ -1158,7 +1158,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1169,7 +1169,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16); } TEST(F16_F32_VCVT__AVX_INT32_U16, batch_div_16) { @@ -1177,7 +1177,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16); } } @@ -1186,7 +1186,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16); } } @@ -1195,7 +1195,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1206,7 +1206,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24); } TEST(F16_F32_VCVT__AVX_INT32_U24, batch_div_24) { @@ -1214,7 +1214,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24); } } @@ -1223,7 +1223,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24); } } @@ -1232,7 +1232,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u24); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1243,7 +1243,7 @@ TEST_REQUIRES_X86_AVX; VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32); } TEST(F16_F32_VCVT__AVX_INT32_U32, batch_div_32) { @@ -1251,7 +1251,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32); } } @@ -1260,7 +1260,7 @@ for 
(size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32); } } @@ -1269,7 +1269,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32, xnn_init_f16_f32_cvt_sse_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__avx_int32_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1427,14 +1427,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT16_U8, batch_eq_8) { VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8); } TEST(F16_F32_VCVT__WASMSIMD_INT16_U8, batch_div_8) { for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8); } } @@ -1442,7 +1442,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8); } } @@ -1450,7 +1450,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1460,14 +1460,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT16_U16, batch_eq_16) { VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16); } TEST(F16_F32_VCVT__WASMSIMD_INT16_U16, batch_div_16) { for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16); } } @@ -1475,7 +1475,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16); } } @@ -1483,7 +1483,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1493,14 +1493,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT16_U24, batch_eq_24) { VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24); } TEST(F16_F32_VCVT__WASMSIMD_INT16_U24, batch_div_24) { for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24, 
xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24); } } @@ -1508,7 +1508,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24); } } @@ -1516,7 +1516,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1526,14 +1526,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT16_U32, batch_eq_32) { VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32); } TEST(F16_F32_VCVT__WASMSIMD_INT16_U32, batch_div_32) { for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32); } } @@ -1541,7 +1541,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32); } } @@ -1549,7 +1549,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1559,14 +1559,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT32_U8, batch_eq_8) { VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8); } TEST(F16_F32_VCVT__WASMSIMD_INT32_U8, batch_div_8) { for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8); } } @@ -1574,7 +1574,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8); } } @@ -1582,7 +1582,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1592,14 +1592,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT32_U16, batch_eq_16) { VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16); } TEST(F16_F32_VCVT__WASMSIMD_INT32_U16, batch_div_16) { for (size_t batch_size = 32; 
batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16); } } @@ -1607,7 +1607,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16); } } @@ -1615,7 +1615,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1625,14 +1625,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT32_U24, batch_eq_24) { VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24); } TEST(F16_F32_VCVT__WASMSIMD_INT32_U24, batch_div_24) { for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24); } } @@ -1640,7 +1640,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24); } } @@ -1648,7 +1648,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1658,14 +1658,14 @@ TEST(F16_F32_VCVT__WASMSIMD_INT32_U32, batch_eq_32) { VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32); } TEST(F16_F32_VCVT__WASMSIMD_INT32_U32, batch_div_32) { for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32); } } @@ -1673,7 +1673,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32); } } @@ -1681,7 +1681,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32); } } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1691,14 +1691,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U8, batch_eq_8) { VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8, 
xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U8, batch_div_8) { for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8); } } @@ -1706,7 +1706,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8); } } @@ -1714,7 +1714,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1724,14 +1724,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U16, batch_eq_16) { VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U16, batch_div_16) { for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16); } } @@ -1739,7 +1739,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16); } } @@ -1747,7 +1747,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1757,14 +1757,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U24, batch_eq_24) { VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U24, batch_div_24) { for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24); } } @@ -1772,7 +1772,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24); } } @@ -1780,7 +1780,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24, 
xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1790,14 +1790,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U32, batch_eq_32) { VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT16_U32, batch_div_32) { for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32); } } @@ -1805,7 +1805,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32); } } @@ -1813,7 +1813,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32, xnn_init_f16_f32_cvt_wasmsimd_int16_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1823,14 +1823,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U8, batch_eq_8) { VCvtMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U8, batch_div_8) { for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8); } } @@ -1838,7 +1838,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8); } } @@ -1846,7 +1846,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1856,14 +1856,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U16, batch_eq_16) { VCvtMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U16, batch_div_16) { for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16); } } @@ -1871,7 +1871,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16, 
xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16); } } @@ -1879,7 +1879,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1889,14 +1889,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U24, batch_eq_24) { VCvtMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U24, batch_div_24) { for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24); } } @@ -1904,7 +1904,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24); } } @@ -1912,7 +1912,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1922,14 +1922,14 @@ TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U32, batch_eq_32) { VCvtMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32); } TEST(F16_F32_VCVT__WASMRELAXEDSIMD_INT32_U32, batch_div_32) { for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32); } } @@ -1937,7 +1937,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32); } } @@ -1945,7 +1945,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32, xnn_init_f16_f32_cvt_wasmsimd_int32_params); + .Test(xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32); } } #endif // XNN_ARCH_WASMRELAXEDSIMD @@ -1954,14 +1954,14 @@ TEST(F16_F32_VCVT__SCALAR_U1, batch_eq_1) { VCvtMicrokernelTester() .batch_size(1) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u1, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u1); } TEST(F16_F32_VCVT__SCALAR_U1, batch_gt_1) { for (size_t batch_size = 2; batch_size < 10; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u1, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u1); } } @@ 
-1969,14 +1969,14 @@ TEST(F16_F32_VCVT__SCALAR_U1, batch_gt_1) { TEST(F16_F32_VCVT__SCALAR_U2, batch_eq_2) { VCvtMicrokernelTester() .batch_size(2) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2); } TEST(F16_F32_VCVT__SCALAR_U2, batch_div_2) { for (size_t batch_size = 4; batch_size < 20; batch_size += 2) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2); } } @@ -1984,7 +1984,7 @@ TEST(F16_F32_VCVT__SCALAR_U2, batch_lt_2) { for (size_t batch_size = 1; batch_size < 2; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2); } } @@ -1992,7 +1992,7 @@ TEST(F16_F32_VCVT__SCALAR_U2, batch_gt_2) { for (size_t batch_size = 3; batch_size < 4; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u2); } } @@ -2000,14 +2000,14 @@ TEST(F16_F32_VCVT__SCALAR_U2, batch_gt_2) { TEST(F16_F32_VCVT__SCALAR_U3, batch_eq_3) { VCvtMicrokernelTester() .batch_size(3) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3); } TEST(F16_F32_VCVT__SCALAR_U3, batch_div_3) { for (size_t batch_size = 6; batch_size < 30; batch_size += 3) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3); } } @@ -2015,7 +2015,7 @@ TEST(F16_F32_VCVT__SCALAR_U3, batch_lt_3) { for (size_t batch_size = 1; batch_size < 3; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3); } } @@ -2023,7 +2023,7 @@ TEST(F16_F32_VCVT__SCALAR_U3, batch_gt_3) { for (size_t batch_size = 4; batch_size < 6; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u3); } } @@ -2031,14 +2031,14 @@ TEST(F16_F32_VCVT__SCALAR_U3, batch_gt_3) { TEST(F16_F32_VCVT__SCALAR_U4, batch_eq_4) { VCvtMicrokernelTester() .batch_size(4) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4); } TEST(F16_F32_VCVT__SCALAR_U4, batch_div_4) { for (size_t batch_size = 8; batch_size < 40; batch_size += 4) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4); } } @@ -2046,7 +2046,7 @@ TEST(F16_F32_VCVT__SCALAR_U4, batch_lt_4) { for (size_t batch_size = 1; batch_size < 4; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4); } } @@ -2054,6 +2054,6 @@ TEST(F16_F32_VCVT__SCALAR_U4, batch_gt_4) { for (size_t batch_size = 5; batch_size < 8; batch_size++) { VCvtMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4, xnn_init_f16_f32_cvt_scalar_params); + .Test(xnn_f16_f32_vcvt_ukernel__scalar_u4); } } diff --git 
a/test/f16-f32-vcvt.yaml b/test/f16-f32-vcvt.yaml
index a714e99a4310..4d681fbcef5c 100644
--- a/test/f16-f32-vcvt.yaml
+++ b/test/f16-f32-vcvt.yaml
@@ -5,73 +5,41 @@
 # ARM NEON+FP16ARITH
 - name: xnn_f16_f32_vcvt_ukernel__neon_int16_u8
-  init: xnn_init_f16_f32_cvt_neon_params
 - name: xnn_f16_f32_vcvt_ukernel__neon_int16_u16
-  init: xnn_init_f16_f32_cvt_neon_params
 - name: xnn_f16_f32_vcvt_ukernel__neon_int16_u24
-  init: xnn_init_f16_f32_cvt_neon_params
 - name: xnn_f16_f32_vcvt_ukernel__neon_int16_u32
-  init: xnn_init_f16_f32_cvt_neon_params
 - name: xnn_f16_f32_vcvt_ukernel__neon_int32_u8
-  init: xnn_init_f16_f32_cvt_neon_params
 - name: xnn_f16_f32_vcvt_ukernel__neon_int32_u16
-  init: xnn_init_f16_f32_cvt_neon_params
 - name: xnn_f16_f32_vcvt_ukernel__neon_int32_u24
-  init: xnn_init_f16_f32_cvt_neon_params
 - name: xnn_f16_f32_vcvt_ukernel__neon_int32_u32
-  init: xnn_init_f16_f32_cvt_neon_params
 - name: xnn_f16_f32_vcvt_ukernel__neonfp16_u8
 - name: xnn_f16_f32_vcvt_ukernel__neonfp16_u16
 # x86 SSE
 - name: xnn_f16_f32_vcvt_ukernel__sse2_int16_u8
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__sse2_int16_u16
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__sse2_int16_u24
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__sse2_int16_u32
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__sse2_int32_u8
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__sse2_int32_u16
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__sse2_int32_u24
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__sse2_int32_u32
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__sse41_int16_u8
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__sse41_int16_u16
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__sse41_int16_u24
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__sse41_int16_u32
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__sse41_int32_u8
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__sse41_int32_u16
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__sse41_int32_u24
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__sse41_int32_u32
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 # x86 AVX
 - name: xnn_f16_f32_vcvt_ukernel__avx_int16_u8
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__avx_int16_u16
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__avx_int16_u24
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__avx_int16_u32
-  init: xnn_init_f16_f32_cvt_sse_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__avx_int32_u8
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__avx_int32_u16
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__avx_int32_u24
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__avx_int32_u32
-  init: xnn_init_f16_f32_cvt_sse_int32_params
 # x86 F16C
 - name: xnn_f16_f32_vcvt_ukernel__f16c_u8
 - name: xnn_f16_f32_vcvt_ukernel__f16c_u16
@@ -80,44 +48,24 @@
 - name: xnn_f16_f32_vcvt_ukernel__avx512skx_u32
 # WAsm SIMD
 - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u8
-  init: xnn_init_f16_f32_cvt_wasmsimd_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16
-  init: xnn_init_f16_f32_cvt_wasmsimd_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u24
-  init: xnn_init_f16_f32_cvt_wasmsimd_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u32
-  init: xnn_init_f16_f32_cvt_wasmsimd_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u8
-  init: xnn_init_f16_f32_cvt_wasmsimd_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u16
-  init: xnn_init_f16_f32_cvt_wasmsimd_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u24
-  init: xnn_init_f16_f32_cvt_wasmsimd_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmsimd_int32_u32
-  init: xnn_init_f16_f32_cvt_wasmsimd_int32_params
 # WAsm Relaxed SIMD
 - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u8
-  init: xnn_init_f16_f32_cvt_wasmsimd_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16
-  init: xnn_init_f16_f32_cvt_wasmsimd_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u24
-  init: xnn_init_f16_f32_cvt_wasmsimd_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u32
-  init: xnn_init_f16_f32_cvt_wasmsimd_int16_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u8
-  init: xnn_init_f16_f32_cvt_wasmsimd_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u16
-  init: xnn_init_f16_f32_cvt_wasmsimd_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u24
-  init: xnn_init_f16_f32_cvt_wasmsimd_int32_params
 - name: xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int32_u32
-  init: xnn_init_f16_f32_cvt_wasmsimd_int32_params
 # Scalar
 - name: xnn_f16_f32_vcvt_ukernel__scalar_u1
-  init: xnn_init_f16_f32_cvt_scalar_params
 - name: xnn_f16_f32_vcvt_ukernel__scalar_u2
-  init: xnn_init_f16_f32_cvt_scalar_params
 - name: xnn_f16_f32_vcvt_ukernel__scalar_u3
-  init: xnn_init_f16_f32_cvt_scalar_params
 - name: xnn_f16_f32_vcvt_ukernel__scalar_u4
-  init: xnn_init_f16_f32_cvt_scalar_params
diff --git a/test/vcvt-microkernel-tester.cc b/test/vcvt-microkernel-tester.cc
index 200ba3d1b260..5423249be5f9 100644
--- a/test/vcvt-microkernel-tester.cc
+++ b/test/vcvt-microkernel-tester.cc
@@ -28,8 +28,7 @@
 #include "replicable_random_device.h"
 
 void VCvtMicrokernelTester::Test(
-    xnn_f16_f32_vcvt_ukernel_fn vcvt,
-    xnn_init_f16_f32_cvt_params_fn init_params) const {
+    xnn_f16_f32_vcvt_ukernel_fn vcvt) const {
   xnnpack::ReplicableRandomDevice rng;
   std::uniform_real_distribution<float> f32dist(-100.0f, 100.0f);
@@ -41,13 +40,8 @@ void VCvtMicrokernelTester::Test(
       [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
   std::fill(output.begin(), output.end(), nanf(""));
 
-    union xnn_f16_f32_cvt_params params;
-    if (init_params != nullptr) {
-      init_params(&params);
-    }
-
     // Call optimized micro-kernel.
-    vcvt(batch_size() * sizeof(uint16_t), input.data(), output.data(), &params);
+    vcvt(batch_size() * sizeof(uint16_t), input.data(), output.data(), nullptr);
 
     // Verify results.
     for (size_t i = 0; i < batch_size(); i++) {
diff --git a/test/vcvt-microkernel-tester.h b/test/vcvt-microkernel-tester.h
index 4d058bcae4da..e6d0562cac07 100644
--- a/test/vcvt-microkernel-tester.h
+++ b/test/vcvt-microkernel-tester.h
@@ -71,8 +71,7 @@ class VCvtMicrokernelTester {
   size_t iterations() const { return this->iterations_; }
 
-  void Test(xnn_f16_f32_vcvt_ukernel_fn vcvt,
-            xnn_init_f16_f32_cvt_params_fn init_params = nullptr) const;
+  void Test(xnn_f16_f32_vcvt_ukernel_fn vcvt) const;
 
   void Test(xnn_f32_f16_vcvt_ukernel_fn vcvt,
             xnn_init_f32_f16_cvt_params_fn init_params = nullptr) const;
diff --git a/tools/generate-vcvt-test.py b/tools/generate-vcvt-test.py
index 2a13178ce915..b29c652560d6 100755
--- a/tools/generate-vcvt-test.py
+++ b/tools/generate-vcvt-test.py
@@ -51,15 +51,19 @@ def split_ukernel_name(name):
 CVT_BENCHMARK_TEMPLATE = """\
 BENCHMARK_CAPTURE(${BENCHMARK_FN}, ${BENCHMARK_NAME},
-  ${UKERNEL_NAME},
   $if INIT_FN and ISA_CHECK:
+    ${UKERNEL_NAME}, ${INIT_FN},
     benchmark::utils::${ISA_CHECK})
   $elif INIT_FN:
+    ${UKERNEL_NAME}, ${INIT_FN})
   $elif ISA_CHECK:
+    ${UKERNEL_NAME}, nullptr /* init params */,
     benchmark::utils::${ISA_CHECK})
+  $else:
+    ${UKERNEL_NAME})
   ->Apply(benchmark::utils::UnaryElementwiseParameters<${INPUT_CTYPE}, ${OUTPUT_CTYPE}>)
   ->UseRealTime();
 """
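Illustrative sketch (not part of the patch): after this change the f16->f32 convert microkernels keep their four-argument shape, but the trailing params pointer no longer carries anything, so callers simply pass nullptr, exactly as the updated tester above does. A minimal direct call might look as follows, assuming the scalar u4 variant declared in "xnnpack/vcvt.h"; the helper name f16_to_f32_buffer is hypothetical.

#include <stddef.h>
#include <stdint.h>

#include "xnnpack/vcvt.h"  // declares xnn_f16_f32_vcvt_ukernel__scalar_u4

// Convert `count` half-precision values (raw IEEE fp16 bit patterns) to fp32.
// The first argument is the input size in *bytes*, matching the tester's
// `batch_size() * sizeof(uint16_t)`; the params pointer is unused now that
// the f16_f32_vcvt_params struct is gone, so nullptr is passed.
static void f16_to_f32_buffer(const uint16_t* input, float* output,
                              size_t count) {
  xnn_f16_f32_vcvt_ukernel__scalar_u4(
      count * sizeof(uint16_t), input, output, /*params=*/nullptr);
}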