From 7256bcc1cded0b669a45049d60d9be7e8c5083af Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 12 Aug 2024 03:27:03 -0700 Subject: [PATCH] Move constants to microkernels for sqrt/rsqrt PiperOrigin-RevId: 662021516 --- bench/f32-vrsqrt.cc | 24 +- bench/f32-vsqrt.cc | 30 +-- src/amalgam/gen/avx.c | 16 +- src/amalgam/gen/avx512f.c | 8 +- src/amalgam/gen/fma3.c | 16 +- src/amalgam/gen/sse.c | 8 +- src/configs/unary-elementwise-config.c | 8 - src/f32-vrsqrt/avx-rsqrt.c.in | 8 +- src/f32-vrsqrt/avx512f-rsqrt.c.in | 4 +- src/f32-vrsqrt/fma3-rsqrt.c.in | 8 +- src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u16.c | 8 +- src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u32.c | 8 +- src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u8.c | 8 +- .../gen/f32-vrsqrt-avx512f-rsqrt-u16.c | 4 +- .../gen/f32-vrsqrt-avx512f-rsqrt-u32.c | 4 +- .../gen/f32-vrsqrt-avx512f-rsqrt-u64.c | 4 +- .../gen/f32-vrsqrt-fma3-rsqrt-u16.c | 8 +- .../gen/f32-vrsqrt-fma3-rsqrt-u32.c | 8 +- src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u8.c | 8 +- src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u16.c | 4 +- src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u4.c | 4 +- src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u8.c | 4 +- src/f32-vrsqrt/sse-rsqrt.c.in | 4 +- src/f32-vsqrt/avx-rsqrt.c.in | 8 +- src/f32-vsqrt/avx-sqrt.c.in | 4 +- src/f32-vsqrt/avx512f-nr1fma1adj.c.in | 2 +- src/f32-vsqrt/avx512f-rsqrt.c.in | 4 +- src/f32-vsqrt/fma3-nr1fma1adj.c.in | 2 +- src/f32-vsqrt/fma3-rsqrt.c.in | 8 +- src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u16.c | 8 +- src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u32.c | 8 +- src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u8.c | 8 +- src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u16.c | 4 +- src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u32.c | 4 +- src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u8.c | 4 +- .../gen/f32-vsqrt-avx512f-rsqrt-u16.c | 4 +- .../gen/f32-vsqrt-avx512f-rsqrt-u32.c | 4 +- .../gen/f32-vsqrt-avx512f-rsqrt-u48.c | 4 +- src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u16.c | 8 +- src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u32.c | 8 +- 
src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u8.c | 8 +- src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u12.c | 4 +- src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u4.c | 4 +- src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u8.c | 4 +- src/f32-vsqrt/sse-rsqrt.c.in | 4 +- src/microparams-init.c | 112 ---------- src/xnnpack/common.h | 9 + src/xnnpack/microparams-init.h | 16 -- src/xnnpack/microparams.h | 42 ---- test/f32-vrsqrt.cc | 120 +++++----- test/f32-vrsqrt.yaml | 12 - test/f32-vsqrt.cc | 210 ++++++++---------- test/f32-vsqrt.yaml | 15 -- test/vunary-microkernel-tester.h | 4 +- 54 files changed, 342 insertions(+), 520 deletions(-) diff --git a/bench/f32-vrsqrt.cc b/bench/f32-vrsqrt.cc index 2d05c442ae7..4aa0a26b1b7 100644 --- a/bench/f32-vrsqrt.cc +++ b/bench/f32-vrsqrt.cc @@ -90,70 +90,70 @@ BENCHMARK_CAPTURE(f32_vrsqrt, scalar_rsqrt_u4, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_vrsqrt, sse_rsqrt_u4, xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4, - xnn_init_f32_rsqrt_sse_params) + /*init_params=*/nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, sse_rsqrt_u8, xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8, - xnn_init_f32_rsqrt_sse_params) + /*init_params=*/nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, sse_rsqrt_u16, xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16, - xnn_init_f32_rsqrt_sse_params) + /*init_params=*/nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, avx_rsqrt_u8, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8, - xnn_init_f32_rsqrt_avx_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, avx_rsqrt_u16, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16, - xnn_init_f32_rsqrt_avx_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); 
BENCHMARK_CAPTURE(f32_vrsqrt, avx_rsqrt_u32, xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, - xnn_init_f32_rsqrt_avx_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, fma3_rsqrt_u8, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, - xnn_init_f32_rsqrt_fma3_params, + /*init_params=*/nullptr, benchmark::utils::CheckFMA3) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, fma3_rsqrt_u16, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, - xnn_init_f32_rsqrt_fma3_params, + /*init_params=*/nullptr, benchmark::utils::CheckFMA3) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, fma3_rsqrt_u32, xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, - xnn_init_f32_rsqrt_fma3_params, + /*init_params=*/nullptr, benchmark::utils::CheckFMA3) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, avx512f_rsqrt_u16, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, - xnn_init_f32_rsqrt_avx512_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, avx512f_rsqrt_u32, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, - xnn_init_f32_rsqrt_avx512_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vrsqrt, avx512f_rsqrt_u64, xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, - xnn_init_f32_rsqrt_avx512_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); diff --git a/bench/f32-vsqrt.cc b/bench/f32-vsqrt.cc index 0ad86882fad..d37c9252b3a 100644 --- a/bench/f32-vsqrt.cc +++ b/bench/f32-vsqrt.cc @@ -96,88 +96,88 @@ void f32_vsqrt(benchmark::State& state, xnn_f32_vsqrt_ukernel_fn ukernel, 
->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, sse_rsqrt_u4, xnn_f32_vsqrt_ukernel__sse_rsqrt_u4, - xnn_init_f32_sqrt_sse_params) + /*init_params=*/nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, sse_rsqrt_u8, xnn_f32_vsqrt_ukernel__sse_rsqrt_u8, - xnn_init_f32_sqrt_sse_params) + /*init_params=*/nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, sse_rsqrt_u12, xnn_f32_vsqrt_ukernel__sse_rsqrt_u12, - xnn_init_f32_sqrt_sse_params) + /*init_params=*/nullptr) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, avx_sqrt_u8, xnn_f32_vsqrt_ukernel__avx_sqrt_u8, - xnn_init_f32_sqrt_avx_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, avx_sqrt_u16, xnn_f32_vsqrt_ukernel__avx_sqrt_u16, - xnn_init_f32_sqrt_avx_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, avx_sqrt_u32, xnn_f32_vsqrt_ukernel__avx_sqrt_u32, - xnn_init_f32_sqrt_avx_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, avx_rsqrt_u8, xnn_f32_vsqrt_ukernel__avx_rsqrt_u8, - xnn_init_f32_sqrt_avx_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, avx_rsqrt_u16, xnn_f32_vsqrt_ukernel__avx_rsqrt_u16, - xnn_init_f32_sqrt_avx_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, avx_rsqrt_u32, xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, - xnn_init_f32_sqrt_avx_params, + /*init_params=*/nullptr, 
benchmark::utils::CheckAVX) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, fma3_rsqrt_u8, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, - xnn_init_f32_sqrt_fma_params, + /*init_params=*/nullptr, benchmark::utils::CheckFMA3) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, fma3_rsqrt_u16, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, - xnn_init_f32_sqrt_fma_params, + /*init_params=*/nullptr, benchmark::utils::CheckFMA3) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, fma3_rsqrt_u32, xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, - xnn_init_f32_sqrt_fma_params, + /*init_params=*/nullptr, benchmark::utils::CheckFMA3) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, avx512f_rsqrt_u16, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, - xnn_init_f32_sqrt_avx512_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, avx512f_rsqrt_u32, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, - xnn_init_f32_sqrt_avx512_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); BENCHMARK_CAPTURE(f32_vsqrt, avx512f_rsqrt_u48, xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, - xnn_init_f32_sqrt_avx512_params, + /*init_params=*/nullptr, benchmark::utils::CheckAVX512F) ->Apply(benchmark::utils::UnaryElementwiseParameters) ->UseRealTime(); diff --git a/src/amalgam/gen/avx.c b/src/amalgam/gen/avx.c index 7b7d7ac7086..268af024e48 100644 --- a/src/amalgam/gen/avx.c +++ b/src/amalgam/gen/avx.c @@ -5881,14 +5881,16 @@ void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 
0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m256 vthree = _mm256_load_ps(params->avx.three); - const __m256 vhalf = _mm256_load_ps(params->avx.half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vhalf = _mm256_set1_ps(0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -5936,7 +5938,7 @@ void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) ¶ms->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); @@ -6224,14 +6226,16 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u16( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m256 kThree = _mm256_load_ps(params->avx.three); - const __m256 kHalf = _mm256_load_ps(params->avx.half); + const __m256 kThree = _mm256_set1_ps(3.0f); + const __m256 kHalf = _mm256_set1_ps(0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -6293,7 +6297,7 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u16( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)¶ms->avx.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/amalgam/gen/avx512f.c b/src/amalgam/gen/avx512f.c index 4adffb18608..d17df7a76c3 100644 --- a/src/amalgam/gen/avx512f.c +++ b/src/amalgam/gen/avx512f.c @@ -3893,8 +3893,8 @@ void xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m512 vthree = _mm512_set1_ps(params->avx512.three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vthree = _mm512_set1_ps(3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 vx0 = _mm512_loadu_ps(input); @@ -4126,8 +4126,8 @@ void xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16( assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m512 vneg_three = _mm512_set1_ps(params->avx512.neg_three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vneg_three = _mm512_set1_ps(-3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 vx = _mm512_loadu_ps(input); diff --git a/src/amalgam/gen/fma3.c b/src/amalgam/gen/fma3.c index 90e504dcc79..8120d9043ef 100644 --- a/src/amalgam/gen/fma3.c +++ b/src/amalgam/gen/fma3.c @@ -5376,14 +5376,16 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -5428,7 +5430,7 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) ¶ms->fma3.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); @@ -5462,14 +5464,16 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); 
assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -5528,7 +5532,7 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)¶ms->fma3.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/amalgam/gen/sse.c b/src/amalgam/gen/sse.c index 48243467dae..97ad0baee37 100644 --- a/src/amalgam/gen/sse.c +++ b/src/amalgam/gen/sse.c @@ -9453,8 +9453,8 @@ void xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 vx0123 = _mm_loadu_ps(input); @@ -9535,8 +9535,8 @@ void xnn_f32_vsqrt_ukernel__sse_rsqrt_u12( assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { const __m128 vx0 = _mm_loadu_ps(input); diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 59a7982b2b4..97219df3be6 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -1406,19 +1406,15 @@ static void init_f32_sqrt_config(void) { assert(hardware_config != NULL); if (hardware_config->use_x86_avx512f) { f32_sqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16; - f32_sqrt_config.init.f32_sqrt = xnn_init_f32_sqrt_avx512_params; f32_sqrt_config.element_tile = 16; } else if (hardware_config->use_x86_fma3) { f32_sqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16; - f32_sqrt_config.init.f32_sqrt = xnn_init_f32_sqrt_fma_params; f32_sqrt_config.element_tile = 16; } else if (hardware_config->use_x86_avx) { f32_sqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__avx_rsqrt_u16; - f32_sqrt_config.init.f32_sqrt = xnn_init_f32_sqrt_avx_params; f32_sqrt_config.element_tile = 16; } else { f32_sqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqrt_ukernel__sse_rsqrt_u12; - f32_sqrt_config.init.f32_sqrt = xnn_init_f32_sqrt_sse_params; f32_sqrt_config.element_tile = 12; } #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD @@ -1452,19 +1448,15 @@ static void init_f32_rsqrt_config(void) { assert(hardware_config != NULL); if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32; - f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_avx512_params; f32_rsqrt_config.element_tile = 32; } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_fma3) { 
f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16; - f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_fma3_params; f32_rsqrt_config.element_tile = 16; } else if (hardware_config->use_x86_avx) { f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16; - f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_avx_params; f32_rsqrt_config.element_tile = 16; } else { f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8; - f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_sse_params; f32_rsqrt_config.element_tile = 8; } #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR diff --git a/src/f32-vrsqrt/avx-rsqrt.c.in b/src/f32-vrsqrt/avx-rsqrt.c.in index e205dd51ab5..2efa6ab721e 100644 --- a/src/f32-vrsqrt/avx-rsqrt.c.in +++ b/src/f32-vrsqrt/avx-rsqrt.c.in @@ -38,14 +38,16 @@ void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u${BATCH_TILE}( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m256 vthree = _mm256_load_ps(params->avx.three); - const __m256 vhalf = _mm256_load_ps(params->avx.half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vhalf = _mm256_set1_ps(0.5f); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -96,7 +98,7 @@ void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u${BATCH_TILE}( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) ¶ms->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vrsqrt/avx512f-rsqrt.c.in b/src/f32-vrsqrt/avx512f-rsqrt.c.in index 4912b87e3a8..7c1c6e1f97d 100644 --- a/src/f32-vrsqrt/avx512f-rsqrt.c.in +++ b/src/f32-vrsqrt/avx512f-rsqrt.c.in @@ -43,8 +43,8 @@ void xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u${BATCH_TILE}( assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m512 vthree = _mm512_set1_ps(params->avx512.three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vthree = _mm512_set1_ps(3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); $if BATCH_TILE > 16: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { diff --git a/src/f32-vrsqrt/fma3-rsqrt.c.in b/src/f32-vrsqrt/fma3-rsqrt.c.in index e3b270ce983..79242f2b23c 100644 --- a/src/f32-vrsqrt/fma3-rsqrt.c.in +++ b/src/f32-vrsqrt/fma3-rsqrt.c.in @@ -39,14 +39,16 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u${BATCH_TILE}( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -94,7 +96,7 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u${BATCH_TILE}( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) ¶ms->fma3.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u16.c b/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u16.c index 482df3ccdaa..62bd085566b 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u16.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u16.c @@ -38,14 +38,16 @@ void 
xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m256 vthree = _mm256_load_ps(params->avx.three); - const __m256 vhalf = _mm256_load_ps(params->avx.half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vhalf = _mm256_set1_ps(0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -93,7 +95,7 @@ void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) ¶ms->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u32.c b/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u32.c index 34560af589c..c76624e0569 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u32.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u32.c @@ -38,14 +38,16 @@ void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m256 vthree = _mm256_load_ps(params->avx.three); - const __m256 vhalf = _mm256_load_ps(params->avx.half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vhalf = _mm256_set1_ps(0.5f); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -109,7 +111,7 @@ void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) ¶ms->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u8.c b/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u8.c index ad91be254ba..9d2e42bdfa7 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u8.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-avx-rsqrt-u8.c @@ -38,14 +38,16 @@ void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m256 vthree = _mm256_load_ps(params->avx.three); - const __m256 vhalf = _mm256_load_ps(params->avx.half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vhalf = _mm256_set1_ps(0.5f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 vx = _mm256_loadu_ps(input); @@ -67,7 +69,7 @@ void xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) ¶ms->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u16.c b/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u16.c index e847b5e1e1d..c76e7f0dcad 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u16.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u16.c @@ -43,8 +43,8 @@ void xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m512 vthree = _mm512_set1_ps(params->avx512.three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vthree = _mm512_set1_ps(3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 vx = _mm512_loadu_ps(input); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u32.c b/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u32.c index 1fbeef49d47..131fe43e1ff 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u32.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u32.c @@ -43,8 +43,8 @@ void xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32( assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m512 vthree = _mm512_set1_ps(params->avx512.three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vthree = _mm512_set1_ps(3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 vx0 = _mm512_loadu_ps(input); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u64.c b/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u64.c index 0c70cc48191..e107a116535 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u64.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-avx512f-rsqrt-u64.c @@ -43,8 +43,8 @@ void xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m512 vthree = _mm512_set1_ps(params->avx512.three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vthree = _mm512_set1_ps(3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); for (; batch >= 64 * sizeof(float); batch -= 64 * sizeof(float)) { const __m512 vx0 = _mm512_loadu_ps(input); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u16.c b/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u16.c index 04ebda6d5a9..90b587d5383 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u16.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u16.c @@ -39,14 +39,16 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -91,7 +93,7 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) ¶ms->fma3.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u32.c b/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u32.c index 5bccd9a9e27..82a0b5a6199 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u32.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u32.c @@ -39,14 +39,16 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -105,7 +107,7 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) ¶ms->fma3.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u8.c b/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u8.c index 8eb8a97af79..a8e881ca62f 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u8.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-fma3-rsqrt-u8.c @@ -39,14 +39,16 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8( float* output, const union xnn_f32_rsqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 vx = _mm256_loadu_ps(input); @@ -67,7 +69,7 @@ void xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &params->fma3.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*)((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u16.c b/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u16.c index 8022316efe9..670e687e9c8 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u16.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u16.c @@ -44,8 +44,8 @@ void xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m128 vx0123 = _mm_loadu_ps(input); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u4.c b/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u4.c index 00eee9a6bbb..7b3969eb154 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u4.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u4.c @@ -44,8 +44,8 @@ void xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4( assert(output != NULL); // Constants for the Newton-Raphson iteration.
- const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 vx = _mm_loadu_ps(input); diff --git a/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u8.c b/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u8.c index 667d8c1aad4..e41d0582841 100644 --- a/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u8.c +++ b/src/f32-vrsqrt/gen/f32-vrsqrt-sse-rsqrt-u8.c @@ -44,8 +44,8 @@ void xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 vx0123 = _mm_loadu_ps(input); diff --git a/src/f32-vrsqrt/sse-rsqrt.c.in b/src/f32-vrsqrt/sse-rsqrt.c.in index d849d4174a2..2094d798bd9 100644 --- a/src/f32-vrsqrt/sse-rsqrt.c.in +++ b/src/f32-vrsqrt/sse-rsqrt.c.in @@ -43,8 +43,8 @@ void xnn_f32_vrsqrt_ukernel__sse_rsqrt_u${BATCH_TILE}( assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); $if BATCH_TILE > 4: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { diff --git a/src/f32-vsqrt/avx-rsqrt.c.in b/src/f32-vsqrt/avx-rsqrt.c.in index 9853cb8c228..b037efb2d21 100644 --- a/src/f32-vsqrt/avx-rsqrt.c.in +++ b/src/f32-vsqrt/avx-rsqrt.c.in @@ -42,14 +42,16 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u${BATCH_TILE}( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m256 kThree = _mm256_load_ps(params->avx.three); - const __m256 kHalf = _mm256_load_ps(params->avx.half); + const __m256 kThree = _mm256_set1_ps(3.0f); + const __m256 kHalf = _mm256_set1_ps(0.5f); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -114,7 +116,7 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u${BATCH_TILE}( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)&params->avx.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vsqrt/avx-sqrt.c.in b/src/f32-vsqrt/avx-sqrt.c.in index 4766ab24c88..f95e92b0b28 100644 --- a/src/f32-vsqrt/avx-sqrt.c.in +++ b/src/f32-vsqrt/avx-sqrt.c.in @@ -20,6 +20,8 @@ void xnn_f32_vsqrt_ukernel__avx_sqrt_u${BATCH_TILE}( float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1,
-1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -50,7 +52,7 @@ void xnn_f32_vsqrt_ukernel__avx_sqrt_u${BATCH_TILE}( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); const __m256 vy = _mm256_sqrt_ps(vx); diff --git a/src/f32-vsqrt/avx512f-nr1fma1adj.c.in b/src/f32-vsqrt/avx512f-nr1fma1adj.c.in index bc59c345ca2..ef84f3e5d83 100644 --- a/src/f32-vsqrt/avx512f-nr1fma1adj.c.in +++ b/src/f32-vsqrt/avx512f-nr1fma1adj.c.in @@ -27,7 +27,7 @@ void xnn_f32_vsqrt_ukernel__avx512f_nr1fma1adj_u${BATCH_TILE}( assert(input != NULL); assert(output != NULL); - const __m512 vhalf = _mm512_set1_ps(params->avx512.half); + const __m512 vhalf = _mm512_set1_ps(0.5f); $if BATCH_TILE > 16: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { const __m512 vx${ABC[0]} = _mm512_loadu_ps(input); diff --git a/src/f32-vsqrt/avx512f-rsqrt.c.in b/src/f32-vsqrt/avx512f-rsqrt.c.in index 16e663413fe..c57b78fd225 100644 --- a/src/f32-vsqrt/avx512f-rsqrt.c.in +++ b/src/f32-vsqrt/avx512f-rsqrt.c.in @@ -44,8 +44,8 @@ void xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u${BATCH_TILE}( assert(output != NULL); // Constants for the Newton-Raphson iteration.
- const __m512 vneg_three = _mm512_set1_ps(params->avx512.neg_three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vneg_three = _mm512_set1_ps(-3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); $if BATCH_TILE > 16: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { diff --git a/src/f32-vsqrt/fma3-nr1fma1adj.c.in b/src/f32-vsqrt/fma3-nr1fma1adj.c.in index 59043e6c996..c14c70ed18e 100644 --- a/src/f32-vsqrt/fma3-nr1fma1adj.c.in +++ b/src/f32-vsqrt/fma3-nr1fma1adj.c.in @@ -26,7 +26,7 @@ void xnn_f32_vsqrt_ukernel__fma3_nr1fma1adj_u${BATCH_TILE}( assert(input != NULL); assert(output != NULL); - const __m256 vhalf = _mm256_load_ps(params->fma.half); + const __m256 vhalf = _mm256_set1_ps(0.5f); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { const __m256 vx${ABC[0]} = _mm256_loadu_ps(input); diff --git a/src/f32-vsqrt/fma3-rsqrt.c.in b/src/f32-vsqrt/fma3-rsqrt.c.in index c6b02861191..9b50b9b0cb6 100644 --- a/src/f32-vsqrt/fma3-rsqrt.c.in +++ b/src/f32-vsqrt/fma3-rsqrt.c.in @@ -41,14 +41,16 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u${BATCH_TILE}( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); $if BATCH_TILE > 8: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { @@ -110,7 +112,7 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u${BATCH_TILE}( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)&params->fma3.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u16.c b/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u16.c index 2df4d04e205..0b9b32ea88d 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u16.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u16.c @@ -42,14 +42,16 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u16( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration.
- const __m256 kThree = _mm256_load_ps(params->avx.three); - const __m256 kHalf = _mm256_load_ps(params->avx.half); + const __m256 kThree = _mm256_set1_ps(3.0f); + const __m256 kHalf = _mm256_set1_ps(0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -111,7 +113,7 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u16( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)&params->avx.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u32.c b/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u32.c index 215e2bfbc71..f9b48ef3f09 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u32.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u32.c @@ -42,14 +42,16 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u32( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration.
- const __m256 kThree = _mm256_load_ps(params->avx.three); - const __m256 kHalf = _mm256_load_ps(params->avx.half); + const __m256 kThree = _mm256_set1_ps(3.0f); + const __m256 kHalf = _mm256_set1_ps(0.5f); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -133,7 +135,7 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u32( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)&params->avx.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u8.c b/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u8.c index 0611f38568a..7b41bd12de7 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u8.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-avx-rsqrt-u8.c @@ -42,14 +42,16 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u8( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration.
- const __m256 kThree = _mm256_load_ps(params->avx.three); - const __m256 kHalf = _mm256_load_ps(params->avx.half); + const __m256 kThree = _mm256_set1_ps(3.0f); + const __m256 kHalf = _mm256_set1_ps(0.5f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 vx = _mm256_loadu_ps(input); @@ -77,7 +79,7 @@ void xnn_f32_vsqrt_ukernel__avx_rsqrt_u8( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)&params->avx.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u16.c b/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u16.c index f83a78ab4c6..3225e73d29a 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u16.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u16.c @@ -21,6 +21,8 @@ void xnn_f32_vsqrt_ukernel__avx_sqrt_u16( float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -48,7 +50,7 @@ void xnn_f32_vsqrt_ukernel__avx_sqrt_u16( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); const __m256 vy = _mm256_sqrt_ps(vx); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u32.c b/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u32.c index 07b6c6e9fcb..574c5b78b8c 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u32.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u32.c @@ -21,6 +21,8 @@ void xnn_f32_vsqrt_ukernel__avx_sqrt_u32( float* output, const union
xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -54,7 +56,7 @@ void xnn_f32_vsqrt_ukernel__avx_sqrt_u32( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); const __m256 vy = _mm256_sqrt_ps(vx); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u8.c b/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u8.c index 7afbbf20b3e..97c56f30e74 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u8.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-avx-sqrt-u8.c @@ -21,6 +21,8 @@ void xnn_f32_vsqrt_ukernel__avx_sqrt_u8( float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -36,7 +38,7 @@ void xnn_f32_vsqrt_ukernel__avx_sqrt_u8( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); const __m256 vy = _mm256_sqrt_ps(vx); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u16.c b/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u16.c index 68b87ce4c82..b4781367663 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u16.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u16.c @@ -44,8 +44,8 @@ void
xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m512 vneg_three = _mm512_set1_ps(params->avx512.neg_three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vneg_three = _mm512_set1_ps(-3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m512 vx = _mm512_loadu_ps(input); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u32.c b/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u32.c index db23afd1605..3429d43308b 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u32.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u32.c @@ -44,8 +44,8 @@ void xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m512 vneg_three = _mm512_set1_ps(params->avx512.neg_three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vneg_three = _mm512_set1_ps(-3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m512 vx0 = _mm512_loadu_ps(input); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u48.c b/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u48.c index 6cfa2270951..23b1b8a276d 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u48.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-avx512f-rsqrt-u48.c @@ -44,8 +44,8 @@ void xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48( assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m512 vneg_three = _mm512_set1_ps(params->avx512.neg_three); - const __m512 vneg_half = _mm512_set1_ps(params->avx512.neg_half); + const __m512 vneg_three = _mm512_set1_ps(-3.0f); + const __m512 vneg_half = _mm512_set1_ps(-0.5f); for (; batch >= 48 * sizeof(float); batch -= 48 * sizeof(float)) { const __m512 vx0 = _mm512_loadu_ps(input); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u16.c b/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u16.c index 78174d04ffb..b4312ff8f4c 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u16.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u16.c @@ -41,14 +41,16 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -107,7 +109,7 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)&params->fma3.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u32.c b/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u32.c index 32a3adfa870..7ef2b81efc3 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u32.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u32.c @@ -41,14 +41,16 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration.
- const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); for (; batch >= 32 * sizeof(float); batch -= 32 * sizeof(float)) { const __m256 vx0 = _mm256_loadu_ps(input); @@ -127,7 +129,7 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)&params->fma3.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u8.c b/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u8.c index f05c91cfa0f..9d6ba795743 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u8.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-fma3-rsqrt-u8.c @@ -41,14 +41,16 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8( size_t batch, const float* input, float* output, const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); assert(output != NULL); // Constants for the Newton-Raphson iteration.
- const __m256 vthree = _mm256_load_ps(params->fma3.three); - const __m256 vneg_half = _mm256_load_ps(params->fma3.neg_half); + const __m256 vthree = _mm256_set1_ps(3.0f); + const __m256 vneg_half = _mm256_set1_ps(-0.5f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m256 vx = _mm256_loadu_ps(input); @@ -75,7 +77,7 @@ void xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8( assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); const __m256i vmask = _mm256_loadu_si256( - (const __m256i*)((uintptr_t)&params->fma3.mask_table[7] - batch)); + (const __m256i*)((uintptr_t)&mask_table[7] - batch)); const __m256 vx = _mm256_maskload_ps(input, vmask); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u12.c b/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u12.c index 2364325b78f..39babd76bb2 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u12.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u12.c @@ -48,8 +48,8 @@ void xnn_f32_vsqrt_ukernel__sse_rsqrt_u12( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); for (; batch >= 12 * sizeof(float); batch -= 12 * sizeof(float)) { const __m128 vx0 = _mm_loadu_ps(input); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u4.c b/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u4.c index f0b6bfecfff..9d87bfb5878 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u4.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u4.c @@ -48,8 +48,8 @@ void xnn_f32_vsqrt_ukernel__sse_rsqrt_u4( assert(output != NULL); // Constants for the Newton-Raphson iteration.
- const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) { const __m128 vx = _mm_loadu_ps(input); diff --git a/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u8.c b/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u8.c index 2762db04e55..248477621ef 100644 --- a/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u8.c +++ b/src/f32-vsqrt/gen/f32-vsqrt-sse-rsqrt-u8.c @@ -48,8 +48,8 @@ void xnn_f32_vsqrt_ukernel__sse_rsqrt_u8( assert(output != NULL); // Constants for the Newton-Raphson iteration. - const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) { const __m128 vx0 = _mm_loadu_ps(input); diff --git a/src/f32-vsqrt/sse-rsqrt.c.in b/src/f32-vsqrt/sse-rsqrt.c.in index ce924149f12..4c61552d436 100644 --- a/src/f32-vsqrt/sse-rsqrt.c.in +++ b/src/f32-vsqrt/sse-rsqrt.c.in @@ -48,8 +48,8 @@ void xnn_f32_vsqrt_ukernel__sse_rsqrt_u${BATCH_TILE}( assert(output != NULL); // Constants for the Newton-Raphson iteration. 
- const __m128 vthree = _mm_load_ps(params->sse.three); - const __m128 vhalf = _mm_load_ps(params->sse.half); + const __m128 vthree = _mm_set1_ps(3.0f); + const __m128 vhalf = _mm_set1_ps(0.5f); $if BATCH_TILE > 4: for (; batch >= ${BATCH_TILE} * sizeof(float); batch -= ${BATCH_TILE} * sizeof(float)) { diff --git a/src/microparams-init.c b/src/microparams-init.c index ea2f75381fb..ca2062f73e2 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c @@ -5011,118 +5011,6 @@ size_t xnn_init_qu8_lrelu_wasmsimd_x86_params( } #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f32_sqrt_sse_params( - union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)]) { - for (uint32_t i = 0; i < 4; i++) { - params->sse.three[i] = 3.0f; - } - for (uint32_t i = 0; i < 4; i++) { - params->sse.half[i] = 0.5f; - } - return sizeof(params->sse); -} - -size_t xnn_init_f32_sqrt_avx_params( - union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 8; i++) { - params->avx.three[i] = 3.0f; - } - for (uint32_t i = 0; i < 8; i++) { - params->avx.half[i] = 0.5f; - } - for (uint32_t i = 0; i < 7; i++) { - params->avx.mask_table[i] = -1; - } - for (uint32_t i = 7; i < 14; i++) { - params->avx.mask_table[i] = 0; - } - return sizeof(params->avx); -} - -size_t xnn_init_f32_sqrt_fma_params( - union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)]) -{ - for (uint32_t i = 0; i < 8; i++) { - params->fma3.three[i] = 3.0f; - } - for (uint32_t i = 0; i < 8; i++) { - params->fma3.neg_half[i] = -0.5f; - } - for (uint32_t i = 0; i < 8; i++) { - params->fma3.half[i] = 0.5f; - } - for (uint32_t i = 0; i < 7; i++) { - params->fma3.mask_table[i] = -1; - } - for (uint32_t i = 7; i < 14; i++) { - params->fma3.mask_table[i] = 0; - } - return sizeof(params->fma3); -} - -size_t xnn_init_f32_sqrt_avx512_params( - union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)]) -{ - params->avx512.neg_three = -3.0f; - params->avx512.neg_half 
= -0.5f; - params->avx512.half = 0.5f; - return sizeof(params->avx512); -} -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f32_rsqrt_sse_params( - union xnn_f32_rsqrt_params params[XNN_MIN_ELEMENTS(1)]) { - for (uint32_t i = 0; i < 4; i++) { - params->sse.three[i] = 3.0f; - } - for (uint32_t i = 0; i < 4; i++) { - params->sse.half[i] = 0.5f; - } - return sizeof(params->sse); -} -size_t xnn_init_f32_rsqrt_avx_params( - union xnn_f32_rsqrt_params params[XNN_MIN_ELEMENTS(1)]) { - for (uint32_t i = 0; i < 8; i++) { - params->avx.three[i] = 3.0f; - } - for (uint32_t i = 0; i < 8; i++) { - params->avx.half[i] = 0.5f; - } - for (uint32_t i = 0; i < 7; i++) { - params->avx.mask_table[i] = -1; - } - for (uint32_t i = 7; i < 14; i++) { - params->avx.mask_table[i] = 0; - } - return sizeof(params->avx); -} -size_t xnn_init_f32_rsqrt_fma3_params( - union xnn_f32_rsqrt_params params[XNN_MIN_ELEMENTS(1)]) { - for (uint32_t i = 0; i < 8; i++) { - params->fma3.three[i] = 3.0f; - } - for (uint32_t i = 0; i < 8; i++) { - params->fma3.neg_half[i] = -0.5f; - } - for (uint32_t i = 0; i < 7; i++) { - params->fma3.mask_table[i] = -1; - } - for (uint32_t i = 7; i < 14; i++) { - params->fma3.mask_table[i] = 0; - } - return sizeof(params->avx); -} -size_t xnn_init_f32_rsqrt_avx512_params( - union xnn_f32_rsqrt_params params[XNN_MIN_ELEMENTS(1)]) { - params->avx512.three = 3.0f; - params->avx512.neg_half = -0.5f; - return sizeof(params->avx512); -} -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - #if XNN_ARCH_ARM || XNN_ARCH_ARM64 size_t xnn_init_f16_chw_neonfp16arith_stride1_params( union xnn_f16_chw_params params[XNN_MIN_ELEMENTS(1)], diff --git a/src/xnnpack/common.h b/src/xnnpack/common.h index 19eb1d427a2..93069fde067 100644 --- a/src/xnnpack/common.h +++ b/src/xnnpack/common.h @@ -381,6 +381,15 @@ #define XNN_MULTIPASS_EXTRA_BYTES 16 #endif +#if XNN_ARCH_ARM || XNN_ARCH_X86 + // These architectures are slow to broadcast, the compiler tries 
to move them + // into loops, and when it runs out of registers, it will redundantly perform + // the broadcast. Marking them volatile prevents these from being moved into + // loops, and they spill as broadcasted vectors instead. + #define XNN_FORCE_STACK volatile +#else + #define XNN_FORCE_STACK +#endif #define XNN_LOG2_SIZEOF_INT8_T 0 // log2(sizeof(int8_t)) #define XNN_LOG2_SIZEOF_UINT8_T 0 // log2(sizeof(uint8_t)) diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index e3593dbe4dd..9920a32a48f 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -833,26 +833,10 @@ DECLARE_INIT_QU8_LRELU_PARAMS_FUNCTION(xnn_init_qu8_lrelu_scalar_select_params) XNN_INTERNAL size_t fn_name( \ union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)]); -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_F32_SQRT_PARAMS_FUNCTION(xnn_init_f32_sqrt_sse_params) - DECLARE_INIT_F32_SQRT_PARAMS_FUNCTION(xnn_init_f32_sqrt_avx_params) - DECLARE_INIT_F32_SQRT_PARAMS_FUNCTION(xnn_init_f32_sqrt_fma_params) - DECLARE_INIT_F32_SQRT_PARAMS_FUNCTION(xnn_init_f32_sqrt_avx512_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - #define DECLARE_INIT_F32_RSQRT_PARAMS_FUNCTION(fn_name) \ XNN_INTERNAL size_t fn_name( \ union xnn_f32_rsqrt_params params[XNN_MIN_ELEMENTS(1)]); -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_F32_RSQRT_PARAMS_FUNCTION(xnn_init_f32_rsqrt_sse_params) - DECLARE_INIT_F32_RSQRT_PARAMS_FUNCTION(xnn_init_f32_rsqrt_avx_params) - DECLARE_INIT_F32_RSQRT_PARAMS_FUNCTION(xnn_init_f32_rsqrt_fma3_params) - DECLARE_INIT_F32_RSQRT_PARAMS_FUNCTION(xnn_init_f32_rsqrt_avx512_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - #define DECLARE_INIT_F16_CHW_PARAMS_FUNCTION(fn_name) \ XNN_INTERNAL size_t fn_name( \ union xnn_f16_chw_params params[XNN_MIN_ELEMENTS(1)], \ diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index 73437abbb1c..6b9cb9d0b72 100644 --- a/src/xnnpack/microparams.h +++ 
b/src/xnnpack/microparams.h @@ -2836,28 +2836,6 @@ union xnn_f16_sqrt_params { union xnn_f32_sqrt_params { char _; // Dummy member variable to comply with the C standard -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - struct { - XNN_ALIGN(16) float three[4]; - XNN_ALIGN(16) float half[4]; - } sse; - struct { - XNN_ALIGN(32) float three[8]; - XNN_ALIGN(32) float half[8]; - int32_t mask_table[14]; - } avx; - struct { - XNN_ALIGN(32) float three[8]; - XNN_ALIGN(32) float neg_half[8]; - XNN_ALIGN(32) float half[8]; - int32_t mask_table[14]; - } fma3; - struct { - float neg_three; - float neg_half; - float half; - } avx512; -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 }; // Rsqrt (Reciprocal Square Root): used by VRSQRT microkernels. @@ -2868,26 +2846,6 @@ union xnn_f16_rsqrt_params { union xnn_f32_rsqrt_params { char _; // Dummy member variable to comply with the C standard. -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - struct { - XNN_ALIGN(16) float three[4]; - XNN_ALIGN(16) float half[4]; - } sse; - struct { - XNN_ALIGN(32) float three[8]; - XNN_ALIGN(32) float half[8]; - int32_t mask_table[14]; - } avx; - struct { - XNN_ALIGN(32) float three[8]; - XNN_ALIGN(32) float neg_half[8]; - int32_t mask_table[14]; - } fma3; - struct { - float three; - float neg_half; - } avx512; -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 }; // TanH (Hyperbolic Tangent): used by VTANH microkernels. 
diff --git a/test/f32-vrsqrt.cc b/test/f32-vrsqrt.cc index f63907138ae..e56445caa77 100644 --- a/test/f32-vrsqrt.cc +++ b/test/f32-vrsqrt.cc @@ -450,7 +450,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_SSE; VUnaryMicrokernelTester() .batch_size(4) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4); } TEST(F32_VRSQRT__SSE_RSQRT_U4, batch_div_4) { @@ -459,7 +459,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4); } } @@ -469,7 +469,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4); } } @@ -479,7 +479,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4); } } @@ -490,7 +490,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -501,7 +501,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_SSE; VUnaryMicrokernelTester() .batch_size(8) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8); } 
TEST(F32_VRSQRT__SSE_RSQRT_U8, batch_div_8) { @@ -510,7 +510,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8); } } @@ -520,7 +520,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8); } } @@ -530,7 +530,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8); } } @@ -541,7 +541,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -552,7 +552,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_SSE; VUnaryMicrokernelTester() .batch_size(16) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16); } TEST(F32_VRSQRT__SSE_RSQRT_U16, batch_div_16) { @@ -561,7 +561,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16); } } @@ -571,7 
+571,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16); } } @@ -581,7 +581,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16); } } @@ -592,7 +592,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16, xnn_init_f32_rsqrt_sse_params); + .Test(xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -603,7 +603,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_AVX; VUnaryMicrokernelTester() .batch_size(8) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8); } TEST(F32_VRSQRT__AVX_RSQRT_U8, batch_div_8) { @@ -612,7 +612,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8); } } @@ -622,7 +622,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8); } } @@ -632,7 +632,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step 
+ 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8); } } @@ -643,7 +643,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -654,7 +654,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_AVX; VUnaryMicrokernelTester() .batch_size(16) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16); } TEST(F32_VRSQRT__AVX_RSQRT_U16, batch_div_16) { @@ -663,7 +663,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16); } } @@ -673,7 +673,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16); } } @@ -683,7 +683,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16); } } @@ -694,7 +694,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - 
.Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -705,7 +705,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_AVX; VUnaryMicrokernelTester() .batch_size(32) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32); } TEST(F32_VRSQRT__AVX_RSQRT_U32, batch_div_32) { @@ -714,7 +714,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32); } } @@ -724,7 +724,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32); } } @@ -734,7 +734,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32); } } @@ -745,7 +745,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_rsqrt_avx_params); + .Test(xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -756,7 +756,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_FMA3; VUnaryMicrokernelTester() .batch_size(8) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_rsqrt_fma3_params); + 
.Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8); } TEST(F32_VRSQRT__FMA3_RSQRT_U8, batch_div_8) { @@ -765,7 +765,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8); } } @@ -775,7 +775,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8); } } @@ -785,7 +785,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8); } } @@ -796,7 +796,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -807,7 +807,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_FMA3; VUnaryMicrokernelTester() .batch_size(16) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16); } TEST(F32_VRSQRT__FMA3_RSQRT_U16, batch_div_16) { @@ -816,7 +816,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, 
xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16); } } @@ -826,7 +826,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16); } } @@ -836,7 +836,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16); } } @@ -847,7 +847,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -858,7 +858,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_FMA3; VUnaryMicrokernelTester() .batch_size(32) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32); } TEST(F32_VRSQRT__FMA3_RSQRT_U32, batch_div_32) { @@ -867,7 +867,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32); } } @@ -877,7 +877,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_rsqrt_fma3_params); + 
.Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32); } } @@ -887,7 +887,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32); } } @@ -898,7 +898,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_rsqrt_fma3_params); + .Test(xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -909,7 +909,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_AVX512F; VUnaryMicrokernelTester() .batch_size(16) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16); } TEST(F32_VRSQRT__AVX512F_RSQRT_U16, batch_div_16) { @@ -918,7 +918,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16); } } @@ -928,7 +928,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16); } } @@ -938,7 +938,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_rsqrt_avx512_params); + 
.Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16); } } @@ -949,7 +949,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -960,7 +960,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_AVX512F; VUnaryMicrokernelTester() .batch_size(32) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32); } TEST(F32_VRSQRT__AVX512F_RSQRT_U32, batch_div_32) { @@ -969,7 +969,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32); } } @@ -979,7 +979,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32); } } @@ -989,7 +989,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32); } } @@ -1000,7 +1000,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32); } } #endif 
// XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1011,7 +1011,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { TEST_REQUIRES_X86_AVX512F; VUnaryMicrokernelTester() .batch_size(64) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64); } TEST(F32_VRSQRT__AVX512F_RSQRT_U64, batch_div_64) { @@ -1020,7 +1020,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64); } } @@ -1030,7 +1030,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64); } } @@ -1040,7 +1040,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64); } } @@ -1051,7 +1051,7 @@ TEST(F32_VRSQRT__SCALAR_RSQRT_U4, inplace) { VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64, xnn_init_f32_rsqrt_avx512_params); + .Test(xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/f32-vrsqrt.yaml b/test/f32-vrsqrt.yaml index 871b6bcdca5..86ee338bbc5 100644 --- a/test/f32-vrsqrt.yaml +++ b/test/f32-vrsqrt.yaml @@ -20,32 +20,20 @@ # x86 SSE - name: xnn_f32_vrsqrt_ukernel__sse_rsqrt_u4 - init: xnn_init_f32_rsqrt_sse_params - name: 
xnn_f32_vrsqrt_ukernel__sse_rsqrt_u8 - init: xnn_init_f32_rsqrt_sse_params - name: xnn_f32_vrsqrt_ukernel__sse_rsqrt_u16 - init: xnn_init_f32_rsqrt_sse_params # x86 AVX - name: xnn_f32_vrsqrt_ukernel__avx_rsqrt_u8 - init: xnn_init_f32_rsqrt_avx_params - name: xnn_f32_vrsqrt_ukernel__avx_rsqrt_u16 - init: xnn_init_f32_rsqrt_avx_params - name: xnn_f32_vrsqrt_ukernel__avx_rsqrt_u32 - init: xnn_init_f32_rsqrt_avx_params # x86 FMA3 - name: xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u8 - init: xnn_init_f32_rsqrt_fma3_params - name: xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16 - init: xnn_init_f32_rsqrt_fma3_params - name: xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u32 - init: xnn_init_f32_rsqrt_fma3_params # x86 AVX512 - name: xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u16 - init: xnn_init_f32_rsqrt_avx512_params - name: xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32 - init: xnn_init_f32_rsqrt_avx512_params - name: xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u64 - init: xnn_init_f32_rsqrt_avx512_params diff --git a/test/f32-vsqrt.cc b/test/f32-vsqrt.cc index b3e33af640c..c247c4f998c 100644 --- a/test/f32-vsqrt.cc +++ b/test/f32-vsqrt.cc @@ -820,7 +820,7 @@ TEST_REQUIRES_X86_SSE; VUnaryMicrokernelTester() .batch_size(4) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4); } TEST(F32_VSQRT__SSE_RSQRT_U4, batch_div_4) { @@ -829,7 +829,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4); } } @@ -839,7 +839,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4); } } @@ -849,7 +849,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 
* batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4); } } @@ -860,7 +860,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u4); } } @@ -874,10 +874,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_sse_params(¶ms); xnn_f32_vsqrt_ukernel__sse_rsqrt_u4( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -901,7 +899,7 @@ TEST_REQUIRES_X86_SSE; VUnaryMicrokernelTester() .batch_size(8) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8); } TEST(F32_VSQRT__SSE_RSQRT_U8, batch_div_8) { @@ -910,7 +908,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8); } } @@ -920,7 +918,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8); } } @@ -930,7 +928,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8); } } @@ -941,7 +939,7 @@ VUnaryMicrokernelTester() 
.batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u8); } } @@ -955,10 +953,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_sse_params(¶ms); xnn_f32_vsqrt_ukernel__sse_rsqrt_u8( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -982,7 +978,7 @@ TEST_REQUIRES_X86_SSE; VUnaryMicrokernelTester() .batch_size(12) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12); } TEST(F32_VSQRT__SSE_RSQRT_U12, batch_div_12) { @@ -991,7 +987,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12); } } @@ -1001,7 +997,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12); } } @@ -1011,7 +1007,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12); } } @@ -1022,7 +1018,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12, xnn_init_f32_sqrt_sse_params); + .Test(xnn_f32_vsqrt_ukernel__sse_rsqrt_u12); } } @@ -1036,10 +1032,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array 
outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_sse_params(¶ms); xnn_f32_vsqrt_ukernel__sse_rsqrt_u12( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1063,7 +1057,7 @@ TEST_REQUIRES_X86_AVX; VUnaryMicrokernelTester() .batch_size(8) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8); } TEST(F32_VSQRT__AVX_SQRT_U8, batch_div_8) { @@ -1072,7 +1066,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8); } } @@ -1082,7 +1076,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8); } } @@ -1092,7 +1086,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8); } } @@ -1103,7 +1097,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u8); } } @@ -1117,10 +1111,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_avx_params(¶ms); xnn_f32_vsqrt_ukernel__avx_sqrt_u8( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); 
for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1144,7 +1136,7 @@ TEST_REQUIRES_X86_AVX; VUnaryMicrokernelTester() .batch_size(16) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16); } TEST(F32_VSQRT__AVX_SQRT_U16, batch_div_16) { @@ -1153,7 +1145,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16); } } @@ -1163,7 +1155,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16); } } @@ -1173,7 +1165,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16); } } @@ -1184,7 +1176,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u16); } } @@ -1198,10 +1190,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_avx_params(¶ms); xnn_f32_vsqrt_ukernel__avx_sqrt_u16( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1225,7 +1215,7 @@ TEST_REQUIRES_X86_AVX; VUnaryMicrokernelTester() .batch_size(32) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32, xnn_init_f32_sqrt_avx_params); 
+ .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32); } TEST(F32_VSQRT__AVX_SQRT_U32, batch_div_32) { @@ -1234,7 +1224,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32); } } @@ -1244,7 +1234,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32); } } @@ -1254,7 +1244,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32); } } @@ -1265,7 +1255,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_sqrt_u32); } } @@ -1279,10 +1269,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_avx_params(¶ms); xnn_f32_vsqrt_ukernel__avx_sqrt_u32( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1306,7 +1294,7 @@ TEST_REQUIRES_X86_AVX; VUnaryMicrokernelTester() .batch_size(8) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8); } TEST(F32_VSQRT__AVX_RSQRT_U8, batch_div_8) { @@ -1315,7 +1303,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() 
.batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8); } } @@ -1325,7 +1313,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8); } } @@ -1335,7 +1323,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8); } } @@ -1346,7 +1334,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u8); } } @@ -1360,10 +1348,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_avx_params(¶ms); xnn_f32_vsqrt_ukernel__avx_rsqrt_u8( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1387,7 +1373,7 @@ TEST_REQUIRES_X86_AVX; VUnaryMicrokernelTester() .batch_size(16) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16); } TEST(F32_VSQRT__AVX_RSQRT_U16, batch_div_16) { @@ -1396,7 +1382,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16); } } @@ -1406,7 +1392,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { 
VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16); } } @@ -1416,7 +1402,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16); } } @@ -1427,7 +1413,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u16); } } @@ -1441,10 +1427,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_avx_params(¶ms); xnn_f32_vsqrt_ukernel__avx_rsqrt_u16( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1468,7 +1452,7 @@ TEST_REQUIRES_X86_AVX; VUnaryMicrokernelTester() .batch_size(32) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32); } TEST(F32_VSQRT__AVX_RSQRT_U32, batch_div_32) { @@ -1477,7 +1461,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32); } } @@ -1487,7 +1471,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32); } } @@ -1497,7 +1481,7 @@ for (size_t batch_size = batch_step + 
1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32); } } @@ -1508,7 +1492,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32, xnn_init_f32_sqrt_avx_params); + .Test(xnn_f32_vsqrt_ukernel__avx_rsqrt_u32); } } @@ -1522,10 +1506,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_avx_params(¶ms); xnn_f32_vsqrt_ukernel__avx_rsqrt_u32( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1549,7 +1531,7 @@ TEST_REQUIRES_X86_FMA3; VUnaryMicrokernelTester() .batch_size(8) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8); } TEST(F32_VSQRT__FMA3_RSQRT_U8, batch_div_8) { @@ -1558,7 +1540,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8); } } @@ -1568,7 +1550,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8); } } @@ -1578,7 +1560,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8); } } @@ -1589,7 
+1571,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8); } } @@ -1603,10 +1585,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_fma_params(¶ms); xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1630,7 +1610,7 @@ TEST_REQUIRES_X86_FMA3; VUnaryMicrokernelTester() .batch_size(16) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16); } TEST(F32_VSQRT__FMA3_RSQRT_U16, batch_div_16) { @@ -1639,7 +1619,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16); } } @@ -1649,7 +1629,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16); } } @@ -1659,7 +1639,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16); } } @@ -1670,7 +1650,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16); } } @@ -1684,10 +1664,8 @@ 
std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_fma_params(¶ms); xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1711,7 +1689,7 @@ TEST_REQUIRES_X86_FMA3; VUnaryMicrokernelTester() .batch_size(32) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32); } TEST(F32_VSQRT__FMA3_RSQRT_U32, batch_div_32) { @@ -1720,7 +1698,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32); } } @@ -1730,7 +1708,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32); } } @@ -1740,7 +1718,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32); } } @@ -1751,7 +1729,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32, xnn_init_f32_sqrt_fma_params); + .Test(xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32); } } @@ -1765,10 +1743,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_fma_params(¶ms); xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32( - num_elements * sizeof(float), 
inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1792,7 +1768,7 @@ TEST_REQUIRES_X86_AVX512F; VUnaryMicrokernelTester() .batch_size(16) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16); } TEST(F32_VSQRT__AVX512F_RSQRT_U16, batch_div_16) { @@ -1801,7 +1777,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16); } } @@ -1811,7 +1787,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16); } } @@ -1821,7 +1797,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16); } } @@ -1832,7 +1808,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16); } } @@ -1846,10 +1822,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_avx512_params(¶ms); xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if 
(std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1873,7 +1847,7 @@ TEST_REQUIRES_X86_AVX512F; VUnaryMicrokernelTester() .batch_size(32) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32); } TEST(F32_VSQRT__AVX512F_RSQRT_U32, batch_div_32) { @@ -1882,7 +1856,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32); } } @@ -1892,7 +1866,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32); } } @@ -1902,7 +1876,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32); } } @@ -1913,7 +1887,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32); } } @@ -1927,10 +1901,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_avx512_params(¶ms); xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( @@ -1954,7 +1926,7 @@ TEST_REQUIRES_X86_AVX512F; VUnaryMicrokernelTester() .batch_size(48) - 
.Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48); } TEST(F32_VSQRT__AVX512F_RSQRT_U48, batch_div_48) { @@ -1963,7 +1935,7 @@ for (size_t batch_size = 2 * batch_step; batch_size < 10 * batch_step; batch_size += batch_step) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48); } } @@ -1973,7 +1945,7 @@ for (size_t batch_size = 1; batch_size < batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48); } } @@ -1983,7 +1955,7 @@ for (size_t batch_size = batch_step + 1; batch_size < 2 * batch_step; batch_size++) { VUnaryMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48); } } @@ -1994,7 +1966,7 @@ VUnaryMicrokernelTester() .batch_size(batch_size) .inplace(true) - .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48, xnn_init_f32_sqrt_avx512_params); + .Test(xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48); } } @@ -2008,10 +1980,8 @@ std::array expected = {0.0f, -0.0f, 1.0f, NAN}; std::array outputs; - union xnn_f32_sqrt_params params; - xnn_init_f32_sqrt_avx512_params(¶ms); xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48( - num_elements * sizeof(float), inputs.data(), outputs.data(), ¶ms); + num_elements * sizeof(float), inputs.data(), outputs.data(), nullptr); for (int i = 0; i < num_elements; i++) { if (std::isfinite(expected[i])) { EXPECT_NEAR( diff --git a/test/f32-vsqrt.yaml b/test/f32-vsqrt.yaml index e9fa74ad2be..1e7d40bbf21 100644 --- a/test/f32-vsqrt.yaml +++ b/test/f32-vsqrt.yaml @@ -20,42 +20,27 @@ - name: xnn_f32_vsqrt_ukernel__sse_sqrt_u16 - name: xnn_f32_vsqrt_ukernel__sse_rsqrt_u4 - init: 
xnn_init_f32_sqrt_sse_params - name: xnn_f32_vsqrt_ukernel__sse_rsqrt_u8 - init: xnn_init_f32_sqrt_sse_params - name: xnn_f32_vsqrt_ukernel__sse_rsqrt_u12 - init: xnn_init_f32_sqrt_sse_params # x86 AVX - name: xnn_f32_vsqrt_ukernel__avx_sqrt_u8 - init: xnn_init_f32_sqrt_avx_params - name: xnn_f32_vsqrt_ukernel__avx_sqrt_u16 - init: xnn_init_f32_sqrt_avx_params - name: xnn_f32_vsqrt_ukernel__avx_sqrt_u32 - init: xnn_init_f32_sqrt_avx_params - name: xnn_f32_vsqrt_ukernel__avx_rsqrt_u8 - init: xnn_init_f32_sqrt_avx_params - name: xnn_f32_vsqrt_ukernel__avx_rsqrt_u16 - init: xnn_init_f32_sqrt_avx_params - name: xnn_f32_vsqrt_ukernel__avx_rsqrt_u32 - init: xnn_init_f32_sqrt_avx_params # x86 FMA3 - name: xnn_f32_vsqrt_ukernel__fma3_rsqrt_u8 - init: xnn_init_f32_sqrt_fma_params - name: xnn_f32_vsqrt_ukernel__fma3_rsqrt_u16 - init: xnn_init_f32_sqrt_fma_params - name: xnn_f32_vsqrt_ukernel__fma3_rsqrt_u32 - init: xnn_init_f32_sqrt_fma_params # x86 AVX512 - name: xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u16 - init: xnn_init_f32_sqrt_avx512_params - name: xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u32 - init: xnn_init_f32_sqrt_avx512_params - name: xnn_f32_vsqrt_ukernel__avx512f_rsqrt_u48 - init: xnn_init_f32_sqrt_avx512_params # WAsm SIMD - name: xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_u4 diff --git a/test/vunary-microkernel-tester.h b/test/vunary-microkernel-tester.h index 492f43f5763..d7c59ab8856 100644 --- a/test/vunary-microkernel-tester.h +++ b/test/vunary-microkernel-tester.h @@ -174,10 +174,10 @@ class VUnaryMicrokernelTester { xnn_init_f32_default_params_fn init_params = nullptr) const; void Test(xnn_f16_vhswish_ukernel_fn vhswish, - xnn_init_f16_hswish_params_fn init_params) const; + xnn_init_f16_hswish_params_fn init_params = nullptr) const; void Test(xnn_f32_vhswish_ukernel_fn vhswish, - xnn_init_f32_hswish_params_fn init_params) const; + xnn_init_f32_hswish_params_fn init_params = nullptr) const; void Test(xnn_f16_vlrelu_ukernel_fn vlrelu, xnn_init_f16_lrelu_params_fn 
init_params) const;