diff --git a/bench/f16-f32acc-rdsum.cc b/bench/f16-f32acc-rdsum.cc index b28816287dc6..75d4b6d3be28 100644 --- a/bench/f16-f32acc-rdsum.cc +++ b/bench/f16-f32acc-rdsum.cc @@ -52,7 +52,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32acc_rdsum, f16c_c16, xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, - xnn_init_f16_f32acc_scale_avx_params, + xnn_init_f16_f32acc_scale_scalar_params, benchmark::utils::CheckF16C) ->Apply(BenchmarkRDSUM) ->UseRealTime(); @@ -62,7 +62,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32acc_rdsum, f16c_c32, xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, - xnn_init_f16_f32acc_scale_avx_params, + xnn_init_f16_f32acc_scale_scalar_params, benchmark::utils::CheckF16C) ->Apply(BenchmarkRDSUM) ->UseRealTime(); @@ -72,7 +72,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32acc_rdsum, f16c_c64, xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, - xnn_init_f16_f32acc_scale_avx_params, + xnn_init_f16_f32acc_scale_scalar_params, benchmark::utils::CheckF16C) ->Apply(BenchmarkRDSUM) ->UseRealTime(); @@ -82,7 +82,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32acc_rdsum, f16c_c128, xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, - xnn_init_f16_f32acc_scale_avx_params, + xnn_init_f16_f32acc_scale_scalar_params, benchmark::utils::CheckF16C) ->Apply(BenchmarkRDSUM) ->UseRealTime(); diff --git a/bench/f16-f32acc-rsum.cc b/bench/f16-f32acc-rsum.cc index d3d24f52ba0c..1620054fb3b9 100644 --- a/bench/f16-f32acc-rsum.cc +++ b/bench/f16-f32acc-rsum.cc @@ -82,7 +82,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u8, xnn_f16_f32acc_rsum_ukernel__f16c_u8, - xnn_init_f16_f32acc_scale_avx_params, + xnn_init_f16_f32acc_scale_scalar_params, benchmark::utils::CheckF16C) ->Apply(BenchmarkRSUM) ->UseRealTime(); @@ -92,7 +92,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u16_acc2, xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, - 
xnn_init_f16_f32acc_scale_avx_params, + xnn_init_f16_f32acc_scale_scalar_params, benchmark::utils::CheckF16C) ->Apply(BenchmarkRSUM) ->UseRealTime(); @@ -102,7 +102,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u24_acc3, xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, - xnn_init_f16_f32acc_scale_avx_params, + xnn_init_f16_f32acc_scale_scalar_params, benchmark::utils::CheckF16C) ->Apply(BenchmarkRSUM) ->UseRealTime(); @@ -112,7 +112,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u32_acc2, xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, - xnn_init_f16_f32acc_scale_avx_params, + xnn_init_f16_f32acc_scale_scalar_params, benchmark::utils::CheckF16C) ->Apply(BenchmarkRSUM) ->UseRealTime(); @@ -122,7 +122,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u32_acc4, xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, - xnn_init_f16_f32acc_scale_avx_params, + xnn_init_f16_f32acc_scale_scalar_params, benchmark::utils::CheckF16C) ->Apply(BenchmarkRSUM) ->UseRealTime(); diff --git a/bench/f32-rdsum.cc b/bench/f32-rdsum.cc index 448bab07a434..01c2b12bf9d0 100644 --- a/bench/f32-rdsum.cc +++ b/bench/f32-rdsum.cc @@ -58,7 +58,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rdsum, sse_c16, xnn_f32_rdsum_ukernel_7p7x__sse_c16, - xnn_init_f32_scaleminmax_sse_params) + xnn_init_f32_scaleminmax_scalar_params) ->Apply(BenchmarkRDSUM) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -67,7 +67,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rdsum, sse_c32, xnn_f32_rdsum_ukernel_7p7x__sse_c32, - xnn_init_f32_scaleminmax_sse_params) + xnn_init_f32_scaleminmax_scalar_params) ->Apply(BenchmarkRDSUM) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -76,7 +76,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rdsum, sse_c64, 
xnn_f32_rdsum_ukernel_7p7x__sse_c64, - xnn_init_f32_scaleminmax_sse_params) + xnn_init_f32_scaleminmax_scalar_params) ->Apply(BenchmarkRDSUM) ->UseRealTime(); #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -85,7 +85,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rdsum, avx_c16, xnn_f32_rdsum_ukernel_7p7x__avx_c16, - xnn_init_f32_scaleminmax_avx_params, + xnn_init_f32_scaleminmax_scalar_params, benchmark::utils::CheckAVX) ->Apply(BenchmarkRDSUM) ->UseRealTime(); @@ -95,7 +95,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rdsum, avx_c32, xnn_f32_rdsum_ukernel_7p7x__avx_c32, - xnn_init_f32_scaleminmax_avx_params, + xnn_init_f32_scaleminmax_scalar_params, benchmark::utils::CheckAVX) ->Apply(BenchmarkRDSUM) ->UseRealTime(); @@ -105,7 +105,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4, #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rdsum, avx_c64, xnn_f32_rdsum_ukernel_7p7x__avx_c64, - xnn_init_f32_scaleminmax_avx_params, + xnn_init_f32_scaleminmax_scalar_params, benchmark::utils::CheckAVX) ->Apply(BenchmarkRDSUM) ->UseRealTime(); diff --git a/bench/f32-rsum.cc b/bench/f32-rsum.cc index d72489180f4e..2f41378e6cde 100644 --- a/bench/f32-rsum.cc +++ b/bench/f32-rsum.cc @@ -117,7 +117,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rsum, avx_u8, xnn_f32_rsum_ukernel__avx_u8, - xnn_init_f32_scaleminmax_avx_params, + xnn_init_f32_scaleminmax_scalar_params, benchmark::utils::CheckAVX) ->Apply(BenchmarkRSUM) ->UseRealTime(); @@ -127,7 +127,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rsum, avx_u16_acc2, xnn_f32_rsum_ukernel__avx_u16_acc2, - xnn_init_f32_scaleminmax_avx_params, + xnn_init_f32_scaleminmax_scalar_params, benchmark::utils::CheckAVX) ->Apply(BenchmarkRSUM) ->UseRealTime(); @@ -137,7 +137,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rsum, avx_u24_acc3, xnn_f32_rsum_ukernel__avx_u24_acc3, - 
xnn_init_f32_scaleminmax_avx_params, + xnn_init_f32_scaleminmax_scalar_params, benchmark::utils::CheckAVX) ->Apply(BenchmarkRSUM) ->UseRealTime(); @@ -147,7 +147,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rsum, avx_u32_acc2, xnn_f32_rsum_ukernel__avx_u32_acc2, - xnn_init_f32_scaleminmax_avx_params, + xnn_init_f32_scaleminmax_scalar_params, benchmark::utils::CheckAVX) ->Apply(BenchmarkRSUM) ->UseRealTime(); @@ -157,7 +157,7 @@ #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f32_rsum, avx_u32_acc4, xnn_f32_rsum_ukernel__avx_u32_acc4, - xnn_init_f32_scaleminmax_avx_params, + xnn_init_f32_scaleminmax_scalar_params, benchmark::utils::CheckAVX) ->Apply(BenchmarkRSUM) ->UseRealTime(); diff --git a/src/amalgam/gen/avx.c b/src/amalgam/gen/avx.c index eb17d60e267d..1b96fabbff1d 100644 --- a/src/amalgam/gen/avx.c +++ b/src/amalgam/gen/avx.c @@ -3655,14 +3655,16 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c32( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(rows != 0); assert(channels != 0); assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); - const __m256 vmin = _mm256_set1_ps(params->avx.min); - const __m256 vmax = _mm256_set1_ps(params->avx.max); + const __m256 vscale = _mm256_set1_ps(params->scalar.scale); + const __m256 vmin = _mm256_set1_ps(params->scalar.min); + const __m256 vmax = _mm256_set1_ps(params->scalar.max); size_t input_increment = 7 * input_stride; for (; channels >= 32; channels -= 32) { @@ -3844,7 +3846,7 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c32( } if (remainder) { - vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) ¶ms->avx.mask_table[7] - (channels & 0x7) * sizeof(float))); + vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - (channels & 0x7) * sizeof(float))); vacc[num_full_chunks] = 
_mm256_add_ps(_mm256_maskload_ps(&i0[num_full_chunks*8], vmask), vacc[num_full_chunks]); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i1[num_full_chunks*8], vmask), vacc[num_full_chunks]); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i2[num_full_chunks*8], vmask), vacc[num_full_chunks]); @@ -4028,6 +4030,8 @@ void xnn_f32_rsum_ukernel__avx_u32_acc4( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -4061,16 +4065,16 @@ void xnn_f32_rsum_ukernel__avx_u32_acc4( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) ¶ms->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vt = _mm256_maskload_ps(input, vmask); vacc0 = _mm256_add_ps(vacc0, vt); } __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->avx.scale)); - vacc = _mm_max_ss(vacc, _mm_load_ss(¶ms->avx.min)); - vacc = _mm_min_ss(vacc, _mm_load_ss(¶ms->avx.max)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scalar.scale)); + vacc = _mm_max_ss(vacc, _mm_load_ss(¶ms->scalar.min)); + vacc = _mm_min_ss(vacc, _mm_load_ss(¶ms->scalar.max)); *output += _mm_cvtss_f32(vacc); } diff --git a/src/amalgam/gen/avx512skx.c b/src/amalgam/gen/avx512skx.c index 21d4a8f9f29e..d626895d209e 100644 --- a/src/amalgam/gen/avx512skx.c +++ b/src/amalgam/gen/avx512skx.c @@ -71,7 +71,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c64( assert(input != NULL); assert(output != NULL); - const __m512 vscale = 
_mm512_set1_ps(params->scalar.scale); + const __m512 vscale = _mm512_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 64; channels -= 64) { @@ -343,7 +343,7 @@ void xnn_f16_f32acc_rsum_ukernel__avx512skx_u64_acc4( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), _mm256_extractf128_ps(vacc256, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scalar.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/amalgam/gen/f16c.c b/src/amalgam/gen/f16c.c index 809dc311db79..43ba84a7c923 100644 --- a/src/amalgam/gen/f16c.c +++ b/src/amalgam/gen/f16c.c @@ -601,7 +601,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32( assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); + const __m256 vscale = _mm256_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 32; channels -= 32) { @@ -837,6 +837,8 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4( float* output, const union xnn_f16_f32acc_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); @@ -871,7 +873,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint16_t)); assert(batch <= 7 * sizeof(uint16_t)); - const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) ¶ms->avx.mask_table[7] - batch)); + const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &mask_table[7] - batch)); const __m128i vh = _mm_castps_si128(_mm_maskload_ps((const float*) i, vmask)); const __m256 vt = _mm256_cvtph_ps(vh); vacc0 = _mm256_add_ps(vacc0, vt); @@ -885,7 +887,7 @@ void 
xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->avx.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/amalgam/gen/neonfp16arith.c b/src/amalgam/gen/neonfp16arith.c index f333c9b0d4b7..ea73ee110f6b 100644 --- a/src/amalgam/gen/neonfp16arith.c +++ b/src/amalgam/gen/neonfp16arith.c @@ -4276,7 +4276,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__neonfp16arith_c16( assert(input != NULL); assert(output != NULL); - const float32x4_t vscale = vdupq_n_f32(params->scalar.scale); + const float32x4_t vscale = vld1q_dup_f32(¶ms->scale); size_t input_increment = 7 * input_stride; for (; channels >= 16; channels -= 16) { @@ -4532,7 +4532,7 @@ void xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u32_acc4( const float32x4_t vt = vcvt_f32_f16(vh); vacc0 = vaddq_f32(vacc0, vt); } - const float32x2_t vscale = vld1_dup_f32(¶ms->scalar.scale); + const float32x2_t vscale = vld1_dup_f32(¶ms->scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(uint16_t))) { const float16x4_t vh = vreinterpret_f16_u32(vld1_dup_u32((const void*) i)); i += 2; diff --git a/src/amalgam/gen/sse.c b/src/amalgam/gen/sse.c index b635ec371f20..48243467dae1 100644 --- a/src/amalgam/gen/sse.c +++ b/src/amalgam/gen/sse.c @@ -7390,9 +7390,9 @@ void xnn_f32_rdsum_ukernel_7p7x__sse_c16( assert(input != NULL); assert(output != NULL); - const __m128 vscale = _mm_load_ps(params->sse.scale); - const __m128 vmin = _mm_load_ps(params->sse.min); - const __m128 vmax = _mm_load_ps(params->sse.max); + const __m128 vscale = _mm_set1_ps(params->scalar.scale); + const __m128 vmin = _mm_set1_ps(params->scalar.min); + const __m128 vmax = _mm_set1_ps(params->scalar.max); size_t 
input_increment = 7 * input_stride; for (; channels >= 16; channels -= 16) { diff --git a/src/configs/reduce-config.c b/src/configs/reduce-config.c index 26d84def5dd8..e0838b5b6278 100644 --- a/src/configs/reduce-config.c +++ b/src/configs/reduce-config.c @@ -50,7 +50,7 @@ static void init_f16_f32acc_rsum_config(void) { } else if (hardware_config->use_x86_f16c) { f16_f32acc_rsum_config = (struct xnn_reduce_config) { .ukernel = (xnn_reduce_ukernel_fn) xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, - .init.f16_f32acc_scale = xnn_init_f16_f32acc_scale_avx_params, + .init.f16_f32acc_scale = xnn_init_f16_f32acc_scale_scalar_params, .element_tile = 32, }; } @@ -184,7 +184,7 @@ static void init_f32_rsum_config(void) { } else if (hardware_config->use_x86_avx) { f32_rsum_config = (struct xnn_reduce_config) { .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rsum_ukernel__avx_u32_acc4, - .init.f32_scaleminmax = xnn_init_f32_scaleminmax_avx_params, + .init.f32_scaleminmax = xnn_init_f32_scaleminmax_scalar_params, .element_tile = 32, }; } else { @@ -232,7 +232,7 @@ static void init_f16_f32acc_rdsum_config(void) { } else if (hardware_config->use_x86_f16c) { f16_f32acc_rdsum_config = (struct xnn_reduce_config) { .rd_ukernel = (xnn_rdsum_ukernel_fn) xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, - .init.f16_f32acc_scale = xnn_init_f16_f32acc_scale_avx_params, + .init.f16_f32acc_scale = xnn_init_f16_f32acc_scale_scalar_params, .element_tile = 32, }; } @@ -274,13 +274,13 @@ static void init_f32_rdsum_config(void) { } else if (hardware_config->use_x86_avx) { f32_rdsum_config = (struct xnn_reduce_config) { .rd_ukernel = (xnn_rdsum_ukernel_fn) xnn_f32_rdsum_ukernel_7p7x__avx_c32, - .init.f32_scaleminmax = xnn_init_f32_scaleminmax_avx_params, + .init.f32_scaleminmax = xnn_init_f32_scaleminmax_scalar_params, .element_tile = 32, }; } else { f32_rdsum_config = (struct xnn_reduce_config) { .rd_ukernel = (xnn_rdsum_ukernel_fn) xnn_f32_rdsum_ukernel_7p7x__sse_c16, - .init.f32_scaleminmax = 
xnn_init_f32_scaleminmax_sse_params, + .init.f32_scaleminmax = xnn_init_f32_scaleminmax_scalar_params, .element_tile = 16, }; } diff --git a/src/f16-f32acc-rdsum/avx.c.in b/src/f16-f32acc-rdsum/avx.c.in index 5a5dd4a01f23..afd1412583cf 100644 --- a/src/f16-f32acc-rdsum/avx.c.in +++ b/src/f16-f32acc-rdsum/avx.c.in @@ -29,7 +29,7 @@ void xnn_f16_f32acc_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__f16c_c${CHAN assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); + const __m256 vscale = _mm256_set1_ps(params->scale); size_t input_increment = ${ACCUMULATORS} * input_stride; for (; channels >= ${CHANNELS_BATCH}; channels -= ${CHANNELS_BATCH}) { diff --git a/src/f16-f32acc-rdsum/avx512skx.c.in b/src/f16-f32acc-rdsum/avx512skx.c.in index e264ba89a8ef..a61b0d9410b2 100644 --- a/src/f16-f32acc-rdsum/avx512skx.c.in +++ b/src/f16-f32acc-rdsum/avx512skx.c.in @@ -28,7 +28,7 @@ void xnn_f16_f32acc_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__avx512skx_c$ assert(input != NULL); assert(output != NULL); - const __m512 vscale = _mm512_set1_ps(params->scalar.scale); + const __m512 vscale = _mm512_set1_ps(params->scale); size_t input_increment = ${ACCUMULATORS} * input_stride; for (; channels >= ${CHANNELS_BATCH}; channels -= ${CHANNELS_BATCH}) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c128.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c128.c index e5c971d8c765..0c051773e564 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c128.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c128.c @@ -30,7 +30,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c128( assert(input != NULL); assert(output != NULL); - const __m512 vscale = _mm512_set1_ps(params->scalar.scale); + const __m512 vscale = _mm512_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 128; channels -= 128) { diff --git 
a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c16.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c16.c index ac249738ce61..41f6e1b38055 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c16.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c16.c @@ -30,7 +30,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c16( assert(input != NULL); assert(output != NULL); - const __m512 vscale = _mm512_set1_ps(params->scalar.scale); + const __m512 vscale = _mm512_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 16; channels -= 16) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c32.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c32.c index 5a357433ef1a..684c9a7a4ee4 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c32.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c32.c @@ -30,7 +30,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c32( assert(input != NULL); assert(output != NULL); - const __m512 vscale = _mm512_set1_ps(params->scalar.scale); + const __m512 vscale = _mm512_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 32; channels -= 32) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c64.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c64.c index 16077c86bcba..1d9dde05e532 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c64.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-c64.c @@ -30,7 +30,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c64( assert(input != NULL); assert(output != NULL); - const __m512 vscale = _mm512_set1_ps(params->scalar.scale); + const __m512 vscale = _mm512_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 64; channels -= 64) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c128.c 
b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c128.c index 1f046137c927..cf1c697e01c3 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c128.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c128.c @@ -31,7 +31,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128( assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); + const __m256 vscale = _mm256_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 128; channels -= 128) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c16.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c16.c index 220665074555..a3f997d44364 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c16.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c16.c @@ -31,7 +31,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16( assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); + const __m256 vscale = _mm256_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 16; channels -= 16) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c32.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c32.c index b266bb40a3cd..129638a0393b 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c32.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c32.c @@ -31,7 +31,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32( assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); + const __m256 vscale = _mm256_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 32; channels -= 32) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c64.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c64.c index edda80178701..a3e69418b13b 100644 --- 
a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c64.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c64.c @@ -31,7 +31,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64( assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); + const __m256 vscale = _mm256_set1_ps(params->scale); size_t input_increment = 7 * input_stride; for (; channels >= 64; channels -= 64) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c16.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c16.c index f447c9ca238a..b69230579b19 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c16.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c16.c @@ -30,7 +30,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__neonfp16arith_c16( assert(input != NULL); assert(output != NULL); - const float32x4_t vscale = vdupq_n_f32(params->scalar.scale); + const float32x4_t vscale = vld1q_dup_f32(¶ms->scale); size_t input_increment = 7 * input_stride; for (; channels >= 16; channels -= 16) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c32.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c32.c index a240b3ae8d64..784367f5565d 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c32.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c32.c @@ -30,7 +30,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__neonfp16arith_c32( assert(input != NULL); assert(output != NULL); - const float32x4_t vscale = vdupq_n_f32(params->scalar.scale); + const float32x4_t vscale = vld1q_dup_f32(¶ms->scale); size_t input_increment = 7 * input_stride; for (; channels >= 32; channels -= 32) { diff --git a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c64.c b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c64.c index 
2919439bc7d3..64f0a05c948a 100644 --- a/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c64.c +++ b/src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c64.c @@ -30,7 +30,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__neonfp16arith_c64( assert(input != NULL); assert(output != NULL); - const float32x4_t vscale = vdupq_n_f32(params->scalar.scale); + const float32x4_t vscale = vld1q_dup_f32(¶ms->scale); size_t input_increment = 7 * input_stride; for (; channels >= 64; channels -= 64) { diff --git a/src/f16-f32acc-rdsum/neon.c.in b/src/f16-f32acc-rdsum/neon.c.in index 2fe0c58e379b..8cc3c9a48722 100644 --- a/src/f16-f32acc-rdsum/neon.c.in +++ b/src/f16-f32acc-rdsum/neon.c.in @@ -28,7 +28,7 @@ void xnn_f16_f32acc_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__neonfp16arit assert(input != NULL); assert(output != NULL); - const float32x4_t vscale = vdupq_n_f32(params->scalar.scale); + const float32x4_t vscale = vld1q_dup_f32(¶ms->scale); size_t input_increment = ${ACCUMULATORS} * input_stride; for (; channels >= ${CHANNELS_BATCH}; channels -= ${CHANNELS_BATCH}) { diff --git a/src/f16-f32acc-rsum/avx512skx.c.in b/src/f16-f32acc-rsum/avx512skx.c.in index 06be7def0f9c..d5f5ccac22eb 100644 --- a/src/f16-f32acc-rsum/avx512skx.c.in +++ b/src/f16-f32acc-rsum/avx512skx.c.in @@ -72,7 +72,7 @@ void xnn_f16_f32acc_rsum_ukernel__avx512skx_u${BATCH_TILE}${ACC_SUFFIX}( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), _mm256_extractf128_ps(vacc256, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scalar.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/f16c.c.in b/src/f16-f32acc-rsum/f16c.c.in index acea5376be9f..df78c0dbfe9e 100644 --- a/src/f16-f32acc-rsum/f16c.c.in +++ b/src/f16-f32acc-rsum/f16c.c.in @@ -25,6 +25,8 @@ void 
xnn_f16_f32acc_rsum_ukernel__f16c_u${BATCH_TILE}${ACC_SUFFIX}( float* output, const union xnn_f16_f32acc_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); @@ -59,7 +61,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u${BATCH_TILE}${ACC_SUFFIX}( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint16_t)); assert(batch <= 7 * sizeof(uint16_t)); - const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) ¶ms->avx.mask_table[7] - batch)); + const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &mask_table[7] - batch)); const __m128i vh = _mm_castps_si128(_mm_maskload_ps((const float*) i, vmask)); const __m256 vt = _mm256_cvtph_ps(vh); vacc0 = _mm256_add_ps(vacc0, vt); @@ -73,7 +75,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u${BATCH_TILE}${ACC_SUFFIX}( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->avx.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u128-acc4.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u128-acc4.c index e4993efb4c1b..3a9839916851 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u128-acc4.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u128-acc4.c @@ -78,7 +78,7 @@ void xnn_f16_f32acc_rsum_ukernel__avx512skx_u128_acc4( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), _mm256_extractf128_ps(vacc256, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scalar.scale)); + vacc = _mm_mul_ss(vacc, 
_mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u16.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u16.c index bb33703d7753..3cc89342922d 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u16.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u16.c @@ -52,7 +52,7 @@ void xnn_f16_f32acc_rsum_ukernel__avx512skx_u16( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), _mm256_extractf128_ps(vacc256, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scalar.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u32-acc2.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u32-acc2.c index b3644097905e..e08116311060 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u32-acc2.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u32-acc2.c @@ -62,7 +62,7 @@ void xnn_f16_f32acc_rsum_ukernel__avx512skx_u32_acc2( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), _mm256_extractf128_ps(vacc256, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scalar.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u48-acc3.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u48-acc3.c index 175fb2b7ff44..49cff77ad2c2 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u48-acc3.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u48-acc3.c @@ -66,7 +66,7 @@ void xnn_f16_f32acc_rsum_ukernel__avx512skx_u48_acc3( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), 
_mm256_extractf128_ps(vacc256, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scalar.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u64-acc2.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u64-acc2.c index c17f0ade203f..fe79f770ab9a 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u64-acc2.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u64-acc2.c @@ -66,7 +66,7 @@ void xnn_f16_f32acc_rsum_ukernel__avx512skx_u64_acc2( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), _mm256_extractf128_ps(vacc256, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scalar.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u64-acc4.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u64-acc4.c index 037a250533b8..dfe376213e0a 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u64-acc4.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u64-acc4.c @@ -70,7 +70,7 @@ void xnn_f16_f32acc_rsum_ukernel__avx512skx_u64_acc4( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), _mm256_extractf128_ps(vacc256, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scalar.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u16-acc2.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u16-acc2.c index c3126d7ffec9..9d2231a455f6 100644 --- 
a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u16-acc2.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u16-acc2.c @@ -23,6 +23,8 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2( float* output, const union xnn_f16_f32acc_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); @@ -49,7 +51,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint16_t)); assert(batch <= 7 * sizeof(uint16_t)); - const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) ¶ms->avx.mask_table[7] - batch)); + const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &mask_table[7] - batch)); const __m128i vh = _mm_castps_si128(_mm_maskload_ps((const float*) i, vmask)); const __m256 vt = _mm256_cvtph_ps(vh); vacc0 = _mm256_add_ps(vacc0, vt); @@ -63,7 +65,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->avx.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(¶ms->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u24-acc3.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u24-acc3.c index edd1b324d891..797c9e2d727e 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u24-acc3.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u24-acc3.c @@ -23,6 +23,8 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3( float* output, const union xnn_f16_f32acc_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(uint16_t) 
== 0); assert(input != NULL); @@ -53,7 +55,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint16_t)); assert(batch <= 7 * sizeof(uint16_t)); - const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &mask_table[7] - batch)); const __m128i vh = _mm_castps_si128(_mm_maskload_ps((const float*) i, vmask)); const __m256 vt = _mm256_cvtph_ps(vh); vacc0 = _mm256_add_ps(vacc0, vt); @@ -67,7 +69,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc2.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc2.c index 472aa7e719e7..4643ac3cf857 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc2.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc2.c @@ -23,6 +23,8 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2( float* output, const union xnn_f16_f32acc_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); @@ -53,7 +55,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint16_t)); assert(batch <= 7 * sizeof(uint16_t)); - const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &mask_table[7] - batch)); const __m128i vh = 
_mm_castps_si128(_mm_maskload_ps((const float*) i, vmask)); const __m256 vt = _mm256_cvtph_ps(vh); vacc0 = _mm256_add_ps(vacc0, vt); @@ -67,7 +69,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc4.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc4.c index a880061e1a0e..2d2b62a97599 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc4.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc4.c @@ -23,6 +23,8 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4( float* output, const union xnn_f16_f32acc_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); @@ -57,7 +59,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint16_t)); assert(batch <= 7 * sizeof(uint16_t)); - const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &mask_table[7] - batch)); const __m128i vh = _mm_castps_si128(_mm_maskload_ps((const float*) i, vmask)); const __m256 vt = _mm256_cvtph_ps(vh); vacc0 = _mm256_add_ps(vacc0, vt); @@ -71,7 +73,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, 
_mm_load_ss(&params->avx.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u8.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u8.c index d9d3b7f6d418..e6907e2f4975 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u8.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u8.c @@ -23,6 +23,8 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u8( float* output, const union xnn_f16_f32acc_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(uint16_t) == 0); assert(input != NULL); @@ -39,7 +41,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u8( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(uint16_t)); assert(batch <= 7 * sizeof(uint16_t)); - const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &mask_table[7] - batch)); const __m128i vh = _mm_castps_si128(_mm_maskload_ps((const float*) i, vmask)); const __m256 vt = _mm256_cvtph_ps(vh); vacc0 = _mm256_add_ps(vacc0, vt); @@ -53,7 +55,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u8( __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scale)); float vout = _mm_cvtss_f32(vacc); *output += vout; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u16-acc2.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u16-acc2.c index 5aafe876ab59..0dccbd0a7937 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u16-acc2.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u16-acc2.c @@ -49,7 
+49,7 @@ void xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u16_acc2( const float32x4_t vt = vcvt_f32_f16(vh); vacc0 = vaddq_f32(vacc0, vt); } - const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale); + const float32x2_t vscale = vld1_dup_f32(&params->scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(uint16_t))) { const float16x4_t vh = vreinterpret_f16_u32(vld1_dup_u32((const void*) i)); i += 2; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u24-acc3.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u24-acc3.c index b0ec191cd1ca..656b4dd09c4d 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u24-acc3.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u24-acc3.c @@ -56,7 +56,7 @@ void xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u24_acc3( const float32x4_t vt = vcvt_f32_f16(vh); vacc0 = vaddq_f32(vacc0, vt); } - const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale); + const float32x2_t vscale = vld1_dup_f32(&params->scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(uint16_t))) { const float16x4_t vh = vreinterpret_f16_u32(vld1_dup_u32((const void*) i)); i += 2; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc2.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc2.c index 912ebddc7a2e..397654088882 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc2.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc2.c @@ -59,7 +59,7 @@ void xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u32_acc2( const float32x4_t vt = vcvt_f32_f16(vh); vacc0 = vaddq_f32(vacc0, vt); } - const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale); + const float32x2_t vscale = vld1_dup_f32(&params->scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(uint16_t))) { const float16x4_t vh = 
vreinterpret_f16_u32(vld1_dup_u32((const void*) i)); i += 2; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc4.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc4.c index 43f2b4d6ef53..360680c65069 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc4.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc4.c @@ -63,7 +63,7 @@ void xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u32_acc4( const float32x4_t vt = vcvt_f32_f16(vh); vacc0 = vaddq_f32(vacc0, vt); } - const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale); + const float32x2_t vscale = vld1_dup_f32(&params->scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(uint16_t))) { const float16x4_t vh = vreinterpret_f16_u32(vld1_dup_u32((const void*) i)); i += 2; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u4.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u4.c index e9c37410602a..3c64ebb34dc3 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u4.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u4.c @@ -33,7 +33,7 @@ void xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u4( const float32x4_t vt = vcvt_f32_f16(vh); vacc0 = vaddq_f32(vacc0, vt); } - const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale); + const float32x2_t vscale = vld1_dup_f32(&params->scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(uint16_t))) { const float16x4_t vh = vreinterpret_f16_u32(vld1_dup_u32((const void*) i)); i += 2; diff --git a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u8.c b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u8.c index c22fd58600f7..b2ea458c7c1b 100644 --- a/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u8.c +++ b/src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u8.c @@ -33,7 +33,7 @@ void 
xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u8( const float32x4_t vt = vcvt_f32_f16(vh); vacc0 = vaddq_f32(vacc0, vt); } - const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale); + const float32x2_t vscale = vld1_dup_f32(&params->scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(uint16_t))) { const float16x4_t vh = vreinterpret_f16_u32(vld1_dup_u32((const void*) i)); i += 2; diff --git a/src/f16-f32acc-rsum/neonfp16arith.c.in b/src/f16-f32acc-rsum/neonfp16arith.c.in index 5acde06b0d0b..e40b6a427f70 100644 --- a/src/f16-f32acc-rsum/neonfp16arith.c.in +++ b/src/f16-f32acc-rsum/neonfp16arith.c.in @@ -55,7 +55,7 @@ void xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u${BATCH_TILE}${ACC_SUFFIX}( const float32x4_t vt = vcvt_f32_f16(vh); vacc0 = vaddq_f32(vacc0, vt); } - const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale); + const float32x2_t vscale = vld1_dup_f32(&params->scale); float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0)); if XNN_UNLIKELY(batch & (2 * sizeof(uint16_t))) { const float16x4_t vh = vreinterpret_f16_u32(vld1_dup_u32((const void*) i)); i += 2; diff --git a/src/f16-rsum/avx512fp16.c.in b/src/f16-rsum/avx512fp16.c.in index 921cb1034746..b74fc6870716 100644 --- a/src/f16-rsum/avx512fp16.c.in +++ b/src/f16-rsum/avx512fp16.c.in @@ -74,7 +74,7 @@ void xnn_f16_rsum_ukernel__avx512fp16_u${BATCH_TILE}${ACC_SUFFIX}( vacc = _mm_add_ph(vacc, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vacc)))); vacc = _mm_add_sh(vacc, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vacc), 16))); - const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->fp16arith.scale)); + const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->scale)); vacc = _mm_mul_sh(vacc, vscale); *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vacc), 0); diff --git a/src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc2.c b/src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc2.c index 
cc40146d4fec..fe3f04b73f10 100644 --- a/src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc2.c +++ b/src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc2.c @@ -68,7 +68,7 @@ void xnn_f16_rsum_ukernel__avx512fp16_u128_acc2( vacc = _mm_add_ph(vacc, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vacc)))); vacc = _mm_add_sh(vacc, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vacc), 16))); - const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->fp16arith.scale)); + const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->scale)); vacc = _mm_mul_sh(vacc, vscale); *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vacc), 0); diff --git a/src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc4.c b/src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc4.c index fda575207364..7ac809ecc143 100644 --- a/src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc4.c +++ b/src/f16-rsum/gen/f16-rsum-avx512fp16-u128-acc4.c @@ -72,7 +72,7 @@ void xnn_f16_rsum_ukernel__avx512fp16_u128_acc4( vacc = _mm_add_ph(vacc, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vacc)))); vacc = _mm_add_sh(vacc, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vacc), 16))); - const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->fp16arith.scale)); + const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->scale)); vacc = _mm_mul_sh(vacc, vscale); *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vacc), 0); diff --git a/src/f16-rsum/gen/f16-rsum-avx512fp16-u32.c b/src/f16-rsum/gen/f16-rsum-avx512fp16-u32.c index c885f9c56252..c02d31019f21 100644 --- a/src/f16-rsum/gen/f16-rsum-avx512fp16-u32.c +++ b/src/f16-rsum/gen/f16-rsum-avx512fp16-u32.c @@ -54,7 +54,7 @@ void xnn_f16_rsum_ukernel__avx512fp16_u32( vacc = _mm_add_ph(vacc, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vacc)))); vacc = _mm_add_sh(vacc, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vacc), 16))); - const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->fp16arith.scale)); + const __m128h vscale = 
_mm_castsi128_ph(_mm_set1_epi16(params->scale)); vacc = _mm_mul_sh(vacc, vscale); *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vacc), 0); diff --git a/src/f16-rsum/gen/f16-rsum-avx512fp16-u64-acc2.c b/src/f16-rsum/gen/f16-rsum-avx512fp16-u64-acc2.c index 5e91bc4a92f7..084dc70258a7 100644 --- a/src/f16-rsum/gen/f16-rsum-avx512fp16-u64-acc2.c +++ b/src/f16-rsum/gen/f16-rsum-avx512fp16-u64-acc2.c @@ -64,7 +64,7 @@ void xnn_f16_rsum_ukernel__avx512fp16_u64_acc2( vacc = _mm_add_ph(vacc, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vacc)))); vacc = _mm_add_sh(vacc, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vacc), 16))); - const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->fp16arith.scale)); + const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->scale)); vacc = _mm_mul_sh(vacc, vscale); *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vacc), 0); diff --git a/src/f16-rsum/gen/f16-rsum-avx512fp16-u96-acc3.c b/src/f16-rsum/gen/f16-rsum-avx512fp16-u96-acc3.c index c73415a6a33c..edebc706fd3c 100644 --- a/src/f16-rsum/gen/f16-rsum-avx512fp16-u96-acc3.c +++ b/src/f16-rsum/gen/f16-rsum-avx512fp16-u96-acc3.c @@ -68,7 +68,7 @@ void xnn_f16_rsum_ukernel__avx512fp16_u96_acc3( vacc = _mm_add_ph(vacc, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vacc)))); vacc = _mm_add_sh(vacc, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vacc), 16))); - const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->fp16arith.scale)); + const __m128h vscale = _mm_castsi128_ph(_mm_set1_epi16(params->scale)); vacc = _mm_mul_sh(vacc, vscale); *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vacc), 0); diff --git a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u16-acc2.c b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u16-acc2.c index 63fe4a6fd045..a94df892a8ae 100644 --- a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u16-acc2.c +++ b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u16-acc2.c @@ -42,7 +42,7 @@ void 
xnn_f16_rsum_ukernel__neonfp16arith_u16_acc2( const float16x8_t vt = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; vacc0 = vaddq_f16(vacc0, vt); } - const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->fp16arith.scale)); + const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->scale)); float16x4_t vacc = vadd_f16(vget_low_f16(vacc0), vget_high_f16(vacc0)); if XNN_UNLIKELY(batch & (4 * sizeof(uint16_t))) { const float16x4_t vt = vreinterpret_f16_u16(vld1_u16(i)); i += 4; diff --git a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u24-acc3.c b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u24-acc3.c index cce907d639fd..fdd1640904e7 100644 --- a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u24-acc3.c +++ b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u24-acc3.c @@ -46,7 +46,7 @@ void xnn_f16_rsum_ukernel__neonfp16arith_u24_acc3( const float16x8_t vt = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; vacc0 = vaddq_f16(vacc0, vt); } - const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->fp16arith.scale)); + const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->scale)); float16x4_t vacc = vadd_f16(vget_low_f16(vacc0), vget_high_f16(vacc0)); if XNN_UNLIKELY(batch & (4 * sizeof(uint16_t))) { const float16x4_t vt = vreinterpret_f16_u16(vld1_u16(i)); i += 4; diff --git a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u32-acc2.c b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u32-acc2.c index 2ff4e11bd8f6..f87f9d505403 100644 --- a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u32-acc2.c +++ b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u32-acc2.c @@ -46,7 +46,7 @@ void xnn_f16_rsum_ukernel__neonfp16arith_u32_acc2( const float16x8_t vt = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; vacc0 = vaddq_f16(vacc0, vt); } - const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->fp16arith.scale)); + const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->scale)); float16x4_t vacc = vadd_f16(vget_low_f16(vacc0), vget_high_f16(vacc0)); if XNN_UNLIKELY(batch & (4 * 
sizeof(uint16_t))) { const float16x4_t vt = vreinterpret_f16_u16(vld1_u16(i)); i += 4; diff --git a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u32-acc4.c b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u32-acc4.c index cf4540c3df68..beefc30d9dbe 100644 --- a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u32-acc4.c +++ b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u32-acc4.c @@ -50,7 +50,7 @@ void xnn_f16_rsum_ukernel__neonfp16arith_u32_acc4( const float16x8_t vt = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; vacc0 = vaddq_f16(vacc0, vt); } - const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->fp16arith.scale)); + const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->scale)); float16x4_t vacc = vadd_f16(vget_low_f16(vacc0), vget_high_f16(vacc0)); if XNN_UNLIKELY(batch & (4 * sizeof(uint16_t))) { const float16x4_t vt = vreinterpret_f16_u16(vld1_u16(i)); i += 4; diff --git a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u8.c b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u8.c index ef331a020ac9..8c74bbd75c67 100644 --- a/src/f16-rsum/gen/f16-rsum-neonfp16arith-u8.c +++ b/src/f16-rsum/gen/f16-rsum-neonfp16arith-u8.c @@ -33,7 +33,7 @@ void xnn_f16_rsum_ukernel__neonfp16arith_u8( const float16x8_t vt = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; vacc0 = vaddq_f16(vacc0, vt); } - const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->fp16arith.scale)); + const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->scale)); float16x4_t vacc = vadd_f16(vget_low_f16(vacc0), vget_high_f16(vacc0)); if XNN_UNLIKELY(batch & (4 * sizeof(uint16_t))) { const float16x4_t vt = vreinterpret_f16_u16(vld1_u16(i)); i += 4; diff --git a/src/f16-rsum/neonfp16arith.c.in b/src/f16-rsum/neonfp16arith.c.in index 6a550b0dc12c..fd51d3ca808d 100644 --- a/src/f16-rsum/neonfp16arith.c.in +++ b/src/f16-rsum/neonfp16arith.c.in @@ -51,7 +51,7 @@ void xnn_f16_rsum_ukernel__neonfp16arith_u${BATCH_TILE}${ACC_SUFFIX}( const float16x8_t vt = vreinterpretq_f16_u16(vld1q_u16(i)); i += 8; vacc0 = 
vaddq_f16(vacc0, vt); } - const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->fp16arith.scale)); + const float16x4_t vscale = vreinterpret_f16_u16(vld1_dup_u16(&params->scale)); float16x4_t vacc = vadd_f16(vget_low_f16(vacc0), vget_high_f16(vacc0)); if XNN_UNLIKELY(batch & (4 * sizeof(uint16_t))) { const float16x4_t vt = vreinterpret_f16_u16(vld1_u16(i)); i += 4; diff --git a/src/f32-rdsum/avx.c.in b/src/f32-rdsum/avx.c.in index 37bd536d02dd..89b3f13db2b4 100644 --- a/src/f32-rdsum/avx.c.in +++ b/src/f32-rdsum/avx.c.in @@ -23,14 +23,16 @@ void xnn_f32_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__avx_c${CHANNELS}( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(rows != 0); assert(channels != 0); assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); - const __m256 vmin = _mm256_set1_ps(params->avx.min); - const __m256 vmax = _mm256_set1_ps(params->avx.max); + const __m256 vscale = _mm256_set1_ps(params->scalar.scale); + const __m256 vmin = _mm256_set1_ps(params->scalar.min); + const __m256 vmax = _mm256_set1_ps(params->scalar.max); size_t input_increment = ${ACCUMULATORS} * input_stride; for (; channels >= ${CHANNELS}; channels -= ${CHANNELS}) { @@ -101,7 +103,7 @@ void xnn_f32_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__avx_c${CHANNELS}( } if (remainder) { - vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - (channels & 0x7) * sizeof(float))); + vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - (channels & 0x7) * sizeof(float))); $for c in range(ACCUMULATORS): vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i${c}[num_full_chunks*8], vmask), vacc[num_full_chunks]); } diff --git a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c index 
c9ee367eb4a4..4cb3f495e498 100644 --- a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c +++ b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c @@ -25,14 +25,16 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c16( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(rows != 0); assert(channels != 0); assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); - const __m256 vmin = _mm256_set1_ps(params->avx.min); - const __m256 vmax = _mm256_set1_ps(params->avx.max); + const __m256 vscale = _mm256_set1_ps(params->scalar.scale); + const __m256 vmin = _mm256_set1_ps(params->scalar.min); + const __m256 vmax = _mm256_set1_ps(params->scalar.max); size_t input_increment = 7 * input_stride; for (; channels >= 16; channels -= 16) { @@ -168,7 +170,7 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c16( } if (remainder) { - vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - (channels & 0x7) * sizeof(float))); + vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - (channels & 0x7) * sizeof(float))); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i0[num_full_chunks*8], vmask), vacc[num_full_chunks]); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i1[num_full_chunks*8], vmask), vacc[num_full_chunks]); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i2[num_full_chunks*8], vmask), vacc[num_full_chunks]); diff --git a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c32.c b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c32.c index 183fd8e3ecb3..0d7ca0f413d7 100644 --- a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c32.c +++ b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c32.c @@ -25,14 +25,16 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c32( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static 
const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(rows != 0); assert(channels != 0); assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); - const __m256 vmin = _mm256_set1_ps(params->avx.min); - const __m256 vmax = _mm256_set1_ps(params->avx.max); + const __m256 vscale = _mm256_set1_ps(params->scalar.scale); + const __m256 vmin = _mm256_set1_ps(params->scalar.min); + const __m256 vmax = _mm256_set1_ps(params->scalar.max); size_t input_increment = 7 * input_stride; for (; channels >= 32; channels -= 32) { @@ -214,7 +216,7 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c32( } if (remainder) { - vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - (channels & 0x7) * sizeof(float))); + vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - (channels & 0x7) * sizeof(float))); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i0[num_full_chunks*8], vmask), vacc[num_full_chunks]); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i1[num_full_chunks*8], vmask), vacc[num_full_chunks]); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i2[num_full_chunks*8], vmask), vacc[num_full_chunks]); diff --git a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c index 6d1b3c66a551..506803acbc1d 100644 --- a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c +++ b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c @@ -25,14 +25,16 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c64( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(rows != 0); assert(channels != 0); assert(input != NULL); assert(output != NULL); - const __m256 vscale = _mm256_set1_ps(params->avx.scale); - const __m256 vmin = _mm256_set1_ps(params->avx.min); - const __m256 vmax = 
_mm256_set1_ps(params->avx.max); + const __m256 vscale = _mm256_set1_ps(params->scalar.scale); + const __m256 vmin = _mm256_set1_ps(params->scalar.min); + const __m256 vmax = _mm256_set1_ps(params->scalar.max); size_t input_increment = 7 * input_stride; for (; channels >= 64; channels -= 64) { @@ -306,7 +308,7 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c64( } if (remainder) { - vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - (channels & 0x7) * sizeof(float))); + vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - (channels & 0x7) * sizeof(float))); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i0[num_full_chunks*8], vmask), vacc[num_full_chunks]); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i1[num_full_chunks*8], vmask), vacc[num_full_chunks]); vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i2[num_full_chunks*8], vmask), vacc[num_full_chunks]); diff --git a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c16.c b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c16.c index a9db2d14e800..d019dec4ad89 100644 --- a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c16.c +++ b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c16.c @@ -30,9 +30,9 @@ void xnn_f32_rdsum_ukernel_7p7x__sse_c16( assert(input != NULL); assert(output != NULL); - const __m128 vscale = _mm_load_ps(params->sse.scale); - const __m128 vmin = _mm_load_ps(params->sse.min); - const __m128 vmax = _mm_load_ps(params->sse.max); + const __m128 vscale = _mm_set1_ps(params->scalar.scale); + const __m128 vmin = _mm_set1_ps(params->scalar.min); + const __m128 vmax = _mm_set1_ps(params->scalar.max); size_t input_increment = 7 * input_stride; for (; channels >= 16; channels -= 16) { diff --git a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c index 2a008764833c..814e6ca5aef0 100644 --- a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c +++ 
b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c @@ -30,9 +30,9 @@ void xnn_f32_rdsum_ukernel_7p7x__sse_c32( assert(input != NULL); assert(output != NULL); - const __m128 vscale = _mm_load_ps(params->sse.scale); - const __m128 vmin = _mm_load_ps(params->sse.min); - const __m128 vmax = _mm_load_ps(params->sse.max); + const __m128 vscale = _mm_set1_ps(params->scalar.scale); + const __m128 vmin = _mm_set1_ps(params->scalar.min); + const __m128 vmax = _mm_set1_ps(params->scalar.max); size_t input_increment = 7 * input_stride; for (; channels >= 32; channels -= 32) { diff --git a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c index c407e363cbaf..5091ea7d74cc 100644 --- a/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c +++ b/src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c @@ -30,9 +30,9 @@ void xnn_f32_rdsum_ukernel_7p7x__sse_c64( assert(input != NULL); assert(output != NULL); - const __m128 vscale = _mm_load_ps(params->sse.scale); - const __m128 vmin = _mm_load_ps(params->sse.min); - const __m128 vmax = _mm_load_ps(params->sse.max); + const __m128 vscale = _mm_set1_ps(params->scalar.scale); + const __m128 vmin = _mm_set1_ps(params->scalar.min); + const __m128 vmax = _mm_set1_ps(params->scalar.max); size_t input_increment = 7 * input_stride; for (; channels >= 64; channels -= 64) { diff --git a/src/f32-rdsum/sse.c.in b/src/f32-rdsum/sse.c.in index f4addfed9c58..453c4fdbb8d3 100644 --- a/src/f32-rdsum/sse.c.in +++ b/src/f32-rdsum/sse.c.in @@ -28,9 +28,9 @@ void xnn_f32_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__sse_c${CHANNELS}( assert(input != NULL); assert(output != NULL); - const __m128 vscale = _mm_load_ps(params->sse.scale); - const __m128 vmin = _mm_load_ps(params->sse.min); - const __m128 vmax = _mm_load_ps(params->sse.max); + const __m128 vscale = _mm_set1_ps(params->scalar.scale); + const __m128 vmin = _mm_set1_ps(params->scalar.min); + const __m128 vmax = _mm_set1_ps(params->scalar.max); size_t 
input_increment = ${ACCUMULATORS} * input_stride; for (; channels >= ${CHANNELS}; channels -= ${CHANNELS}) { diff --git a/src/f32-rsum/avx.c.in b/src/f32-rsum/avx.c.in index 78725b72102b..c79f3ee74766 100644 --- a/src/f32-rsum/avx.c.in +++ b/src/f32-rsum/avx.c.in @@ -23,6 +23,8 @@ void xnn_f32_rsum_ukernel__avx_u${BATCH_TILE}${ACC_SUFFIX}( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -56,15 +58,15 @@ void xnn_f32_rsum_ukernel__avx_u${BATCH_TILE}${ACC_SUFFIX}( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vt = _mm256_maskload_ps(input, vmask); vacc0 = _mm256_add_ps(vacc0, vt); } __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale)); - vacc = _mm_max_ss(vacc, _mm_load_ss(&params->avx.min)); - vacc = _mm_min_ss(vacc, _mm_load_ss(&params->avx.max)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scalar.scale)); + vacc = _mm_max_ss(vacc, _mm_load_ss(&params->scalar.min)); + vacc = _mm_min_ss(vacc, _mm_load_ss(&params->scalar.max)); *output += _mm_cvtss_f32(vacc); } diff --git a/src/f32-rsum/gen/f32-rsum-avx-u16-acc2.c b/src/f32-rsum/gen/f32-rsum-avx-u16-acc2.c index af4e114abf16..1ee7e06e827d 100644 --- a/src/f32-rsum/gen/f32-rsum-avx-u16-acc2.c +++ b/src/f32-rsum/gen/f32-rsum-avx-u16-acc2.c @@ -21,6 +21,8 @@ void xnn_f32_rsum_ukernel__avx_u16_acc2( float* output, const union xnn_f32_scaleminmax_params 
params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -46,15 +48,15 @@ void xnn_f32_rsum_ukernel__avx_u16_acc2( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vt = _mm256_maskload_ps(input, vmask); vacc0 = _mm256_add_ps(vacc0, vt); } __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale)); - vacc = _mm_max_ss(vacc, _mm_load_ss(&params->avx.min)); - vacc = _mm_min_ss(vacc, _mm_load_ss(&params->avx.max)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scalar.scale)); + vacc = _mm_max_ss(vacc, _mm_load_ss(&params->scalar.min)); + vacc = _mm_min_ss(vacc, _mm_load_ss(&params->scalar.max)); *output += _mm_cvtss_f32(vacc); } diff --git a/src/f32-rsum/gen/f32-rsum-avx-u24-acc3.c b/src/f32-rsum/gen/f32-rsum-avx-u24-acc3.c index d16599a00fb3..340ad23cccc8 100644 --- a/src/f32-rsum/gen/f32-rsum-avx-u24-acc3.c +++ b/src/f32-rsum/gen/f32-rsum-avx-u24-acc3.c @@ -21,6 +21,8 @@ void xnn_f32_rsum_ukernel__avx_u24_acc3( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -50,15 +52,15 @@ void xnn_f32_rsum_ukernel__avx_u24_acc3( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const 
__m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vt = _mm256_maskload_ps(input, vmask); vacc0 = _mm256_add_ps(vacc0, vt); } __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale)); - vacc = _mm_max_ss(vacc, _mm_load_ss(&params->avx.min)); - vacc = _mm_min_ss(vacc, _mm_load_ss(&params->avx.max)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scalar.scale)); + vacc = _mm_max_ss(vacc, _mm_load_ss(&params->scalar.min)); + vacc = _mm_min_ss(vacc, _mm_load_ss(&params->scalar.max)); *output += _mm_cvtss_f32(vacc); } diff --git a/src/f32-rsum/gen/f32-rsum-avx-u32-acc2.c b/src/f32-rsum/gen/f32-rsum-avx-u32-acc2.c index 9a4bca86d765..27882dc24ea4 100644 --- a/src/f32-rsum/gen/f32-rsum-avx-u32-acc2.c +++ b/src/f32-rsum/gen/f32-rsum-avx-u32-acc2.c @@ -21,6 +21,8 @@ void xnn_f32_rsum_ukernel__avx_u32_acc2( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -50,15 +52,15 @@ void xnn_f32_rsum_ukernel__avx_u32_acc2( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vt = _mm256_maskload_ps(input, vmask); vacc0 = _mm256_add_ps(vacc0, vt); } __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, 
 _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale)); - vacc = _mm_max_ss(vacc, _mm_load_ss(&params->avx.min)); - vacc = _mm_min_ss(vacc, _mm_load_ss(&params->avx.max)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scalar.scale)); + vacc = _mm_max_ss(vacc, _mm_load_ss(&params->scalar.min)); + vacc = _mm_min_ss(vacc, _mm_load_ss(&params->scalar.max)); *output += _mm_cvtss_f32(vacc); } diff --git a/src/f32-rsum/gen/f32-rsum-avx-u32-acc4.c b/src/f32-rsum/gen/f32-rsum-avx-u32-acc4.c index 0ea1cffb4da2..64552404ef44 100644 --- a/src/f32-rsum/gen/f32-rsum-avx-u32-acc4.c +++ b/src/f32-rsum/gen/f32-rsum-avx-u32-acc4.c @@ -21,6 +21,8 @@ void xnn_f32_rsum_ukernel__avx_u32_acc4( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -54,15 +56,15 @@ void xnn_f32_rsum_ukernel__avx_u32_acc4( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vt = _mm256_maskload_ps(input, vmask); vacc0 = _mm256_add_ps(vacc0, vt); } __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale)); - vacc = _mm_max_ss(vacc, _mm_load_ss(&params->avx.min)); - vacc = _mm_min_ss(vacc, _mm_load_ss(&params->avx.max)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scalar.scale)); + vacc = _mm_max_ss(vacc, _mm_load_ss(&params->scalar.min)); + vacc = _mm_min_ss(vacc, _mm_load_ss(&params->scalar.max)); *output += _mm_cvtss_f32(vacc); } diff --git 
a/src/f32-rsum/gen/f32-rsum-avx-u8.c b/src/f32-rsum/gen/f32-rsum-avx-u8.c index fb4bb1ca4bc5..cd206472cb0c 100644 --- a/src/f32-rsum/gen/f32-rsum-avx-u8.c +++ b/src/f32-rsum/gen/f32-rsum-avx-u8.c @@ -21,6 +21,8 @@ void xnn_f32_rsum_ukernel__avx_u8( float* output, const union xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) { + static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0}; + assert(batch != 0); assert(batch % sizeof(float) == 0); assert(input != NULL); @@ -36,15 +38,15 @@ void xnn_f32_rsum_ukernel__avx_u8( if XNN_UNLIKELY(batch != 0) { assert(batch >= 1 * sizeof(float)); assert(batch <= 7 * sizeof(float)); - const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch)); + const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch)); const __m256 vt = _mm256_maskload_ps(input, vmask); vacc0 = _mm256_add_ps(vacc0, vt); } __m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1)); vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc)); vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc)); - vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale)); - vacc = _mm_max_ss(vacc, _mm_load_ss(&params->avx.min)); - vacc = _mm_min_ss(vacc, _mm_load_ss(&params->avx.max)); + vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scalar.scale)); + vacc = _mm_max_ss(vacc, _mm_load_ss(&params->scalar.min)); + vacc = _mm_min_ss(vacc, _mm_load_ss(&params->scalar.max)); *output += _mm_cvtss_f32(vacc); } diff --git a/src/f32-rsum/gen/f32-rsum-hvx-u128-acc2.c b/src/f32-rsum/gen/f32-rsum-hvx-u128-acc2.c index 6da15b47a9b8..8a682116f1e1 100644 --- a/src/f32-rsum/gen/f32-rsum-hvx-u128-acc2.c +++ b/src/f32-rsum/gen/f32-rsum-hvx-u128-acc2.c @@ -65,7 +65,7 @@ void xnn_f32_rsum_ukernel__hvx_u128_acc2( float result = *((float*) &vacc0); - const float vscale = params->scalar.scale; + const float vscale = params->scale; result = result * vscale; result = math_max_f32(result, 
params->scalar.min); result = math_min_f32(result, params->scalar.max); diff --git a/src/f32-rsum/gen/f32-rsum-hvx-u128-acc4.c b/src/f32-rsum/gen/f32-rsum-hvx-u128-acc4.c index 8a1064d42d5a..e10998156185 100644 --- a/src/f32-rsum/gen/f32-rsum-hvx-u128-acc4.c +++ b/src/f32-rsum/gen/f32-rsum-hvx-u128-acc4.c @@ -69,7 +69,7 @@ void xnn_f32_rsum_ukernel__hvx_u128_acc4( float result = *((float*) &vacc0); - const float vscale = params->scalar.scale; + const float vscale = params->scale; result = result * vscale; result = math_max_f32(result, params->scalar.min); result = math_min_f32(result, params->scalar.max); diff --git a/src/f32-rsum/gen/f32-rsum-hvx-u32.c b/src/f32-rsum/gen/f32-rsum-hvx-u32.c index f2474abee169..af97a4465049 100644 --- a/src/f32-rsum/gen/f32-rsum-hvx-u32.c +++ b/src/f32-rsum/gen/f32-rsum-hvx-u32.c @@ -51,7 +51,7 @@ void xnn_f32_rsum_ukernel__hvx_u32( float result = *((float*) &vacc0); - const float vscale = params->scalar.scale; + const float vscale = params->scale; result = result * vscale; result = math_max_f32(result, params->scalar.min); result = math_min_f32(result, params->scalar.max); diff --git a/src/f32-rsum/gen/f32-rsum-hvx-u64-acc2.c b/src/f32-rsum/gen/f32-rsum-hvx-u64-acc2.c index e170befa87ef..7c28ec6f3f82 100644 --- a/src/f32-rsum/gen/f32-rsum-hvx-u64-acc2.c +++ b/src/f32-rsum/gen/f32-rsum-hvx-u64-acc2.c @@ -61,7 +61,7 @@ void xnn_f32_rsum_ukernel__hvx_u64_acc2( float result = *((float*) &vacc0); - const float vscale = params->scalar.scale; + const float vscale = params->scale; result = result * vscale; result = math_max_f32(result, params->scalar.min); result = math_min_f32(result, params->scalar.max); diff --git a/src/f32-rsum/gen/f32-rsum-hvx-u96-acc3.c b/src/f32-rsum/gen/f32-rsum-hvx-u96-acc3.c index 78abbe0abc42..64b76ee5aa53 100644 --- a/src/f32-rsum/gen/f32-rsum-hvx-u96-acc3.c +++ b/src/f32-rsum/gen/f32-rsum-hvx-u96-acc3.c @@ -65,7 +65,7 @@ void xnn_f32_rsum_ukernel__hvx_u96_acc3( float result = *((float*) &vacc0); - const float 
vscale = params->scalar.scale; + const float vscale = params->scale; result = result * vscale; result = math_max_f32(result, params->scalar.min); result = math_min_f32(result, params->scalar.max); diff --git a/src/f32-rsum/hvx.c.in b/src/f32-rsum/hvx.c.in index b81446243588..6012d505635e 100644 --- a/src/f32-rsum/hvx.c.in +++ b/src/f32-rsum/hvx.c.in @@ -70,7 +70,7 @@ void xnn_f32_rsum_ukernel__hvx_u${BATCH_TILE}${ACC_SUFFIX}( float result = *((float*) &vacc0); - const float vscale = params->scalar.scale; + const float vscale = params->scale; result = result * vscale; result = math_max_f32(result, params->scalar.min); result = math_min_f32(result, params->scalar.max); diff --git a/src/microparams-init.c b/src/microparams-init.c index 3115092c717f..3177e89e8db2 100644 --- a/src/microparams-init.c +++ b/src/microparams-init.c @@ -1915,70 +1915,26 @@ size_t xnn_init_f16_scale_fp16arith_params( union xnn_f16_scale_params params[XNN_MIN_ELEMENTS(1)], uint16_t scale) { - params->fp16arith.scale = scale; - return sizeof(params->fp16arith); + params->scale = scale; + return sizeof(params[0]); } size_t xnn_init_f16_f32acc_scale_scalar_params( union xnn_f16_f32acc_scale_params params[XNN_MIN_ELEMENTS(1)], float scale) { - params->scalar.scale = scale; - return sizeof(params->scalar); -} - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f16_f32acc_scale_avx_params( - union xnn_f16_f32acc_scale_params params[XNN_MIN_ELEMENTS(1)], - float scale) -{ - for (uint32_t i = 0; i < 7; i++) { - params->avx.mask_table[i] = -1; - } - for (uint32_t i = 7; i < 14; i++) { - params->avx.mask_table[i] = 0; - } - params->avx.scale = scale; - return sizeof(params->avx); + params->scale = scale; + return sizeof(params[0]); } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 size_t xnn_init_f32_scale_scalar_params( union xnn_f32_scale_params params[XNN_MIN_ELEMENTS(1)], float scale) { - params->scalar.scale = scale; - return sizeof(params->scalar); + params->scale = scale; + return 
sizeof(params[0]); } -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f32_scale_sse_params( - union xnn_f32_scale_params params[XNN_MIN_ELEMENTS(1)], - float scale) -{ - for (uint32_t i = 0; i < 4; i++) { - params->sse.scale[i] = scale; - } - return sizeof(params->sse); -} -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f32_scale_avx_params( - union xnn_f32_scale_params params[XNN_MIN_ELEMENTS(1)], - float scale) -{ - for (uint32_t i = 0; i < 7; i++) { - params->avx.mask_table[i] = -1; - } - for (uint32_t i = 7; i < 14; i++) { - params->avx.mask_table[i] = 0; - } - params->avx.scale = scale; - return sizeof(params->avx); -} -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - #if XNN_ARCH_X86 || XNN_ARCH_X86_64 size_t xnn_init_f32_scaleminmax_avx_params( union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)], diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index 09f9aedeeb29..d1007a2df024 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -287,9 +287,6 @@ DECLARE_UPDATE_QU8_AVGPOOL_PARAMS_FUNCTION(xnn_update_qu8_avgpool_minmax_fp32_sc float scale); DECLARE_INIT_F16_F32ACC_SCALE_PARAMS_FUNCTION(xnn_init_f16_f32acc_scale_scalar_params) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_F16_F32ACC_SCALE_PARAMS_FUNCTION(xnn_init_f16_f32acc_scale_avx_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #define DECLARE_INIT_F32_SCALE_PARAMS_FUNCTION(fn_name) \ @@ -298,10 +295,6 @@ DECLARE_INIT_F16_F32ACC_SCALE_PARAMS_FUNCTION(xnn_init_f16_f32acc_scale_scalar_p float scale); DECLARE_INIT_F32_SCALE_PARAMS_FUNCTION(xnn_init_f32_scale_scalar_params) -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_F32_SCALE_PARAMS_FUNCTION(xnn_init_f32_scale_sse_params) - DECLARE_INIT_F32_SCALE_PARAMS_FUNCTION(xnn_init_f32_scale_avx_params) -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 #define DECLARE_INIT_F16_SCALEMINMAX_PARAMS_FUNCTION(fn_name) \ diff --git 
a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index cddc76f8da7c..241e6b0db7ea 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -45,37 +45,21 @@ union xnn_f32_relu_params { // Scale: used by RSUM microkernels union xnn_f16_scale_params { - char _; // Dummy member variable to comply with the C standard struct { uint16_t scale; - } fp16arith; + }; }; union xnn_f16_f32acc_scale_params { struct { float scale; - } scalar; -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - struct { - int16_t mask_table[14]; - float scale; - } avx; -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + }; }; union xnn_f32_scale_params { struct { float scale; - } scalar; -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - struct { - XNN_ALIGN(16) float scale[4]; - } sse; - struct { - int32_t mask_table[14]; - float scale; - } avx; -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 + }; }; diff --git a/test/f16-f32acc-rdsum.cc b/test/f16-f32acc-rdsum.cc index ae7b0f40a350..92209944cd3b 100644 --- a/test/f16-f32acc-rdsum.cc +++ b/test/f16-f32acc-rdsum.cc @@ -661,7 +661,7 @@ RDSumMicrokernelTester() .rows(14) .channels(16) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RDSUM_7P7X__F16C_C16, channels_eq_16_2pass_fulltile_with_input_stride) { @@ -670,7 +670,7 @@ .rows(14) .channels(16) .input_stride(19) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RDSUM_7P7X__F16C_C16, channels_eq_16_2pass_subtile) { @@ -679,7 +679,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(16) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -690,7 +690,7 @@ .rows(rows) 
.channels(16) .input_stride(19) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -700,7 +700,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(16) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -711,7 +711,7 @@ .rows(rows) .channels(16) .input_stride(19) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -721,7 +721,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -732,7 +732,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -744,7 +744,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -757,7 +757,7 @@ .rows(rows) .channels(channels) .input_stride(263) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -768,7 +768,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + 
.Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -779,7 +779,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -791,7 +791,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -804,7 +804,7 @@ .rows(rows) .channels(channels) .input_stride(19) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -815,7 +815,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -826,7 +826,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -838,7 +838,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -851,7 +851,7 @@ .rows(rows) .channels(channels) .input_stride(47) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -862,7 +862,7 @@ 
RDSumMicrokernelTester() .rows(264) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16, xnn_init_f16_f32acc_scale_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -874,7 +874,7 @@ RDSumMicrokernelTester() .rows(14) .channels(32) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RDSUM_7P7X__F16C_C32, channels_eq_32_2pass_fulltile_with_input_stride) { @@ -883,7 +883,7 @@ .rows(14) .channels(32) .input_stride(37) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RDSUM_7P7X__F16C_C32, channels_eq_32_2pass_subtile) { @@ -892,7 +892,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(32) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -903,7 +903,7 @@ .rows(rows) .channels(32) .input_stride(37) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -913,7 +913,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(32) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -924,7 +924,7 @@ .rows(rows) .channels(32) .input_stride(37) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } @@ 
-934,7 +934,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -945,7 +945,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -957,7 +957,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -970,7 +970,7 @@ .rows(rows) .channels(channels) .input_stride(521) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -981,7 +981,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -992,7 +992,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1004,7 +1004,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1017,7 +1017,7 @@ .rows(rows) .channels(channels) .input_stride(37) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, 
xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1028,7 +1028,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1039,7 +1039,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1051,7 +1051,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1064,7 +1064,7 @@ .rows(rows) .channels(channels) .input_stride(79) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1075,7 +1075,7 @@ RDSumMicrokernelTester() .rows(264) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32, xnn_init_f16_f32acc_scale_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1087,7 +1087,7 @@ RDSumMicrokernelTester() .rows(14) .channels(64) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RDSUM_7P7X__F16C_C64, channels_eq_64_2pass_fulltile_with_input_stride) { @@ -1096,7 +1096,7 @@ .rows(14) .channels(64) .input_stride(67) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, 
xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RDSUM_7P7X__F16C_C64, channels_eq_64_2pass_subtile) { @@ -1105,7 +1105,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(64) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1116,7 +1116,7 @@ .rows(rows) .channels(64) .input_stride(67) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1126,7 +1126,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(64) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1137,7 +1137,7 @@ .rows(rows) .channels(64) .input_stride(67) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1147,7 +1147,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1158,7 +1158,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1170,7 +1170,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + 
.Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1183,7 +1183,7 @@ .rows(rows) .channels(channels) .input_stride(1031) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1194,7 +1194,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1205,7 +1205,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1217,7 +1217,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1230,7 +1230,7 @@ .rows(rows) .channels(channels) .input_stride(67) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1241,7 +1241,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1252,7 +1252,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1264,7 
+1264,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1277,7 +1277,7 @@ .rows(rows) .channels(channels) .input_stride(149) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1288,7 +1288,7 @@ RDSumMicrokernelTester() .rows(264) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64, xnn_init_f16_f32acc_scale_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1300,7 +1300,7 @@ RDSumMicrokernelTester() .rows(14) .channels(128) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RDSUM_7P7X__F16C_C128, channels_eq_128_2pass_fulltile_with_input_stride) { @@ -1309,7 +1309,7 @@ .rows(14) .channels(128) .input_stride(131) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RDSUM_7P7X__F16C_C128, channels_eq_128_2pass_subtile) { @@ -1318,7 +1318,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(128) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1329,7 +1329,7 @@ .rows(rows) .channels(128) .input_stride(131) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + 
.Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1339,7 +1339,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(128) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1350,7 +1350,7 @@ .rows(rows) .channels(128) .input_stride(131) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1360,7 +1360,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1371,7 +1371,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1383,7 +1383,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1396,7 +1396,7 @@ .rows(rows) .channels(channels) .input_stride(2053) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1407,7 +1407,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1418,7 
+1418,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1430,7 +1430,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1443,7 +1443,7 @@ .rows(rows) .channels(channels) .input_stride(131) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1454,7 +1454,7 @@ RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -1465,7 +1465,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1477,7 +1477,7 @@ RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1490,7 +1490,7 @@ .rows(rows) .channels(channels) .input_stride(269) - .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } } @@ -1501,7 +1501,7 @@ RDSumMicrokernelTester() .rows(264) .channels(channels) - 
.Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128, xnn_init_f16_f32acc_scale_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/f16-f32acc-rdsum.yaml b/test/f16-f32acc-rdsum.yaml index 574269c4a8f0..c7dbe3365fdf 100644 --- a/test/f16-f32acc-rdsum.yaml +++ b/test/f16-f32acc-rdsum.yaml @@ -13,13 +13,13 @@ # x86 AVX - name: xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16 - init: xnn_init_f16_f32acc_scale_avx_params + init: xnn_init_f16_f32acc_scale_scalar_params - name: xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32 - init: xnn_init_f16_f32acc_scale_avx_params + init: xnn_init_f16_f32acc_scale_scalar_params - name: xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64 - init: xnn_init_f16_f32acc_scale_avx_params + init: xnn_init_f16_f32acc_scale_scalar_params - name: xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128 - init: xnn_init_f16_f32acc_scale_avx_params + init: xnn_init_f16_f32acc_scale_scalar_params # x86 AVX512SKX - name: xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c16 diff --git a/test/f16-f32acc-rsum.cc b/test/f16-f32acc-rsum.cc index a32e713c514f..cbd1e76110c2 100644 --- a/test/f16-f32acc-rsum.cc +++ b/test/f16-f32acc-rsum.cc @@ -345,7 +345,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(8) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RSUM__F16C_U8, batch_div_8) { @@ -353,7 +353,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -362,7 +362,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - 
.Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -371,7 +371,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -381,7 +381,7 @@ RSumMicrokernelTester() .batch_size(9) .scale(scale) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -389,7 +389,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(1024) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u8, xnn_init_f16_f32acc_scale_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -399,7 +399,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(16) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RSUM__F16C_U16_ACC2, batch_div_16) { @@ -407,7 +407,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -416,7 +416,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } 
@@ -425,7 +425,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -435,7 +435,7 @@ RSumMicrokernelTester() .batch_size(17) .scale(scale) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -443,7 +443,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(2048) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2, xnn_init_f16_f32acc_scale_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -453,7 +453,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(24) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RSUM__F16C_U24_ACC3, batch_div_24) { @@ -461,7 +461,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -470,7 +470,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -479,7 +479,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - 
.Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -489,7 +489,7 @@ RSumMicrokernelTester() .batch_size(25) .scale(scale) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -497,7 +497,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(3072) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3, xnn_init_f16_f32acc_scale_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -507,7 +507,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RSUM__F16C_U32_ACC2, batch_div_32) { @@ -515,7 +515,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -524,7 +524,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -533,7 +533,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_avx_params); + 
.Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -543,7 +543,7 @@ RSumMicrokernelTester() .batch_size(33) .scale(scale) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -551,7 +551,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(4096) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2, xnn_init_f16_f32acc_scale_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -561,7 +561,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(32) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_scalar_params); } TEST(F16_F32ACC_RSUM__F16C_U32_ACC4, batch_div_32) { @@ -569,7 +569,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -578,7 +578,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -587,7 +587,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -597,7 +597,7 @@ 
RSumMicrokernelTester() .batch_size(33) .scale(scale) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_scalar_params); } } @@ -605,7 +605,7 @@ TEST_REQUIRES_X86_F16C; RSumMicrokernelTester() .batch_size(4096) - .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_avx_params); + .Test(xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4, xnn_init_f16_f32acc_scale_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/f16-f32acc-rsum.yaml b/test/f16-f32acc-rsum.yaml index b47bd776cdbb..2b6e7583d89b 100644 --- a/test/f16-f32acc-rsum.yaml +++ b/test/f16-f32acc-rsum.yaml @@ -19,15 +19,15 @@ # x86 F16C - name: xnn_f16_f32acc_rsum_ukernel__f16c_u8 - init: xnn_init_f16_f32acc_scale_avx_params + init: xnn_init_f16_f32acc_scale_scalar_params - name: xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2 - init: xnn_init_f16_f32acc_scale_avx_params + init: xnn_init_f16_f32acc_scale_scalar_params - name: xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3 - init: xnn_init_f16_f32acc_scale_avx_params + init: xnn_init_f16_f32acc_scale_scalar_params - name: xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2 - init: xnn_init_f16_f32acc_scale_avx_params + init: xnn_init_f16_f32acc_scale_scalar_params - name: xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4 - init: xnn_init_f16_f32acc_scale_avx_params + init: xnn_init_f16_f32acc_scale_scalar_params # x86 AVX512SKX - name: xnn_f16_f32acc_rsum_ukernel__avx512skx_u16 diff --git a/test/f32-rdsum.cc b/test/f32-rdsum.cc index d4bb4fcac0eb..625ca06ceb34 100644 --- a/test/f32-rdsum.cc +++ b/test/f32-rdsum.cc @@ -852,7 +852,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(16) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } 
TEST(F32_RDSUM_7P7X__SSE_C16, channels_eq_16_2pass_fulltile_with_input_stride) { @@ -861,7 +861,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(14) .channels(16) .input_stride(19) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__SSE_C16, channels_eq_16_2pass_subtile) { @@ -870,7 +870,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(16) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -881,7 +881,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(16) .input_stride(19) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -891,7 +891,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(16) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -902,7 +902,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(16) .input_stride(19) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -912,7 +912,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -923,7 +923,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { 
RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -935,7 +935,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -948,7 +948,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(263) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -959,7 +959,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -970,7 +970,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -982,7 +982,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -995,7 +995,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(19) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + 
.Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1006,7 +1006,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1017,7 +1017,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1029,7 +1029,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1042,7 +1042,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(47) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1053,7 +1053,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(264) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c16, xnn_init_f32_scaleminmax_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1065,7 +1065,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(32) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__SSE_C32, 
channels_eq_32_2pass_fulltile_with_input_stride) { @@ -1074,7 +1074,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(14) .channels(32) .input_stride(37) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__SSE_C32, channels_eq_32_2pass_subtile) { @@ -1083,7 +1083,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(32) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1094,7 +1094,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(32) .input_stride(37) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1104,7 +1104,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(32) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1115,7 +1115,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(32) .input_stride(37) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1125,7 +1125,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1136,7 +1136,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { 
RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1148,7 +1148,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1161,7 +1161,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(521) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1172,7 +1172,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1183,7 +1183,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1195,7 +1195,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1208,7 +1208,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(37) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + 
.Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1219,7 +1219,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1230,7 +1230,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1242,7 +1242,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1255,7 +1255,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(79) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1266,7 +1266,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(264) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c32, xnn_init_f32_scaleminmax_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1278,7 +1278,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(64) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__SSE_C64, 
channels_eq_64_2pass_fulltile_with_input_stride) { @@ -1287,7 +1287,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(14) .channels(64) .input_stride(67) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__SSE_C64, channels_eq_64_2pass_subtile) { @@ -1296,7 +1296,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(64) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1307,7 +1307,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(64) .input_stride(67) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1317,7 +1317,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(64) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1328,7 +1328,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(64) .input_stride(67) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1338,7 +1338,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1349,7 +1349,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { 
RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1361,7 +1361,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1374,7 +1374,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(1031) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1385,7 +1385,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1396,7 +1396,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1408,7 +1408,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1421,7 +1421,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(67) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + 
.Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1432,7 +1432,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1443,7 +1443,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1455,7 +1455,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1468,7 +1468,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(149) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1479,7 +1479,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(264) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_sse_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__sse_c64, xnn_init_f32_scaleminmax_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1491,7 +1491,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(16) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__AVX_C16, 
channels_eq_16_2pass_fulltile_with_input_stride) { @@ -1500,7 +1500,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(14) .channels(16) .input_stride(19) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__AVX_C16, channels_eq_16_2pass_subtile) { @@ -1509,7 +1509,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(16) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1520,7 +1520,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(16) .input_stride(19) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1530,7 +1530,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(16) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1541,7 +1541,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(16) .input_stride(19) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1551,7 +1551,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1562,7 +1562,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { 
RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1574,7 +1574,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1587,7 +1587,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(263) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1598,7 +1598,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1609,7 +1609,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1621,7 +1621,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1634,7 +1634,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(19) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + 
.Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1645,7 +1645,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1656,7 +1656,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1668,7 +1668,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1681,7 +1681,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(47) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1692,7 +1692,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(264) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c16, xnn_init_f32_scaleminmax_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1704,7 +1704,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(32) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__AVX_C32, 
channels_eq_32_2pass_fulltile_with_input_stride) { @@ -1713,7 +1713,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(14) .channels(32) .input_stride(37) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__AVX_C32, channels_eq_32_2pass_subtile) { @@ -1722,7 +1722,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(32) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1733,7 +1733,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(32) .input_stride(37) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1743,7 +1743,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(32) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1754,7 +1754,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(32) .input_stride(37) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1764,7 +1764,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1775,7 +1775,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { 
RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1787,7 +1787,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1800,7 +1800,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(521) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1811,7 +1811,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1822,7 +1822,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1834,7 +1834,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1847,7 +1847,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(37) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + 
.Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1858,7 +1858,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1869,7 +1869,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1881,7 +1881,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1894,7 +1894,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(79) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -1905,7 +1905,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(264) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c32, xnn_init_f32_scaleminmax_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -1917,7 +1917,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(64) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__AVX_C64, 
channels_eq_64_2pass_fulltile_with_input_stride) { @@ -1926,7 +1926,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(14) .channels(64) .input_stride(67) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RDSUM_7P7X__AVX_C64, channels_eq_64_2pass_subtile) { @@ -1935,7 +1935,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(64) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1946,7 +1946,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(64) .input_stride(67) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1956,7 +1956,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(64) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1967,7 +1967,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(64) .input_stride(67) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1977,7 +1977,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -1988,7 +1988,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { 
RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -2000,7 +2000,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -2013,7 +2013,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(1031) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -2024,7 +2024,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -2035,7 +2035,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -2047,7 +2047,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -2060,7 +2060,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(67) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + 
.Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -2071,7 +2071,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(14) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } @@ -2082,7 +2082,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -2094,7 +2094,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(rows) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -2107,7 +2107,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { .rows(rows) .channels(channels) .input_stride(149) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } } @@ -2118,7 +2118,7 @@ TEST(F32_RDSUM_7P7X__SCALAR_C4, overflow_accumulator) { RDSumMicrokernelTester() .rows(264) .channels(channels) - .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rdsum_ukernel_7p7x__avx_c64, xnn_init_f32_scaleminmax_scalar_params); } } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/f32-rdsum.yaml b/test/f32-rdsum.yaml index d11a4c682c75..a0664dc11d0b 100644 --- a/test/f32-rdsum.yaml +++ b/test/f32-rdsum.yaml @@ -15,18 +15,18 @@ init: xnn_init_f32_scaleminmax_scalar_params # x86 SSE - name: xnn_f32_rdsum_ukernel_7p7x__sse_c16 - init: xnn_init_f32_scaleminmax_sse_params + init: 
xnn_init_f32_scaleminmax_scalar_params - name: xnn_f32_rdsum_ukernel_7p7x__sse_c32 - init: xnn_init_f32_scaleminmax_sse_params + init: xnn_init_f32_scaleminmax_scalar_params - name: xnn_f32_rdsum_ukernel_7p7x__sse_c64 - init: xnn_init_f32_scaleminmax_sse_params + init: xnn_init_f32_scaleminmax_scalar_params # x86 AVX - name: xnn_f32_rdsum_ukernel_7p7x__avx_c16 - init: xnn_init_f32_scaleminmax_avx_params + init: xnn_init_f32_scaleminmax_scalar_params - name: xnn_f32_rdsum_ukernel_7p7x__avx_c32 - init: xnn_init_f32_scaleminmax_avx_params + init: xnn_init_f32_scaleminmax_scalar_params - name: xnn_f32_rdsum_ukernel_7p7x__avx_c64 - init: xnn_init_f32_scaleminmax_avx_params + init: xnn_init_f32_scaleminmax_scalar_params # x86 AVX512F - name: xnn_f32_rdsum_ukernel_7p7x__avx512f_c16 init: xnn_init_f32_scaleminmax_scalar_params diff --git a/test/f32-rsum.cc b/test/f32-rsum.cc index c150b9b121d2..912fbb6558f7 100644 --- a/test/f32-rsum.cc +++ b/test/f32-rsum.cc @@ -561,7 +561,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(8) - .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RSUM__AVX_U8, batch_div_8) { @@ -569,7 +569,7 @@ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_scalar_params); } } @@ -578,7 +578,7 @@ for (size_t batch_size = 1; batch_size < 8; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_scalar_params); } } @@ -587,7 +587,7 @@ for (size_t batch_size = 9; batch_size < 16; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u8, 
xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_scalar_params); } } @@ -597,7 +597,7 @@ RSumMicrokernelTester() .batch_size(9) .scale(scale) - .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_scalar_params); } } @@ -605,7 +605,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(1024) - .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u8, xnn_init_f32_scaleminmax_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -615,7 +615,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(16) - .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RSUM__AVX_U16_ACC2, batch_div_16) { @@ -623,7 +623,7 @@ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_scalar_params); } } @@ -632,7 +632,7 @@ for (size_t batch_size = 1; batch_size < 16; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_scalar_params); } } @@ -641,7 +641,7 @@ for (size_t batch_size = 17; batch_size < 32; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_scalar_params); } } @@ -651,7 +651,7 @@ RSumMicrokernelTester() .batch_size(17) .scale(scale) - .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_avx_params); + 
.Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_scalar_params); } } @@ -659,7 +659,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(2048) - .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u16_acc2, xnn_init_f32_scaleminmax_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -669,7 +669,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(24) - .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RSUM__AVX_U24_ACC3, batch_div_24) { @@ -677,7 +677,7 @@ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_scalar_params); } } @@ -686,7 +686,7 @@ for (size_t batch_size = 1; batch_size < 24; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_scalar_params); } } @@ -695,7 +695,7 @@ for (size_t batch_size = 25; batch_size < 48; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_scalar_params); } } @@ -705,7 +705,7 @@ RSumMicrokernelTester() .batch_size(25) .scale(scale) - .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_scalar_params); } } @@ -713,7 +713,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(3072) - .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_avx_params); 
+ .Test(xnn_f32_rsum_ukernel__avx_u24_acc3, xnn_init_f32_scaleminmax_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -723,7 +723,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(32) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RSUM__AVX_U32_ACC2, batch_div_32) { @@ -731,7 +731,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_scalar_params); } } @@ -740,7 +740,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_scalar_params); } } @@ -749,7 +749,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_scalar_params); } } @@ -759,7 +759,7 @@ RSumMicrokernelTester() .batch_size(33) .scale(scale) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_scalar_params); } } @@ -767,7 +767,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(4096) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc2, xnn_init_f32_scaleminmax_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 @@ -777,7 +777,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(32) - 
.Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_scalar_params); } TEST(F32_RSUM__AVX_U32_ACC4, batch_div_32) { @@ -785,7 +785,7 @@ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_scalar_params); } } @@ -794,7 +794,7 @@ for (size_t batch_size = 1; batch_size < 32; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_scalar_params); } } @@ -803,7 +803,7 @@ for (size_t batch_size = 33; batch_size < 64; batch_size++) { RSumMicrokernelTester() .batch_size(batch_size) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_scalar_params); } } @@ -813,7 +813,7 @@ RSumMicrokernelTester() .batch_size(33) .scale(scale) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_scalar_params); } } @@ -821,7 +821,7 @@ TEST_REQUIRES_X86_AVX; RSumMicrokernelTester() .batch_size(4096) - .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_avx_params); + .Test(xnn_f32_rsum_ukernel__avx_u32_acc4, xnn_init_f32_scaleminmax_scalar_params); } #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/f32-rsum.yaml b/test/f32-rsum.yaml index ded1b0cb638d..cd3bb2c6b0ac 100644 --- a/test/f32-rsum.yaml +++ b/test/f32-rsum.yaml @@ -29,15 +29,15 @@ # x86 AVX - name: xnn_f32_rsum_ukernel__avx_u8 - init: xnn_init_f32_scaleminmax_avx_params + init: xnn_init_f32_scaleminmax_scalar_params - name: 
xnn_f32_rsum_ukernel__avx_u16_acc2 - init: xnn_init_f32_scaleminmax_avx_params + init: xnn_init_f32_scaleminmax_scalar_params - name: xnn_f32_rsum_ukernel__avx_u24_acc3 - init: xnn_init_f32_scaleminmax_avx_params + init: xnn_init_f32_scaleminmax_scalar_params - name: xnn_f32_rsum_ukernel__avx_u32_acc2 - init: xnn_init_f32_scaleminmax_avx_params + init: xnn_init_f32_scaleminmax_scalar_params - name: xnn_f32_rsum_ukernel__avx_u32_acc4 - init: xnn_init_f32_scaleminmax_avx_params + init: xnn_init_f32_scaleminmax_scalar_params # x86 AVX512F - name: xnn_f32_rsum_ukernel__avx512f_u16