Skip to content

Commit

Permalink
Remove target specific versions of rsum, rdsum params
Browse the repository at this point in the history
PiperOrigin-RevId: 658913770
  • Loading branch information
dsharletg authored and xnnpack-bot committed Aug 3, 2024
1 parent c4a28da commit 00b54b6
Show file tree
Hide file tree
Showing 136 changed files with 490 additions and 519 deletions.
8 changes: 4 additions & 4 deletions bench/f16-f32acc-rdsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rdsum, f16c_c16,
xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16,
xnn_init_f16_f32acc_scale_avx_params,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRDSUM)
->UseRealTime();
Expand All @@ -62,7 +62,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rdsum, f16c_c32,
xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32,
xnn_init_f16_f32acc_scale_avx_params,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRDSUM)
->UseRealTime();
Expand All @@ -72,7 +72,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rdsum, f16c_c64,
xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64,
xnn_init_f16_f32acc_scale_avx_params,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRDSUM)
->UseRealTime();
Expand All @@ -82,7 +82,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rdsum, f16c_c128,
xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128,
xnn_init_f16_f32acc_scale_avx_params,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRDSUM)
->UseRealTime();
Expand Down
10 changes: 5 additions & 5 deletions bench/f16-f32acc-rsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u8,
xnn_f16_f32acc_rsum_ukernel__f16c_u8,
xnn_init_f16_f32acc_scale_avx_params,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand All @@ -92,7 +92,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u16_acc2,
xnn_f16_f32acc_rsum_ukernel__f16c_u16_acc2,
xnn_init_f16_f32acc_scale_avx_params,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand All @@ -102,7 +102,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u24_acc3,
xnn_f16_f32acc_rsum_ukernel__f16c_u24_acc3,
xnn_init_f16_f32acc_scale_avx_params,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand All @@ -112,7 +112,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u32_acc2,
xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc2,
xnn_init_f16_f32acc_scale_avx_params,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand All @@ -122,7 +122,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum, f16c_u32_acc4,
xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4,
xnn_init_f16_f32acc_scale_avx_params,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand Down
12 changes: 6 additions & 6 deletions bench/f32-rdsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4,
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rdsum, sse_c16,
xnn_f32_rdsum_ukernel_7p7x__sse_c16,
xnn_init_f32_scale_sse_params)
xnn_init_f32_scale_scalar_params)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
Expand All @@ -67,7 +67,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4,
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rdsum, sse_c32,
xnn_f32_rdsum_ukernel_7p7x__sse_c32,
xnn_init_f32_scale_sse_params)
xnn_init_f32_scale_scalar_params)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
Expand All @@ -76,7 +76,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4,
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rdsum, sse_c64,
xnn_f32_rdsum_ukernel_7p7x__sse_c64,
xnn_init_f32_scale_sse_params)
xnn_init_f32_scale_scalar_params)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
Expand All @@ -85,7 +85,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4,
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rdsum, avx_c16,
xnn_f32_rdsum_ukernel_7p7x__avx_c16,
xnn_init_f32_scale_avx_params,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkRDSUM)
->UseRealTime();
Expand All @@ -95,7 +95,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4,
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rdsum, avx_c32,
xnn_f32_rdsum_ukernel_7p7x__avx_c32,
xnn_init_f32_scale_avx_params,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkRDSUM)
->UseRealTime();
Expand All @@ -105,7 +105,7 @@ BENCHMARK_CAPTURE(f32_rdsum, scalar_c4,
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rdsum, avx_c64,
xnn_f32_rdsum_ukernel_7p7x__avx_c64,
xnn_init_f32_scale_avx_params,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkRDSUM)
->UseRealTime();
Expand Down
10 changes: 5 additions & 5 deletions bench/f32-rsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rsum, avx_u8,
xnn_f32_rsum_ukernel__avx_u8,
xnn_init_f32_scale_avx_params,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand All @@ -127,7 +127,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rsum, avx_u16_acc2,
xnn_f32_rsum_ukernel__avx_u16_acc2,
xnn_init_f32_scale_avx_params,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand All @@ -137,7 +137,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rsum, avx_u24_acc3,
xnn_f32_rsum_ukernel__avx_u24_acc3,
xnn_init_f32_scale_avx_params,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand All @@ -147,7 +147,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rsum, avx_u32_acc2,
xnn_f32_rsum_ukernel__avx_u32_acc2,
xnn_init_f32_scale_avx_params,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand All @@ -157,7 +157,7 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_rsum, avx_u32_acc4,
xnn_f32_rsum_ukernel__avx_u32_acc4,
xnn_init_f32_scale_avx_params,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkRSUM)
->UseRealTime();
Expand Down
12 changes: 8 additions & 4 deletions src/amalgam/gen/avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -3655,12 +3655,14 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c32(
float* output,
const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
{
const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};

assert(rows != 0);
assert(channels != 0);
assert(input != NULL);
assert(output != NULL);

const __m256 vscale = _mm256_set1_ps(params->avx.scale);
const __m256 vscale = _mm256_set1_ps(params->scale);

size_t input_increment = 7 * input_stride;
for (; channels >= 32; channels -= 32) {
Expand Down Expand Up @@ -3834,7 +3836,7 @@ void xnn_f32_rdsum_ukernel_7p7x__avx_c32(
}

if (remainder) {
vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - (channels & 0x7) * sizeof(float)));
vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - (channels & 0x7) * sizeof(float)));
vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i0[num_full_chunks*8], vmask), vacc[num_full_chunks]);
vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i1[num_full_chunks*8], vmask), vacc[num_full_chunks]);
vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i2[num_full_chunks*8], vmask), vacc[num_full_chunks]);
Expand Down Expand Up @@ -4016,6 +4018,8 @@ void xnn_f32_rsum_ukernel__avx_u32_acc4(
float* output,
const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
{
const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};

assert(batch != 0);
assert(batch % sizeof(float) == 0);
assert(input != NULL);
Expand Down Expand Up @@ -4049,14 +4053,14 @@ void xnn_f32_rsum_ukernel__avx_u32_acc4(
if XNN_UNLIKELY(batch != 0) {
assert(batch >= 1 * sizeof(float));
assert(batch <= 7 * sizeof(float));
const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - batch));
const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - batch));
const __m256 vt = _mm256_maskload_ps(input, vmask);
vacc0 = _mm256_add_ps(vacc0, vt);
}
__m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1));
vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc));
vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale));
vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scale));
*output += _mm_cvtss_f32(vacc);
}

Expand Down
4 changes: 2 additions & 2 deletions src/amalgam/gen/avx512f.c
Original file line number Diff line number Diff line change
Expand Up @@ -2012,7 +2012,7 @@ void xnn_f32_rdsum_ukernel_7p7x__avx512f_c64(
assert(input != NULL);
assert(output != NULL);

const __m512 vscale = _mm512_set1_ps(params->scalar.scale);
const __m512 vscale = _mm512_set1_ps(params->scale);

size_t input_increment = 7 * input_stride;
for (; channels >= 64; channels -= 64) {
Expand Down Expand Up @@ -2412,7 +2412,7 @@ void xnn_f32_rsum_ukernel__avx512f_u64_acc4(
__m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), _mm256_extractf128_ps(vacc256, 1));
vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc));
vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scalar.scale));
vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scale));
*output += _mm_cvtss_f32(vacc);
}

Expand Down
4 changes: 2 additions & 2 deletions src/amalgam/gen/avx512skx.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c64(
assert(input != NULL);
assert(output != NULL);

const __m512 vscale = _mm512_set1_ps(params->scalar.scale);
const __m512 vscale = _mm512_set1_ps(params->scale);

size_t input_increment = 7 * input_stride;
for (; channels >= 64; channels -= 64) {
Expand Down Expand Up @@ -343,7 +343,7 @@ void xnn_f16_f32acc_rsum_ukernel__avx512skx_u64_acc4(
__m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc256), _mm256_extractf128_ps(vacc256, 1));
vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc));
vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scalar.scale));
vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scale));

float vout = _mm_cvtss_f32(vacc);
*output += vout;
Expand Down
8 changes: 5 additions & 3 deletions src/amalgam/gen/f16c.c
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32(
assert(input != NULL);
assert(output != NULL);

const __m256 vscale = _mm256_set1_ps(params->avx.scale);
const __m256 vscale = _mm256_set1_ps(params->scale);

size_t input_increment = 7 * input_stride;
for (; channels >= 32; channels -= 32) {
Expand Down Expand Up @@ -837,6 +837,8 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4(
float* output,
const union xnn_f16_f32acc_scale_params params[restrict XNN_MIN_ELEMENTS(1)])
{
const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};

assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
Expand Down Expand Up @@ -871,7 +873,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4(
if XNN_UNLIKELY(batch != 0) {
assert(batch >= 1 * sizeof(uint16_t));
assert(batch <= 7 * sizeof(uint16_t));
const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &params->avx.mask_table[7] - batch));
const __m128i vmask = _mm_loadu_si128((const __m128i*) ((uintptr_t) &mask_table[7] - batch));
const __m128i vh = _mm_castps_si128(_mm_maskload_ps((const float*) i, vmask));
const __m256 vt = _mm256_cvtph_ps(vh);
vacc0 = _mm256_add_ps(vacc0, vt);
Expand All @@ -885,7 +887,7 @@ void xnn_f16_f32acc_rsum_ukernel__f16c_u32_acc4(
__m128 vacc = _mm_add_ps(_mm256_castps256_ps128(vacc0), _mm256_extractf128_ps(vacc0, 1));
vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
vacc = _mm_add_ss(vacc, _mm_movehdup_ps(vacc));
vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->avx.scale));
vacc = _mm_mul_ss(vacc, _mm_load_ss(&params->scale));

float vout = _mm_cvtss_f32(vacc);
*output += vout;
Expand Down
4 changes: 2 additions & 2 deletions src/amalgam/gen/neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -8055,7 +8055,7 @@ void xnn_f32_rdsum_ukernel_7p7x__neon_c16(
assert(input != NULL);
assert(output != NULL);

const float32x4_t vscale = vdupq_n_f32(params->scalar.scale);
const float32x4_t vscale = vdupq_n_f32(params->scale);

size_t input_increment = 7 * input_stride;
for (; channels >= 16; channels -= 16) {
Expand Down Expand Up @@ -8402,7 +8402,7 @@ void xnn_f32_rsum_ukernel__neon_u16_acc4(
const float32x4_t vt = vld1q_f32(input); input += 4;
vacc0 = vaddq_f32(vacc0, vt);
}
const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale);
const float32x2_t vscale = vld1_dup_f32(&params->scale);
float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0));
if XNN_UNLIKELY(batch & (2 * sizeof(float))) {
const float32x2_t vt = vld1_f32(input); input += 2;
Expand Down
4 changes: 2 additions & 2 deletions src/amalgam/gen/neonfp16arith.c
Original file line number Diff line number Diff line change
Expand Up @@ -4276,7 +4276,7 @@ void xnn_f16_f32acc_rdsum_ukernel_7p7x__neonfp16arith_c16(
assert(input != NULL);
assert(output != NULL);

const float32x4_t vscale = vdupq_n_f32(params->scalar.scale);
const float32x4_t vscale = vld1q_dup_f32(&params->scale);

size_t input_increment = 7 * input_stride;
for (; channels >= 16; channels -= 16) {
Expand Down Expand Up @@ -4532,7 +4532,7 @@ void xnn_f16_f32acc_rsum_ukernel__neonfp16arith_u32_acc4(
const float32x4_t vt = vcvt_f32_f16(vh);
vacc0 = vaddq_f32(vacc0, vt);
}
const float32x2_t vscale = vld1_dup_f32(&params->scalar.scale);
const float32x2_t vscale = vld1_dup_f32(&params->scale);
float32x2_t vacc = vadd_f32(vget_low_f32(vacc0), vget_high_f32(vacc0));
if XNN_UNLIKELY(batch & (2 * sizeof(uint16_t))) {
const float16x4_t vh = vreinterpret_f16_u32(vld1_dup_u32((const void*) i)); i += 2;
Expand Down
4 changes: 2 additions & 2 deletions src/amalgam/gen/scalar.c
Original file line number Diff line number Diff line change
Expand Up @@ -10506,7 +10506,7 @@ void xnn_f32_rdsum_ukernel_7p7x__scalar_c4(
assert(input != NULL);
assert(output != NULL);

const float vscale = params->scalar.scale;
const float vscale = params->scale;

size_t input_increment = 7 * input_stride;
for (; channels >= 4; channels -= 4) {
Expand Down Expand Up @@ -10802,7 +10802,7 @@ void xnn_f32_rsum_ukernel__scalar_u4_acc4(
batch -= sizeof(float);
} while (batch != 0);
}
const float vscale = params->scalar.scale;
const float vscale = params->scale;
vacc0 *= vscale;
*output += vacc0;
}
Expand Down
4 changes: 2 additions & 2 deletions src/amalgam/gen/sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -7389,7 +7389,7 @@ void xnn_f32_rdsum_ukernel_7p7x__sse_c16(
assert(input != NULL);
assert(output != NULL);

const __m128 vscale = _mm_load_ps(params->sse.scale);
const __m128 vscale = _mm_set1_ps(params->scale);

size_t input_increment = 7 * input_stride;
for (; channels >= 16; channels -= 16) {
Expand Down Expand Up @@ -7759,7 +7759,7 @@ void xnn_f32_rsum_ukernel__sse_u16_acc4(
} while (batch != 0);
}
vacc0 = _mm_add_ss(vacc0, _mm_shuffle_ps(vacc0, vacc0, _MM_SHUFFLE(1, 1, 1, 1)));
vacc0 = _mm_mul_ss(vacc0, _mm_load_ss(&params->scalar.scale));
vacc0 = _mm_mul_ss(vacc0, _mm_load_ss(&params->scale));
*output += _mm_cvtss_f32(vacc0);
}

Expand Down
4 changes: 2 additions & 2 deletions src/amalgam/gen/wasmsimd.c
Original file line number Diff line number Diff line change
Expand Up @@ -22409,7 +22409,7 @@ void xnn_f32_rdsum_ukernel_7p7x__wasmsimd_c16(
assert(input != NULL);
assert(output != NULL);

const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale);
const v128_t vscale = wasm_v128_load32_splat(&params->scale);

size_t input_increment = 7 * input_stride;
for (; channels >= 16; channels -= 16) {
Expand Down Expand Up @@ -22779,7 +22779,7 @@ void xnn_f32_rsum_ukernel__wasmsimd_u16_acc4(
const v128_t vt = wasm_v128_load32_zero(input);
vacc0 = wasm_f32x4_add(vacc0, vt);
}
const v128_t vscale = wasm_v128_load32_zero(&params->scalar.scale);
const v128_t vscale = wasm_v128_load32_zero(&params->scale);
vacc0 = wasm_f32x4_mul(vacc0, vscale);
*output += wasm_f32x4_extract_lane(vacc0, 0);
}
Expand Down
Loading

0 comments on commit 00b54b6

Please sign in to comment.