From c2f37bed33e0c38f7705ad24871f4e7ddad8bd24 Mon Sep 17 00:00:00 2001
From: Fanchen Kong
Date: Tue, 23 Jul 2024 13:36:17 +0800
Subject: [PATCH] Add qd8-f32-qc8w gemm microkernels for Wasm relaxed SIMD
 unsigned and signed dot product

---
 bench/qd8-f32-qc8w-gemm.cc                    | 132 +++++++
 cmake/gen/wasmrelaxedsimd_microkernels.cmake  |  12 +
 gen/wasmrelaxedsimd_microkernels.bzl          |  12 +
 scripts/generate-qs8-gemm.sh                  |  15 +
 ...qd8-f32-qc8w-gemm-1x4c16-minmax-wasmsdot.c |   4 +-
 ...d8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c | 123 +++++++
 ...qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c | 161 +++++++++
 ...d8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c | 162 +++++++++
 ...qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c |   4 +-
 ...d8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c | 157 +++++++++
 ...qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c | 216 ++++++++++++
 ...d8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c | 217 ++++++++++++
 ...qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c |   4 +-
 ...d8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c | 191 ++++++++++
 ...qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c | 271 +++++++++++++++
 ...d8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c | 272 +++++++++++++++
 ...qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c |   4 +-
 ...d8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c | 225 ++++++++++++
 ...qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c | 326 +++++++++++++++++
 ...d8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c | 327 ++++++++++++++++++
 src/qs8-gemm/MRx4c16-wasmdot.c.in             |  49 ++-
 src/xnnpack/gemm.h                            |  12 +
 test/qd8-f32-qc8w-gemm-minmax-2.cc            |  22 ++
 test/qd8-f32-qc8w-gemm-minmax-3.cc            |  19 +
 test/qd8-f32-qc8w-gemm-minmax-4.cc            | 152 ++++++++
 test/qd8-f32-qc8w-gemm-minmax.cc              |  38 ++
 test/qd8-f32-qc8w-gemm-minmax.yaml            |  64 +++-
 27 files changed, 3159 insertions(+), 32 deletions(-)
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c
 create mode 100644 src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c

diff --git a/bench/qd8-f32-qc8w-gemm.cc b/bench/qd8-f32-qc8w-gemm.cc
index c5f4beb5492..cc5d8edd390 100644
--- a/bench/qd8-f32-qc8w-gemm.cc
+++ b/bench/qd8-f32-qc8w-gemm.cc
@@ -2626,6 +2626,39 @@
   BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmsdot)
 
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/1, /*nr=*/4, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmUSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot)
+
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/1, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot)
+
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/1, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmUSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot)
 
   static void qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state,
       xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot,
@@ -2637,6 +2670,39 @@
   BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot)
 
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/2, /*nr=*/4, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmUSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot)
+
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/2, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot)
+
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/2, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmUSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot)
 
   static void qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state,
       xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot,
@@ -2648,6 +2714,39 @@
   BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot)
 
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/3, /*nr=*/4, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmUSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot)
+
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/3, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot)
+
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/3, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmUSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot)
 
   static void qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot(benchmark::State& state, const char* net) {
     GEMMBenchmark(state,
       xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot,
@@ -2658,6 +2757,39 @@
   }
 
   BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot)
+
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/4, /*nr=*/4, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmUSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot)
+
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/4, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot)
+
+  static void qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state,
+      xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot,
+      xnn_init_f32_minmax_wasmsimd_params,
+      xnn_pack_qs8_gemm_goi_w,
+      /*mr=*/4, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+      benchmark::utils::CheckWAsmUSDOT);
+  }
+
+  BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot)
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
diff --git a/cmake/gen/wasmrelaxedsimd_microkernels.cmake b/cmake/gen/wasmrelaxedsimd_microkernels.cmake
index a44b9e21aa9..2e8b319b6f0 100644
--- a/cmake/gen/wasmrelaxedsimd_microkernels.cmake
+++ b/cmake/gen/wasmrelaxedsimd_microkernels.cmake
@@ -431,9 +431,21 @@ SET(ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS
   src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-u20.c
   src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-u24.c
   src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmsdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c
   src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c
   src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c
   src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c
+  src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c
   src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x4c16-minmax-wasmsdot.c
   src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x4c16-minmax-wasmsdot.c
   src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-3x4c16-minmax-wasmsdot.c
diff --git a/gen/wasmrelaxedsimd_microkernels.bzl b/gen/wasmrelaxedsimd_microkernels.bzl
index dc6abbcf1cf..edfc51119b7 100644
--- a/gen/wasmrelaxedsimd_microkernels.bzl
+++ b/gen/wasmrelaxedsimd_microkernels.bzl
@@ -427,9 +427,21 @@ ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [
     "src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-u20.c",
     "src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-u24.c",
     "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmsdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c",
     "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c",
     "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c",
     "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c",
+    "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c",
     "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x4c16-minmax-wasmsdot.c",
     "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x4c16-minmax-wasmsdot.c",
     "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-3x4c16-minmax-wasmsdot.c",
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index a5acb1093ad..864e38fe893 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -271,6 +271,11 @@ tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 -D NR=4 -D REQUANTIZATION=
 tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 -D NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c &
 tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 -D NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=1 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c &
+
 tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=1 -D NR=4 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmusdot.c &
 tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 -D NR=4 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c16-minmax-fp32-wasmusdot.c &
 tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 -D NR=4 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c16-minmax-fp32-wasmusdot.c &
@@ -281,6 +286,16 @@ tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 -D NR=8 -D REQUANTIZATION=FP32
 tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 -D NR=8 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c16-minmax-fp32-wasmusdot.c &
 tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 -D NR=8 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c16-minmax-fp32-wasmusdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=1 -D NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 -D NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 -D NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 -D NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c &
+
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=1 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c &
+tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 -D NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c &
+
 ################################### ARM NEON ##################################
 tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION= -D DATATYPE=QD8 -D ARMV8=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8-minmax-neon-mlal-lane.c &
 tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=2 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION= -D DATATYPE=QD8 -D ARMV8=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8-minmax-neon-mlal-lane.c &
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmsdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmsdot.c
index 9d8629c6573..73009c2985b 100644
--- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmsdot.c
+++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmsdot.c
@@ -86,12 +86,12 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmsdot(
     vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0);
 
     const v128_t vfilter_output_scale0123 = wasm_v128_load(w);
-    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123);
     w = (const float*) w + 4;
+    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123);
 
     const v128_t vbias0123 = wasm_v128_load(w);
-    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123);
     w = (const float*) w + 4;
+    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123);
 
     const v128_t vmin =
wasm_v128_load64_splat(params->wasmsimd.min); vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c new file mode 100644 index 00000000000..01feb4beec2 --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c @@ -0,0 +1,123 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + w = (const int32_t*) w + 4; + + size_t k = kc; + do { + const v128_t va0 = wasm_v128_xor(wasm_v128_load(a0), vsign_mask); + a0 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16); + + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + + w = (const int8_t*) w + 64; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + + const v128_t vfilter_output_scale0123 = 
wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + + const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + + if XNN_LIKELY(nc >= 4) { + wasm_v128_store(c0, vacc0x0123); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + + nc -= 4; + } else { + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c new file mode 100644 index 00000000000..4d4972bfff6 --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c @@ -0,0 +1,161 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + v128_t vksum4 = wasm_v128_load32_zero((const int32_t*) w + 4); + v128_t vksum5 = wasm_v128_load32_zero((const int32_t*) w + 5); + v128_t vksum6 = wasm_v128_load32_zero((const int32_t*) w + 6); + v128_t vksum7 = wasm_v128_load32_zero((const int32_t*) w + 7); + const v128_t vinput_zero_point0 = wasm_v128_load32_splat(&quantization_params[0].zero_point); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc0x4 = wasm_i32x4_mul(vksum4, vinput_zero_point0); + v128_t vacc0x5 = wasm_i32x4_mul(vksum5, vinput_zero_point0); + v128_t vacc0x6 = wasm_i32x4_mul(vksum6, vinput_zero_point0); + v128_t vacc0x7 = wasm_i32x4_mul(vksum7, vinput_zero_point0); + w = (const int32_t*) w + 8; + + size_t k = kc; + do { + const v128_t va0 = wasm_v128_load(a0); + a0 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16); + + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + const v128_t vb4 = wasm_v128_load((const int8_t*) w + 64); + + vacc0x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va0, vacc0x4); + const v128_t vb5 = wasm_v128_load((const int8_t*) w + 80); + + vacc0x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va0, vacc0x5); + const v128_t vb6 = wasm_v128_load((const int8_t*) w + 96); + + vacc0x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va0, vacc0x6); + const v128_t vb7 = wasm_v128_load((const int8_t*) w + 112); + + vacc0x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va0, vacc0x7); + + w = (const int8_t*) w + 128; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + const v128_t vacc0x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x4, vacc0x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x4, vacc0x6, 2, 6, 3, 7)); + const v128_t vacc0x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x5, vacc0x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x5, vacc0x7, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + v128_t vacc0x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x46, vacc0x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x46, vacc0x57, 2, 6, 3, 7)); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin); + + const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax); + + if XNN_LIKELY(nc >= 8) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c0 + 4, vacc0x4567); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + + nc -= 8; + } else { + if (nc & 4) { + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x4567; + c0 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + 
vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c new file mode 100644 index 00000000000..6856461d026 --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c @@ -0,0 +1,162 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + v128_t vksum4 = wasm_v128_load32_zero((const int32_t*) w + 4); + v128_t vksum5 = wasm_v128_load32_zero((const int32_t*) w + 5); + v128_t vksum6 = wasm_v128_load32_zero((const int32_t*) w + 6); + v128_t vksum7 = wasm_v128_load32_zero((const int32_t*) w + 7); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc0x4 = wasm_i32x4_mul(vksum4, vinput_zero_point0); + v128_t vacc0x5 = wasm_i32x4_mul(vksum5, vinput_zero_point0); + v128_t vacc0x6 = wasm_i32x4_mul(vksum6, vinput_zero_point0); + v128_t vacc0x7 = wasm_i32x4_mul(vksum7, vinput_zero_point0); + w = (const int32_t*) w + 8; + + size_t k = kc; + do { + const v128_t va0 = wasm_v128_xor(wasm_v128_load(a0), vsign_mask); + a0 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16); + + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + const v128_t vb4 = wasm_v128_load((const int8_t*) w + 64); + + vacc0x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va0, vacc0x4); + const v128_t vb5 = 
wasm_v128_load((const int8_t*) w + 80); + + vacc0x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va0, vacc0x5); + const v128_t vb6 = wasm_v128_load((const int8_t*) w + 96); + + vacc0x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va0, vacc0x6); + const v128_t vb7 = wasm_v128_load((const int8_t*) w + 112); + + vacc0x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va0, vacc0x7); + + w = (const int8_t*) w + 128; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + const v128_t vacc0x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x4, vacc0x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x4, vacc0x6, 2, 6, 3, 7)); + const v128_t vacc0x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x5, vacc0x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x5, vacc0x7, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + v128_t vacc0x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x46, vacc0x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x46, vacc0x57, 2, 6, 3, 7)); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin); + + const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax); + + if XNN_LIKELY(nc >= 8) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c0 + 4, vacc0x4567); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + + nc -= 8; + } else { + if (nc & 4) { + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x4567; + c0 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c index c6c4d50f440..33d622ca7d4 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c @@ -109,14 +109,14 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot( vacc1x0123 = 
wasm_f32x4_mul(vacc1x0123, vinput_scale1); const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); - w = (const float*) w + 4; const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); - w = (const float*) w + 4; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c new file mode 100644 index 00000000000..9e54da6704b --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c @@ -0,0 +1,157 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 2); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr != 2) { + a1 = a0; + c1 = c0; + } + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + const v128_t vinput_zero_point1 = wasm_i32x4_splat((int32_t) quantization_params[1].zero_point + 128); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc1x0 = wasm_i32x4_mul(vksum0, vinput_zero_point1); + v128_t vacc1x1 = wasm_i32x4_mul(vksum1, vinput_zero_point1); + v128_t vacc1x2 = wasm_i32x4_mul(vksum2, vinput_zero_point1); + v128_t vacc1x3 = wasm_i32x4_mul(vksum3, vinput_zero_point1); + w = (const int32_t*) w + 4; + + size_t k = kc; + do { + const v128_t va0 = wasm_v128_xor(wasm_v128_load(a0), vsign_mask); + a0 += 16; + const v128_t va1 = wasm_v128_xor(wasm_v128_load(a1), vsign_mask); + a1 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + vacc1x0 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va1, vacc1x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16); + + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + vacc1x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va1, vacc1x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + vacc1x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va1, vacc1x2); + const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + vacc1x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va1, vacc1x3); + + w = (const int8_t*) w + 64; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7)); + const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin); + + const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax); + + if XNN_LIKELY(nc >= 4) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c1, vacc1x0123); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + + nc -= 4; + } else { + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + wasm_v128_store64_lane(c1, vacc1x0123, 0); + vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); + c1 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + wasm_v128_store32_lane(c1, vacc1x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git 
a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c new file mode 100644 index 00000000000..f72df7bf8d0 --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c @@ -0,0 +1,216 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 2); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr != 2) { + a1 = a0; + c1 = c0; + } + + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + v128_t vksum4 = wasm_v128_load32_zero((const int32_t*) w + 4); + v128_t vksum5 = wasm_v128_load32_zero((const int32_t*) w + 5); + v128_t vksum6 = wasm_v128_load32_zero((const int32_t*) w + 6); + v128_t vksum7 = wasm_v128_load32_zero((const int32_t*) w + 7); + const v128_t vinput_zero_point0 = wasm_v128_load32_splat(&quantization_params[0].zero_point); + const v128_t vinput_zero_point1 = wasm_v128_load32_splat(&quantization_params[1].zero_point); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc0x4 = wasm_i32x4_mul(vksum4, vinput_zero_point0); + v128_t vacc0x5 = wasm_i32x4_mul(vksum5, vinput_zero_point0); + v128_t vacc0x6 = wasm_i32x4_mul(vksum6, vinput_zero_point0); + v128_t vacc0x7 = wasm_i32x4_mul(vksum7, vinput_zero_point0); + v128_t vacc1x0 = wasm_i32x4_mul(vksum0, vinput_zero_point1); + v128_t vacc1x1 = wasm_i32x4_mul(vksum1, vinput_zero_point1); + v128_t vacc1x2 = wasm_i32x4_mul(vksum2, vinput_zero_point1); + v128_t vacc1x3 = wasm_i32x4_mul(vksum3, vinput_zero_point1); + v128_t vacc1x4 = wasm_i32x4_mul(vksum4, vinput_zero_point1); + v128_t vacc1x5 = wasm_i32x4_mul(vksum5, vinput_zero_point1); + v128_t vacc1x6 = wasm_i32x4_mul(vksum6, vinput_zero_point1); + v128_t vacc1x7 = wasm_i32x4_mul(vksum7, vinput_zero_point1); + w = (const int32_t*) w + 8; + + size_t k = kc; + do { + const v128_t va0 = wasm_v128_load(a0); + a0 += 16; + const v128_t va1 = wasm_v128_load(a1); + a1 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + vacc1x0 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va1, vacc1x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16); + + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + vacc1x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va1, vacc1x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + vacc1x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va1, vacc1x2); + const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + vacc1x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va1, vacc1x3); + const v128_t vb4 = wasm_v128_load((const int8_t*) w + 64); + + vacc0x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va0, vacc0x4); + vacc1x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va1, vacc1x4); + const v128_t vb5 = wasm_v128_load((const int8_t*) w + 80); + + vacc0x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va0, vacc0x5); + vacc1x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va1, vacc1x5); + const v128_t vb6 = wasm_v128_load((const int8_t*) w + 96); + + vacc0x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va0, vacc0x6); + vacc1x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va1, vacc1x6); + const v128_t vb7 = wasm_v128_load((const int8_t*) w + 112); + + vacc0x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va0, vacc0x7); + vacc1x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va1, vacc1x7); + + w = (const int8_t*) w + 128; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + const v128_t vacc0x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x4, vacc0x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x4, vacc0x6, 2, 6, 3, 7)); + const v128_t vacc0x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x5, vacc0x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x5, vacc0x7, 2, 6, 3, 7)); + const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7)); + const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7)); + const v128_t vacc1x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x4, vacc1x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x4, vacc1x6, 2, 6, 3, 7)); + const v128_t vacc1x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x5, vacc1x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x5, vacc1x7, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + v128_t vacc0x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x46, vacc0x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x46, vacc0x57, 2, 6, 3, 7)); + v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); + v128_t vacc1x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x46, vacc1x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x46, vacc1x57, 2, 6, 3, 7)); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + + const v128_t vinput_scale0 = 
wasm_v128_load32_splat(&quantization_params[0].inv_scale); + const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vinput_scale1); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vfilter_output_scale4567); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vbias4567); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin); + vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin); + vacc1x4567 = wasm_f32x4_pmax(vacc1x4567, vmin); + + const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax); + vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax); + vacc1x4567 = wasm_f32x4_pmin(vacc1x4567, vmax); + + if XNN_LIKELY(nc >= 8) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c0 + 4, vacc0x4567); + wasm_v128_store(c1, vacc1x0123); + wasm_v128_store(c1 + 4, vacc1x4567); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + + nc -= 8; + } else { + if (nc & 4) { + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x4567; + c0 += 4; + wasm_v128_store(c1, vacc1x0123); + vacc1x0123 = vacc1x4567; + c1 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + wasm_v128_store64_lane(c1, vacc1x0123, 0); + vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); + c1 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + wasm_v128_store32_lane(c1, vacc1x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c new file mode 100644 index 00000000000..b784d88af94 --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c @@ -0,0 +1,217 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 2); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr != 2) { + a1 = a0; + c1 = c0; + } + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + v128_t vksum4 = wasm_v128_load32_zero((const int32_t*) w + 4); + v128_t vksum5 = wasm_v128_load32_zero((const int32_t*) w + 5); + v128_t vksum6 = wasm_v128_load32_zero((const int32_t*) w + 6); + v128_t vksum7 = wasm_v128_load32_zero((const int32_t*) w + 7); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + const v128_t vinput_zero_point1 = wasm_i32x4_splat((int32_t) quantization_params[1].zero_point + 128); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc0x4 = wasm_i32x4_mul(vksum4, vinput_zero_point0); + v128_t vacc0x5 = wasm_i32x4_mul(vksum5, vinput_zero_point0); + v128_t vacc0x6 = wasm_i32x4_mul(vksum6, vinput_zero_point0); + v128_t vacc0x7 = wasm_i32x4_mul(vksum7, vinput_zero_point0); + v128_t vacc1x0 = wasm_i32x4_mul(vksum0, vinput_zero_point1); + v128_t vacc1x1 = wasm_i32x4_mul(vksum1, vinput_zero_point1); + v128_t vacc1x2 = wasm_i32x4_mul(vksum2, vinput_zero_point1); + v128_t vacc1x3 = wasm_i32x4_mul(vksum3, vinput_zero_point1); + v128_t vacc1x4 = wasm_i32x4_mul(vksum4, vinput_zero_point1); + v128_t vacc1x5 = wasm_i32x4_mul(vksum5, vinput_zero_point1); + v128_t vacc1x6 = wasm_i32x4_mul(vksum6, vinput_zero_point1); + v128_t vacc1x7 = wasm_i32x4_mul(vksum7, vinput_zero_point1); + w = (const int32_t*) w + 8; + + size_t k = kc; + do { + const v128_t va0 = wasm_v128_xor(wasm_v128_load(a0), vsign_mask); + a0 += 16; + const v128_t va1 = wasm_v128_xor(wasm_v128_load(a1), vsign_mask); + a1 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + vacc1x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va1, vacc1x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16); + + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + vacc1x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va1, vacc1x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + vacc1x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va1, vacc1x2); + 
const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + vacc1x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va1, vacc1x3); + const v128_t vb4 = wasm_v128_load((const int8_t*) w + 64); + + vacc0x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va0, vacc0x4); + vacc1x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va1, vacc1x4); + const v128_t vb5 = wasm_v128_load((const int8_t*) w + 80); + + vacc0x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va0, vacc0x5); + vacc1x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va1, vacc1x5); + const v128_t vb6 = wasm_v128_load((const int8_t*) w + 96); + + vacc0x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va0, vacc0x6); + vacc1x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va1, vacc1x6); + const v128_t vb7 = wasm_v128_load((const int8_t*) w + 112); + + vacc0x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va0, vacc0x7); + vacc1x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va1, vacc1x7); + + w = (const int8_t*) w + 128; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + const v128_t vacc0x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x4, vacc0x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x4, vacc0x6, 2, 6, 3, 7)); + const v128_t vacc0x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x5, vacc0x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x5, vacc0x7, 2, 6, 3, 7)); + const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7)); + const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7)); + const v128_t vacc1x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x4, vacc1x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x4, vacc1x6, 2, 6, 3, 7)); + const v128_t vacc1x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x5, vacc1x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x5, vacc1x7, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + v128_t vacc0x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x46, vacc0x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x46, vacc0x57, 2, 6, 3, 7)); + v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); + v128_t vacc1x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x46, vacc1x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x46, vacc1x57, 2, 6, 3, 7)); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vinput_scale1); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t 
vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vfilter_output_scale4567); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vbias4567); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin); + vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin); + vacc1x4567 = wasm_f32x4_pmax(vacc1x4567, vmin); + + const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax); + vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax); + vacc1x4567 = wasm_f32x4_pmin(vacc1x4567, vmax); + + if XNN_LIKELY(nc >= 8) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c0 + 4, vacc0x4567); + wasm_v128_store(c1, vacc1x0123); + wasm_v128_store(c1 + 4, vacc1x4567); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + + nc -= 8; + } else { + if (nc & 4) { + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x4567; + c0 += 4; + wasm_v128_store(c1, vacc1x0123); + vacc1x0123 = vacc1x4567; + c1 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + wasm_v128_store64_lane(c1, vacc1x0123, 0); + vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); + c1 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + wasm_v128_store32_lane(c1, vacc1x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c index 3ab76d3b9f9..ee3c1eee4bf 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c @@ -132,16 +132,16 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot( vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale2); const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123); - w = (const float*) w + 4; const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123); - w = (const float*) w + 4; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c new file mode 
100644 index 00000000000..7531c802810 --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c @@ -0,0 +1,191 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <wasm_simd128.h> + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + const v128_t vinput_zero_point1 = wasm_i32x4_splat((int32_t) quantization_params[1].zero_point + 128); + const v128_t vinput_zero_point2 = wasm_i32x4_splat((int32_t) quantization_params[2].zero_point + 128); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc1x0 = wasm_i32x4_mul(vksum0, vinput_zero_point1); + v128_t vacc1x1 = wasm_i32x4_mul(vksum1, vinput_zero_point1); + v128_t vacc1x2 = wasm_i32x4_mul(vksum2, vinput_zero_point1); + v128_t vacc1x3 = wasm_i32x4_mul(vksum3, vinput_zero_point1); + v128_t vacc2x0 = wasm_i32x4_mul(vksum0, vinput_zero_point2); + v128_t vacc2x1 = wasm_i32x4_mul(vksum1, vinput_zero_point2); + v128_t vacc2x2 = wasm_i32x4_mul(vksum2, vinput_zero_point2); + v128_t vacc2x3 = wasm_i32x4_mul(vksum3, vinput_zero_point2); + w = (const int32_t*) w + 4; + + size_t k = kc; + do { + const v128_t va0 = wasm_v128_xor(wasm_v128_load(a0), vsign_mask); + a0 += 16; + const v128_t va1 = wasm_v128_xor(wasm_v128_load(a1), vsign_mask); + a1 += 16; + const v128_t va2 = wasm_v128_xor(wasm_v128_load(a2), vsign_mask); + a2 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + vacc1x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va1, vacc1x0); + vacc2x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va2, vacc2x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16);
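+ // NOTE (editorial): the c16 layout advances each activation pointer by 16 bytes per iteration, so one pass of the k loop consumes 16 values of k for every row and column.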
+ + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + vacc1x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va1, vacc1x1); + vacc2x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va2, vacc2x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + vacc1x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va1, vacc1x2); + vacc2x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va2, vacc2x2); + const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + vacc1x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va1, vacc1x3); + vacc2x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va2, vacc2x3); + + w = (const int8_t*) w + 64; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7)); + const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7)); + const v128_t vacc2x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, vacc2x2, 2, 6, 3, 7)); + const v128_t vacc2x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x1, vacc2x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x1, vacc2x3, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); + v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7)); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale); + const v128_t vinput_scale2 = wasm_v128_load32_splat(&quantization_params[2].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale2); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin); + vacc2x0123 = wasm_f32x4_pmax(vacc2x0123, vmin); + + const v128_t vmax = 
wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax); + vacc2x0123 = wasm_f32x4_pmin(vacc2x0123, vmax); + + if XNN_LIKELY(nc >= 4) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c1, vacc1x0123); + wasm_v128_store(c2, vacc2x0123); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + + nc -= 4; + } else { + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + wasm_v128_store64_lane(c1, vacc1x0123, 0); + vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); + c1 += 2; + wasm_v128_store64_lane(c2, vacc2x0123, 0); + vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); + c2 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + wasm_v128_store32_lane(c1, vacc1x0123, 0); + wasm_v128_store32_lane(c2, vacc2x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c new file mode 100644 index 00000000000..1828b53e36b --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c @@ -0,0 +1,271 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
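+// +// NOTE (editorial): this sdot variant feeds the signed activations straight to the relaxed dot product and multiplies the packed column sums by the raw input zero point; the usdot variants instead flip the activation sign bit and add 128 to the zero point.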
+ +#include <assert.h> + +#include <wasm_simd128.h> + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + v128_t vksum4 = wasm_v128_load32_zero((const int32_t*) w + 4); + v128_t vksum5 = wasm_v128_load32_zero((const int32_t*) w + 5); + v128_t vksum6 = wasm_v128_load32_zero((const int32_t*) w + 6); + v128_t vksum7 = wasm_v128_load32_zero((const int32_t*) w + 7); + const v128_t vinput_zero_point0 = wasm_v128_load32_splat(&quantization_params[0].zero_point); + const v128_t vinput_zero_point1 = wasm_v128_load32_splat(&quantization_params[1].zero_point); + const v128_t vinput_zero_point2 = wasm_v128_load32_splat(&quantization_params[2].zero_point); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc0x4 = wasm_i32x4_mul(vksum4, vinput_zero_point0); + v128_t vacc0x5 = wasm_i32x4_mul(vksum5, vinput_zero_point0); + v128_t vacc0x6 = wasm_i32x4_mul(vksum6, vinput_zero_point0); + v128_t vacc0x7 = wasm_i32x4_mul(vksum7, vinput_zero_point0); + v128_t vacc1x0 = wasm_i32x4_mul(vksum0, vinput_zero_point1); + v128_t vacc1x1 = wasm_i32x4_mul(vksum1, vinput_zero_point1); + v128_t vacc1x2 = wasm_i32x4_mul(vksum2, vinput_zero_point1); + v128_t vacc1x3 = wasm_i32x4_mul(vksum3, vinput_zero_point1); + v128_t vacc1x4 = wasm_i32x4_mul(vksum4, vinput_zero_point1); + v128_t vacc1x5 = wasm_i32x4_mul(vksum5, vinput_zero_point1); + v128_t vacc1x6 = wasm_i32x4_mul(vksum6, vinput_zero_point1); + v128_t vacc1x7 = wasm_i32x4_mul(vksum7, vinput_zero_point1); + v128_t vacc2x0 = wasm_i32x4_mul(vksum0, vinput_zero_point2); + v128_t vacc2x1 = wasm_i32x4_mul(vksum1, vinput_zero_point2); + v128_t vacc2x2 = wasm_i32x4_mul(vksum2, vinput_zero_point2); + v128_t vacc2x3 = wasm_i32x4_mul(vksum3, vinput_zero_point2); + v128_t vacc2x4 = wasm_i32x4_mul(vksum4, vinput_zero_point2); + v128_t vacc2x5 = wasm_i32x4_mul(vksum5, vinput_zero_point2); + v128_t vacc2x6 = wasm_i32x4_mul(vksum6, vinput_zero_point2); + v128_t vacc2x7 = wasm_i32x4_mul(vksum7, vinput_zero_point2); + w = (const int32_t*) w + 8; + + size_t k = kc; + do { + const v128_t va0 = wasm_v128_load(a0); + a0 += 16; + const v128_t 
va1 = wasm_v128_load(a1); + a1 += 16; + const v128_t va2 = wasm_v128_load(a2); + a2 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + vacc1x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va1, vacc1x0); + vacc2x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va2, vacc2x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16); + + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + vacc1x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va1, vacc1x1); + vacc2x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va2, vacc2x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + vacc1x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va1, vacc1x2); + vacc2x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va2, vacc2x2); + const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + vacc1x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va1, vacc1x3); + vacc2x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va2, vacc2x3); + const v128_t vb4 = wasm_v128_load((const int8_t*) w + 64); + + vacc0x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va0, vacc0x4); + vacc1x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va1, vacc1x4); + vacc2x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va2, vacc2x4); + const v128_t vb5 = wasm_v128_load((const int8_t*) w + 80); + + vacc0x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va0, vacc0x5); + vacc1x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va1, vacc1x5); + vacc2x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va2, vacc2x5); + const v128_t vb6 = wasm_v128_load((const int8_t*) w + 96); + + vacc0x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va0, vacc0x6); + vacc1x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va1, vacc1x6); + vacc2x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va2, vacc2x6); + const v128_t vb7 = wasm_v128_load((const int8_t*) w + 112); + + vacc0x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va0, vacc0x7); + vacc1x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va1, vacc1x7); + vacc2x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va2, vacc2x7); + + w = (const int8_t*) w + 128; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + const v128_t vacc0x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x4, vacc0x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x4, vacc0x6, 2, 6, 3, 7)); + const v128_t vacc0x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x5, vacc0x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x5, vacc0x7, 2, 6, 3, 7)); + const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7)); + const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7)); + const v128_t vacc1x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x4, vacc1x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x4, vacc1x6, 2, 6, 3, 7)); + const v128_t vacc1x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x5, vacc1x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x5, vacc1x7, 2, 6, 3, 7)); + const v128_t vacc2x02 = 
wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, vacc2x2, 2, 6, 3, 7)); + const v128_t vacc2x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x1, vacc2x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x1, vacc2x3, 2, 6, 3, 7)); + const v128_t vacc2x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x4, vacc2x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x4, vacc2x6, 2, 6, 3, 7)); + const v128_t vacc2x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x5, vacc2x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x5, vacc2x7, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + v128_t vacc0x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x46, vacc0x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x46, vacc0x57, 2, 6, 3, 7)); + v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); + v128_t vacc1x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x46, vacc1x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x46, vacc1x57, 2, 6, 3, 7)); + v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7)); + v128_t vacc2x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x46, vacc2x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x46, vacc2x57, 2, 6, 3, 7)); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale); + const v128_t vinput_scale2 = wasm_v128_load32_splat(&quantization_params[2].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vinput_scale1); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale2); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vinput_scale2); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vfilter_output_scale4567); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vfilter_output_scale4567); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vbias4567); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123); + vacc2x4567 = wasm_f32x4_add(vacc2x4567, vbias4567); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc0x4567 = 
wasm_f32x4_pmax(vacc0x4567, vmin); + vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin); + vacc1x4567 = wasm_f32x4_pmax(vacc1x4567, vmin); + vacc2x0123 = wasm_f32x4_pmax(vacc2x0123, vmin); + vacc2x4567 = wasm_f32x4_pmax(vacc2x4567, vmin); + + const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax); + vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax); + vacc1x4567 = wasm_f32x4_pmin(vacc1x4567, vmax); + vacc2x0123 = wasm_f32x4_pmin(vacc2x0123, vmax); + vacc2x4567 = wasm_f32x4_pmin(vacc2x4567, vmax); + + if XNN_LIKELY(nc >= 8) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c0 + 4, vacc0x4567); + wasm_v128_store(c1, vacc1x0123); + wasm_v128_store(c1 + 4, vacc1x4567); + wasm_v128_store(c2, vacc2x0123); + wasm_v128_store(c2 + 4, vacc2x4567); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + + nc -= 8; + } else { + if (nc & 4) { + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x4567; + c0 += 4; + wasm_v128_store(c1, vacc1x0123); + vacc1x0123 = vacc1x4567; + c1 += 4; + wasm_v128_store(c2, vacc2x0123); + vacc2x0123 = vacc2x4567; + c2 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + wasm_v128_store64_lane(c1, vacc1x0123, 0); + vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); + c1 += 2; + wasm_v128_store64_lane(c2, vacc2x0123, 0); + vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); + c2 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + wasm_v128_store32_lane(c1, vacc1x0123, 0); + wasm_v128_store32_lane(c2, vacc2x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c new file mode 100644 index 00000000000..1b4c73ca594 --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c @@ -0,0 +1,272 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
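+// +// NOTE (editorial): this usdot variant rebases signed int8 activations to unsigned via XOR with 0x80 (equivalent to adding 128); initializing the accumulators with vksum*(zero_point+128) cancels the extra 128 per lane, assuming the packed vksum values carry the per-column weight sums under the packing routine's sign convention.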
+ +#include <assert.h> + +#include <wasm_simd128.h> + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + v128_t vksum4 = wasm_v128_load32_zero((const int32_t*) w + 4); + v128_t vksum5 = wasm_v128_load32_zero((const int32_t*) w + 5); + v128_t vksum6 = wasm_v128_load32_zero((const int32_t*) w + 6); + v128_t vksum7 = wasm_v128_load32_zero((const int32_t*) w + 7); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + const v128_t vinput_zero_point1 = wasm_i32x4_splat((int32_t) quantization_params[1].zero_point + 128); + const v128_t vinput_zero_point2 = wasm_i32x4_splat((int32_t) quantization_params[2].zero_point + 128); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc0x4 = wasm_i32x4_mul(vksum4, vinput_zero_point0); + v128_t vacc0x5 = wasm_i32x4_mul(vksum5, vinput_zero_point0); + v128_t vacc0x6 = wasm_i32x4_mul(vksum6, vinput_zero_point0); + v128_t vacc0x7 = wasm_i32x4_mul(vksum7, vinput_zero_point0); + v128_t vacc1x0 = wasm_i32x4_mul(vksum0, vinput_zero_point1); + v128_t vacc1x1 = wasm_i32x4_mul(vksum1, vinput_zero_point1); + v128_t vacc1x2 = wasm_i32x4_mul(vksum2, vinput_zero_point1); + v128_t vacc1x3 = wasm_i32x4_mul(vksum3, vinput_zero_point1); + v128_t vacc1x4 = wasm_i32x4_mul(vksum4, vinput_zero_point1); + v128_t vacc1x5 = wasm_i32x4_mul(vksum5, vinput_zero_point1); + v128_t vacc1x6 = wasm_i32x4_mul(vksum6, vinput_zero_point1); + v128_t vacc1x7 = wasm_i32x4_mul(vksum7, vinput_zero_point1); + v128_t vacc2x0 = wasm_i32x4_mul(vksum0, vinput_zero_point2); + v128_t vacc2x1 = wasm_i32x4_mul(vksum1, vinput_zero_point2); + v128_t vacc2x2 = wasm_i32x4_mul(vksum2, vinput_zero_point2); + v128_t vacc2x3 = wasm_i32x4_mul(vksum3, vinput_zero_point2); + v128_t vacc2x4 = wasm_i32x4_mul(vksum4, vinput_zero_point2); + v128_t vacc2x5 = wasm_i32x4_mul(vksum5, vinput_zero_point2); + v128_t vacc2x6 = wasm_i32x4_mul(vksum6, vinput_zero_point2); + v128_t vacc2x7 = wasm_i32x4_mul(vksum7, vinput_zero_point2); + w = (const int32_t*) w + 8; 
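+ // NOTE (editorial): per block of 8 output columns, w is walked in place: 8 int32 column sums (consumed above), kc*8 packed int8 weights (the k loop below), then 8 float filter scales and 8 float biases.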
+ + size_t k = kc; + do { + const v128_t va0 = wasm_v128_xor(wasm_v128_load(a0), vsign_mask); + a0 += 16; + const v128_t va1 = wasm_v128_xor(wasm_v128_load(a1), vsign_mask); + a1 += 16; + const v128_t va2 = wasm_v128_xor(wasm_v128_load(a2), vsign_mask); + a2 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + vacc1x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va1, vacc1x0); + vacc2x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va2, vacc2x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16); + + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + vacc1x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va1, vacc1x1); + vacc2x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va2, vacc2x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + vacc1x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va1, vacc1x2); + vacc2x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va2, vacc2x2); + const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + vacc1x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va1, vacc1x3); + vacc2x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va2, vacc2x3); + const v128_t vb4 = wasm_v128_load((const int8_t*) w + 64); + + vacc0x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va0, vacc0x4); + vacc1x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va1, vacc1x4); + vacc2x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va2, vacc2x4); + const v128_t vb5 = wasm_v128_load((const int8_t*) w + 80); + + vacc0x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va0, vacc0x5); + vacc1x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va1, vacc1x5); + vacc2x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va2, vacc2x5); + const v128_t vb6 = wasm_v128_load((const int8_t*) w + 96); + + vacc0x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va0, vacc0x6); + vacc1x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va1, vacc1x6); + vacc2x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va2, vacc2x6); + const v128_t vb7 = wasm_v128_load((const int8_t*) w + 112); + + vacc0x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va0, vacc0x7); + vacc1x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va1, vacc1x7); + vacc2x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va2, vacc2x7); + + w = (const int8_t*) w + 128; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + const v128_t vacc0x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x4, vacc0x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x4, vacc0x6, 2, 6, 3, 7)); + const v128_t vacc0x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x5, vacc0x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x5, vacc0x7, 2, 6, 3, 7)); + const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7)); + const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7)); + const v128_t vacc1x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x4, vacc1x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x4, vacc1x6, 2, 6, 3, 7)); + const v128_t 
vacc1x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x5, vacc1x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x5, vacc1x7, 2, 6, 3, 7)); + const v128_t vacc2x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, vacc2x2, 2, 6, 3, 7)); + const v128_t vacc2x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x1, vacc2x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x1, vacc2x3, 2, 6, 3, 7)); + const v128_t vacc2x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x4, vacc2x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x4, vacc2x6, 2, 6, 3, 7)); + const v128_t vacc2x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x5, vacc2x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x5, vacc2x7, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + v128_t vacc0x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x46, vacc0x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x46, vacc0x57, 2, 6, 3, 7)); + v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); + v128_t vacc1x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x46, vacc1x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x46, vacc1x57, 2, 6, 3, 7)); + v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7)); + v128_t vacc2x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x46, vacc2x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x46, vacc2x57, 2, 6, 3, 7)); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale); + const v128_t vinput_scale2 = wasm_v128_load32_splat(&quantization_params[2].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vinput_scale1); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale2); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vinput_scale2); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vfilter_output_scale4567); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vfilter_output_scale4567); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vbias4567); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123); + vacc2x4567 = wasm_f32x4_add(vacc2x4567, 
vbias4567); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin); + vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin); + vacc1x4567 = wasm_f32x4_pmax(vacc1x4567, vmin); + vacc2x0123 = wasm_f32x4_pmax(vacc2x0123, vmin); + vacc2x4567 = wasm_f32x4_pmax(vacc2x4567, vmin); + + const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax); + vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax); + vacc1x4567 = wasm_f32x4_pmin(vacc1x4567, vmax); + vacc2x0123 = wasm_f32x4_pmin(vacc2x0123, vmax); + vacc2x4567 = wasm_f32x4_pmin(vacc2x4567, vmax); + + if XNN_LIKELY(nc >= 8) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c0 + 4, vacc0x4567); + wasm_v128_store(c1, vacc1x0123); + wasm_v128_store(c1 + 4, vacc1x4567); + wasm_v128_store(c2, vacc2x0123); + wasm_v128_store(c2 + 4, vacc2x4567); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + + nc -= 8; + } else { + if (nc & 4) { + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x4567; + c0 += 4; + wasm_v128_store(c1, vacc1x0123); + vacc1x0123 = vacc1x4567; + c1 += 4; + wasm_v128_store(c2, vacc2x0123); + vacc2x0123 = vacc2x4567; + c2 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + wasm_v128_store64_lane(c1, vacc1x0123, 0); + vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); + c1 += 2; + wasm_v128_store64_lane(c2, vacc2x0123, 0); + vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); + c2 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + wasm_v128_store32_lane(c1, vacc1x0123, 0); + wasm_v128_store32_lane(c2, vacc2x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c index b49de215ebf..994e89ba0b5 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c @@ -155,18 +155,18 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot( vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vinput_scale3); const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123); vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vfilter_output_scale0123); - w = (const float*) w + 4; const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123); vacc3x0123 = wasm_f32x4_add(vacc3x0123, vbias0123); - w = (const float*) w + 4; const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c 
b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c new file mode 100644 index 00000000000..c1d82ea511d --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c @@ -0,0 +1,225 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <wasm_simd128.h> + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); + float* c3 = (float*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + a3 = a2; + c3 = c2; + } + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + const v128_t vinput_zero_point1 = wasm_i32x4_splat((int32_t) quantization_params[1].zero_point + 128); + const v128_t vinput_zero_point2 = wasm_i32x4_splat((int32_t) quantization_params[2].zero_point + 128); + const v128_t vinput_zero_point3 = wasm_i32x4_splat((int32_t) quantization_params[3].zero_point + 128); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc1x0 = wasm_i32x4_mul(vksum0, vinput_zero_point1); + v128_t vacc1x1 = wasm_i32x4_mul(vksum1, vinput_zero_point1); + v128_t vacc1x2 = wasm_i32x4_mul(vksum2, vinput_zero_point1); + v128_t vacc1x3 = wasm_i32x4_mul(vksum3, vinput_zero_point1); + v128_t vacc2x0 = wasm_i32x4_mul(vksum0, vinput_zero_point2); + v128_t vacc2x1 = wasm_i32x4_mul(vksum1, vinput_zero_point2); + v128_t vacc2x2 = wasm_i32x4_mul(vksum2, vinput_zero_point2); + v128_t vacc2x3 = wasm_i32x4_mul(vksum3, vinput_zero_point2); + v128_t vacc3x0 = wasm_i32x4_mul(vksum0, vinput_zero_point3); + v128_t vacc3x1 = wasm_i32x4_mul(vksum1, vinput_zero_point3); + v128_t vacc3x2 = wasm_i32x4_mul(vksum2, vinput_zero_point3); + v128_t vacc3x3 = wasm_i32x4_mul(vksum3, 
vinput_zero_point3); + w = (const int32_t*) w + 4; + + size_t k = kc; + do { + const v128_t va0 = wasm_v128_xor(wasm_v128_load(a0), vsign_mask); + a0 += 16; + const v128_t va1 = wasm_v128_xor(wasm_v128_load(a1), vsign_mask); + a1 += 16; + const v128_t va2 = wasm_v128_xor(wasm_v128_load(a2), vsign_mask); + a2 += 16; + const v128_t va3 = wasm_v128_xor(wasm_v128_load(a3), vsign_mask); + a3 += 16; + + const v128_t vb0 = wasm_v128_load(w); + + vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0); + vacc1x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va1, vacc1x0); + vacc2x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va2, vacc2x0); + vacc3x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va3, vacc3x0); + const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16); + + vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1); + vacc1x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va1, vacc1x1); + vacc2x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va2, vacc2x1); + vacc3x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va3, vacc3x1); + const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32); + + vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2); + vacc1x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va1, vacc1x2); + vacc2x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va2, vacc2x2); + vacc3x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va3, vacc3x2); + const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48); + + vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3); + vacc1x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va1, vacc1x3); + vacc2x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va2, vacc2x3); + vacc3x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va3, vacc3x3); + + w = (const int8_t*) w + 64; + k -= 16 * sizeof(int8_t); + } while (k != 0); + + const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7)); + const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7)); + const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7)); + const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7)); + const v128_t vacc2x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, vacc2x2, 2, 6, 3, 7)); + const v128_t vacc2x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x1, vacc2x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x1, vacc2x3, 2, 6, 3, 7)); + const v128_t vacc3x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x0, vacc3x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x0, vacc3x2, 2, 6, 3, 7)); + const v128_t vacc3x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x1, vacc3x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x1, vacc3x3, 2, 6, 3, 7)); + + v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7)); + v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7)); + v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7)); + v128_t vacc3x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x02, vacc3x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x02, vacc3x13, 2, 6, 3, 7)); 
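+ // NOTE (editorial): the shuffle/add pairs above perform a 4x4 transpose-and-add, reducing the four per-column int32x4 partial sums of each row to one vector whose lane j holds the dot product for output column j.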
+ + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale); + const v128_t vinput_scale2 = wasm_v128_load32_splat(&quantization_params[2].inv_scale); + const v128_t vinput_scale3 = wasm_v128_load32_splat(&quantization_params[3].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale2); + vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vinput_scale3); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123); + vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vfilter_output_scale0123); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123); + vacc3x0123 = wasm_f32x4_add(vacc3x0123, vbias0123); + + const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min); + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin); + vacc2x0123 = wasm_f32x4_pmax(vacc2x0123, vmin); + vacc3x0123 = wasm_f32x4_pmax(vacc3x0123, vmin); + + const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max); + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax); + vacc2x0123 = wasm_f32x4_pmin(vacc2x0123, vmax); + vacc3x0123 = wasm_f32x4_pmin(vacc3x0123, vmax); + + if XNN_LIKELY(nc >= 4) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c1, vacc1x0123); + wasm_v128_store(c2, vacc2x0123); + wasm_v128_store(c3, vacc3x0123); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + c3 = (float*) ((uintptr_t) c3 + cn_stride); + + nc -= 4; + } else { + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + wasm_v128_store64_lane(c1, vacc1x0123, 0); + vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); + c1 += 2; + wasm_v128_store64_lane(c2, vacc2x0123, 0); + vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); + c2 += 2; + wasm_v128_store64_lane(c3, vacc3x0123, 0); + vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); + c3 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + wasm_v128_store32_lane(c1, vacc1x0123, 0); + wasm_v128_store32_lane(c2, vacc2x0123, 0); + wasm_v128_store32_lane(c3, vacc3x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c new file mode 100644 index 00000000000..8c7b6e29d6c --- /dev/null +++ 
b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c @@ -0,0 +1,326 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <wasm_simd128.h> + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 16 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); + float* c3 = (float*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + a3 = a2; + c3 = c2; + } + + do { + v128_t vksum0 = wasm_v128_load32_zero(w); + v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1); + v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2); + v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3); + v128_t vksum4 = wasm_v128_load32_zero((const int32_t*) w + 4); + v128_t vksum5 = wasm_v128_load32_zero((const int32_t*) w + 5); + v128_t vksum6 = wasm_v128_load32_zero((const int32_t*) w + 6); + v128_t vksum7 = wasm_v128_load32_zero((const int32_t*) w + 7); + const v128_t vinput_zero_point0 = wasm_v128_load32_splat(&quantization_params[0].zero_point); + const v128_t vinput_zero_point1 = wasm_v128_load32_splat(&quantization_params[1].zero_point); + const v128_t vinput_zero_point2 = wasm_v128_load32_splat(&quantization_params[2].zero_point); + const v128_t vinput_zero_point3 = wasm_v128_load32_splat(&quantization_params[3].zero_point); + v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0); + v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0); + v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0); + v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0); + v128_t vacc0x4 = wasm_i32x4_mul(vksum4, vinput_zero_point0); + v128_t vacc0x5 = wasm_i32x4_mul(vksum5, vinput_zero_point0); + v128_t vacc0x6 = wasm_i32x4_mul(vksum6, vinput_zero_point0); + v128_t vacc0x7 = wasm_i32x4_mul(vksum7, vinput_zero_point0); + v128_t vacc1x0 = wasm_i32x4_mul(vksum0, vinput_zero_point1); + v128_t vacc1x1 = wasm_i32x4_mul(vksum1, vinput_zero_point1); + v128_t vacc1x2 = wasm_i32x4_mul(vksum2, vinput_zero_point1); + v128_t vacc1x3 = wasm_i32x4_mul(vksum3, vinput_zero_point1); + v128_t vacc1x4 = wasm_i32x4_mul(vksum4, vinput_zero_point1); + v128_t vacc1x5 = wasm_i32x4_mul(vksum5, vinput_zero_point1); + v128_t vacc1x6 = wasm_i32x4_mul(vksum6, vinput_zero_point1); + v128_t vacc1x7 = wasm_i32x4_mul(vksum7, 
vinput_zero_point1);
+    v128_t vacc2x0 = wasm_i32x4_mul(vksum0, vinput_zero_point2);
+    v128_t vacc2x1 = wasm_i32x4_mul(vksum1, vinput_zero_point2);
+    v128_t vacc2x2 = wasm_i32x4_mul(vksum2, vinput_zero_point2);
+    v128_t vacc2x3 = wasm_i32x4_mul(vksum3, vinput_zero_point2);
+    v128_t vacc2x4 = wasm_i32x4_mul(vksum4, vinput_zero_point2);
+    v128_t vacc2x5 = wasm_i32x4_mul(vksum5, vinput_zero_point2);
+    v128_t vacc2x6 = wasm_i32x4_mul(vksum6, vinput_zero_point2);
+    v128_t vacc2x7 = wasm_i32x4_mul(vksum7, vinput_zero_point2);
+    v128_t vacc3x0 = wasm_i32x4_mul(vksum0, vinput_zero_point3);
+    v128_t vacc3x1 = wasm_i32x4_mul(vksum1, vinput_zero_point3);
+    v128_t vacc3x2 = wasm_i32x4_mul(vksum2, vinput_zero_point3);
+    v128_t vacc3x3 = wasm_i32x4_mul(vksum3, vinput_zero_point3);
+    v128_t vacc3x4 = wasm_i32x4_mul(vksum4, vinput_zero_point3);
+    v128_t vacc3x5 = wasm_i32x4_mul(vksum5, vinput_zero_point3);
+    v128_t vacc3x6 = wasm_i32x4_mul(vksum6, vinput_zero_point3);
+    v128_t vacc3x7 = wasm_i32x4_mul(vksum7, vinput_zero_point3);
+    w = (const int32_t*) w + 8;
+
+    size_t k = kc;
+    do {
+      const v128_t va0 = wasm_v128_load(a0);
+      a0 += 16;
+      const v128_t va1 = wasm_v128_load(a1);
+      a1 += 16;
+      const v128_t va2 = wasm_v128_load(a2);
+      a2 += 16;
+      const v128_t va3 = wasm_v128_load(a3);
+      a3 += 16;
+
+      const v128_t vb0 = wasm_v128_load(w);
+
+      vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0);
+      vacc1x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va1, vacc1x0);
+      vacc2x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va2, vacc2x0);
+      vacc3x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va3, vacc3x0);
+      const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16);
+
+      vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1);
+      vacc1x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va1, vacc1x1);
+      vacc2x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va2, vacc2x1);
+      vacc3x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va3, vacc3x1);
+      const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32);
+
+      vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2);
+      vacc1x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va1, vacc1x2);
+      vacc2x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va2, vacc2x2);
+      vacc3x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va3, vacc3x2);
+      const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48);
+
+      vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3);
+      vacc1x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va1, vacc1x3);
+      vacc2x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va2, vacc2x3);
+      vacc3x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va3, vacc3x3);
+      const v128_t vb4 = wasm_v128_load((const int8_t*) w + 64);
+
+      vacc0x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va0, vacc0x4);
+      vacc1x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va1, vacc1x4);
+      vacc2x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va2, vacc2x4);
+      vacc3x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va3, vacc3x4);
+      const v128_t vb5 = wasm_v128_load((const int8_t*) w + 80);
+
+      vacc0x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va0, vacc0x5);
+      vacc1x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va1, vacc1x5);
+      vacc2x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va2, vacc2x5);
+      vacc3x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va3, vacc3x5);
+      const v128_t vb6 = wasm_v128_load((const int8_t*) w + 96);
+
+      vacc0x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va0, vacc0x6);
+      vacc1x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va1, vacc1x6);
+      vacc2x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va2, vacc2x6);
+      vacc3x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va3, vacc3x6);
+      const v128_t vb7 = wasm_v128_load((const int8_t*) w + 112);
+
+      vacc0x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va0, vacc0x7);
+      vacc1x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va1, vacc1x7);
+      vacc2x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va2, vacc2x7);
+      vacc3x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va3, vacc3x7);
+
+      w = (const int8_t*) w + 128;
+      k -= 16 * sizeof(int8_t);
+    } while (k != 0);
+
+    const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7));
+    const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7));
+    const v128_t vacc0x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x4, vacc0x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x4, vacc0x6, 2, 6, 3, 7));
+    const v128_t vacc0x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x5, vacc0x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x5, vacc0x7, 2, 6, 3, 7));
+    const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7));
+    const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7));
+    const v128_t vacc1x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x4, vacc1x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x4, vacc1x6, 2, 6, 3, 7));
+    const v128_t vacc1x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x5, vacc1x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x5, vacc1x7, 2, 6, 3, 7));
+    const v128_t vacc2x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, vacc2x2, 2, 6, 3, 7));
+    const v128_t vacc2x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x1, vacc2x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x1, vacc2x3, 2, 6, 3, 7));
+    const v128_t vacc2x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x4, vacc2x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x4, vacc2x6, 2, 6, 3, 7));
+    const v128_t vacc2x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x5, vacc2x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x5, vacc2x7, 2, 6, 3, 7));
+    const v128_t vacc3x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x0, vacc3x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x0, vacc3x2, 2, 6, 3, 7));
+    const v128_t vacc3x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x1, vacc3x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x1, vacc3x3, 2, 6, 3, 7));
+    const v128_t vacc3x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x4, vacc3x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x4, vacc3x6, 2, 6, 3, 7));
+    const v128_t vacc3x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x5, vacc3x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x5, vacc3x7, 2, 6, 3, 7));
+
+    v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7));
+    v128_t vacc0x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x46, vacc0x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x46, vacc0x57, 2, 6, 3, 7));
+    v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7));
+    v128_t vacc1x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x46, vacc1x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x46, vacc1x57, 2, 6, 3, 7));
+    v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7));
+    v128_t vacc2x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x46, vacc2x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x46, vacc2x57, 2, 6, 3, 7));
+    v128_t vacc3x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x02, vacc3x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x02, vacc3x13, 2, 6, 3, 7));
+    v128_t vacc3x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x46, vacc3x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x46, vacc3x57, 2, 6, 3, 7));
+
+    vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123);
+    vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567);
+    vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123);
+    vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567);
+    vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123);
+    vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567);
+    vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123);
+    vacc3x4567 = wasm_f32x4_convert_i32x4(vacc3x4567);
+
+    const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale);
+    const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale);
+    const v128_t vinput_scale2 = wasm_v128_load32_splat(&quantization_params[2].inv_scale);
+    const v128_t vinput_scale3 = wasm_v128_load32_splat(&quantization_params[3].inv_scale);
+
+    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0);
+    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0);
+    vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1);
+    vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vinput_scale1);
+    vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale2);
+    vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vinput_scale2);
+    vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vinput_scale3);
+    vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vinput_scale3);
+
+    const v128_t vfilter_output_scale0123 = wasm_v128_load(w);
+    w = (const float*) w + 4;
+    const v128_t vfilter_output_scale4567 = wasm_v128_load(w);
+    w = (const float*) w + 4;
+    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123);
+    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567);
+    vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123);
+    vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vfilter_output_scale4567);
+    vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123);
+    vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vfilter_output_scale4567);
+    vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vfilter_output_scale0123);
+    vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vfilter_output_scale4567);
+
+    const v128_t vbias0123 = wasm_v128_load(w);
+    w = (const float*) w + 4;
+    const v128_t vbias4567 = wasm_v128_load(w);
+    w = (const float*) w + 4;
+    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123);
+    vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567);
+    vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123);
+    vacc1x4567 = wasm_f32x4_add(vacc1x4567, vbias4567);
+    vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123);
+    vacc2x4567 = wasm_f32x4_add(vacc2x4567, vbias4567);
+    vacc3x0123 = wasm_f32x4_add(vacc3x0123, vbias0123);
+    vacc3x4567 = wasm_f32x4_add(vacc3x4567, vbias4567);
+
+    const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
+    vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin);
+    vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin);
+    vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin);
+    vacc1x4567 = wasm_f32x4_pmax(vacc1x4567, vmin);
+    vacc2x0123 = wasm_f32x4_pmax(vacc2x0123, vmin);
+    vacc2x4567 = wasm_f32x4_pmax(vacc2x4567, vmin);
+    vacc3x0123 = wasm_f32x4_pmax(vacc3x0123, vmin);
+    vacc3x4567 = wasm_f32x4_pmax(vacc3x4567, vmin);
+
+    const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
+    vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax);
+    vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax);
+    vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax);
+    vacc1x4567 = wasm_f32x4_pmin(vacc1x4567, vmax);
+    vacc2x0123 = wasm_f32x4_pmin(vacc2x0123, vmax);
+    vacc2x4567 = wasm_f32x4_pmin(vacc2x4567, vmax);
+    vacc3x0123 = wasm_f32x4_pmin(vacc3x0123, vmax);
+    vacc3x4567 = wasm_f32x4_pmin(vacc3x4567, vmax);
+
+    if XNN_LIKELY(nc >= 8) {
+      wasm_v128_store(c0, vacc0x0123);
+      wasm_v128_store(c0 + 4, vacc0x4567);
+      wasm_v128_store(c1, vacc1x0123);
+      wasm_v128_store(c1 + 4, vacc1x4567);
+      wasm_v128_store(c2, vacc2x0123);
+      wasm_v128_store(c2 + 4, vacc2x4567);
+      wasm_v128_store(c3, vacc3x0123);
+      wasm_v128_store(c3 + 4, vacc3x4567);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        wasm_v128_store(c0, vacc0x0123);
+        vacc0x0123 = vacc0x4567;
+        c0 += 4;
+        wasm_v128_store(c1, vacc1x0123);
+        vacc1x0123 = vacc1x4567;
+        c1 += 4;
+        wasm_v128_store(c2, vacc2x0123);
+        vacc2x0123 = vacc2x4567;
+        c2 += 4;
+        wasm_v128_store(c3, vacc3x0123);
+        vacc3x0123 = vacc3x4567;
+        c3 += 4;
+      }
+      if (nc & 2) {
+        wasm_v128_store64_lane(c0, vacc0x0123, 0);
+        vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1);
+        c0 += 2;
+        wasm_v128_store64_lane(c1, vacc1x0123, 0);
+        vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1);
+        c1 += 2;
+        wasm_v128_store64_lane(c2, vacc2x0123, 0);
+        vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1);
+        c2 += 2;
+        wasm_v128_store64_lane(c3, vacc3x0123, 0);
+        vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1);
+        c3 += 2;
+      }
+      if (nc & 1) {
+        wasm_v128_store32_lane(c0, vacc0x0123, 0);
+        wasm_v128_store32_lane(c1, vacc1x0123, 0);
+        wasm_v128_store32_lane(c2, vacc2x0123, 0);
+        wasm_v128_store32_lane(c3, vacc3x0123, 0);
+      }
+      nc = 0;
+    }
+  } while (nc != 0);
+}
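The wasmsdot kernel above feeds signed weights and signed activations straight into wasm_i32x4_relaxed_dot_i8x16_i7x16_add: each call folds four adjacent int8 products into every i32 lane, so eight calls cover a full 16-deep K block for eight output columns. A scalar model of what one call contributes to a single accumulator lane, assuming the signed interpretation of the relaxed dot product that the runtime SDOT check guarantees (illustrative names, not kernel code):

#include <stdint.h>

// One i32 lane of relaxed_dot(vb, va, vacc) under signed semantics:
// four adjacent int8 products, summed and accumulated.
static int32_t dot4_add(const int8_t vb[4], const int8_t va[4], int32_t vacc) {
  for (int j = 0; j < 4; j++) {
    vacc += (int32_t) vb[j] * (int32_t) va[j];
  }
  return vacc;
}

Because the relaxed specification leaves the sign treatment of the second operand to the engine, this kernel is only dispatched where the SDOT behavior is confirmed at runtime.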
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c
new file mode 100644
index 00000000000..f3c7df9cb8d
--- /dev/null
+++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c
@@ -0,0 +1,327 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx4c16-wasmdot.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2023 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include "xnnpack/gemm.h"
+
+#include "xnnpack/math.h"
+
+void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    float* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
+    const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  kc = round_up_po2(kc, 16 * sizeof(int8_t));
+  const int8_t* a0 = a;
+  float* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80));
+  do {
+    v128_t vksum0 = wasm_v128_load32_zero(w);
+    v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1);
+    v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2);
+    v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3);
+    v128_t vksum4 = wasm_v128_load32_zero((const int32_t*) w + 4);
+    v128_t vksum5 = wasm_v128_load32_zero((const int32_t*) w + 5);
+    v128_t vksum6 = wasm_v128_load32_zero((const int32_t*) w + 6);
+    v128_t vksum7 = wasm_v128_load32_zero((const int32_t*) w + 7);
+    const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128);
+    const v128_t vinput_zero_point1 = wasm_i32x4_splat((int32_t) quantization_params[1].zero_point + 128);
+    const v128_t vinput_zero_point2 = wasm_i32x4_splat((int32_t) quantization_params[2].zero_point + 128);
+    const v128_t vinput_zero_point3 = wasm_i32x4_splat((int32_t) quantization_params[3].zero_point + 128);
+    v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0);
+    v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0);
+    v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0);
+    v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0);
+    v128_t vacc0x4 = wasm_i32x4_mul(vksum4, vinput_zero_point0);
+    v128_t vacc0x5 = wasm_i32x4_mul(vksum5, vinput_zero_point0);
+    v128_t vacc0x6 = wasm_i32x4_mul(vksum6, vinput_zero_point0);
+    v128_t vacc0x7 = wasm_i32x4_mul(vksum7, vinput_zero_point0);
+    v128_t vacc1x0 = wasm_i32x4_mul(vksum0, vinput_zero_point1);
+    v128_t vacc1x1 = wasm_i32x4_mul(vksum1, vinput_zero_point1);
+    v128_t vacc1x2 = wasm_i32x4_mul(vksum2, vinput_zero_point1);
+    v128_t vacc1x3 = wasm_i32x4_mul(vksum3, vinput_zero_point1);
+    v128_t vacc1x4 = wasm_i32x4_mul(vksum4, vinput_zero_point1);
+    v128_t vacc1x5 = wasm_i32x4_mul(vksum5, vinput_zero_point1);
+    v128_t vacc1x6 = wasm_i32x4_mul(vksum6, vinput_zero_point1);
+    v128_t vacc1x7 = wasm_i32x4_mul(vksum7, vinput_zero_point1);
+    v128_t vacc2x0 = wasm_i32x4_mul(vksum0, vinput_zero_point2);
+    v128_t vacc2x1 = wasm_i32x4_mul(vksum1, vinput_zero_point2);
+    v128_t vacc2x2 = wasm_i32x4_mul(vksum2, vinput_zero_point2);
+    v128_t vacc2x3 = wasm_i32x4_mul(vksum3, vinput_zero_point2);
+    v128_t vacc2x4 = wasm_i32x4_mul(vksum4, vinput_zero_point2);
+    v128_t vacc2x5 = wasm_i32x4_mul(vksum5, vinput_zero_point2);
+    v128_t vacc2x6 = wasm_i32x4_mul(vksum6, vinput_zero_point2);
+    v128_t vacc2x7 = wasm_i32x4_mul(vksum7, vinput_zero_point2);
+    v128_t vacc3x0 = wasm_i32x4_mul(vksum0, vinput_zero_point3);
+    v128_t vacc3x1 = wasm_i32x4_mul(vksum1, vinput_zero_point3);
+    v128_t vacc3x2 = wasm_i32x4_mul(vksum2, vinput_zero_point3);
+    v128_t vacc3x3 = wasm_i32x4_mul(vksum3, vinput_zero_point3);
+    v128_t vacc3x4 = wasm_i32x4_mul(vksum4, vinput_zero_point3);
+    v128_t vacc3x5 = wasm_i32x4_mul(vksum5, vinput_zero_point3);
+    v128_t vacc3x6 = wasm_i32x4_mul(vksum6, vinput_zero_point3);
+    v128_t vacc3x7 = wasm_i32x4_mul(vksum7, vinput_zero_point3);
+    w = (const int32_t*) w + 8;
+
+    size_t k = kc;
+    do {
+      const v128_t va0 = wasm_v128_xor(wasm_v128_load(a0), vsign_mask);
+      a0 += 16;
+      const v128_t va1 = wasm_v128_xor(wasm_v128_load(a1), vsign_mask);
+      a1 += 16;
+      const v128_t va2 = wasm_v128_xor(wasm_v128_load(a2), vsign_mask);
+      a2 += 16;
+      const v128_t va3 = wasm_v128_xor(wasm_v128_load(a3), vsign_mask);
+      a3 += 16;
+
+      const v128_t vb0 = wasm_v128_load(w);
+
+      vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0);
+      vacc1x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va1, vacc1x0);
+      vacc2x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va2, vacc2x0);
+      vacc3x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va3, vacc3x0);
+      const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16);
+
+      vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1);
+      vacc1x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va1, vacc1x1);
+      vacc2x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va2, vacc2x1);
+      vacc3x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va3, vacc3x1);
+      const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32);
+
+      vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2);
+      vacc1x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va1, vacc1x2);
+      vacc2x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va2, vacc2x2);
+      vacc3x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va3, vacc3x2);
+      const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48);
+
+      vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3);
+      vacc1x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va1, vacc1x3);
+      vacc2x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va2, vacc2x3);
+      vacc3x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va3, vacc3x3);
+      const v128_t vb4 = wasm_v128_load((const int8_t*) w + 64);
+
+      vacc0x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va0, vacc0x4);
+      vacc1x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va1, vacc1x4);
+      vacc2x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va2, vacc2x4);
+      vacc3x4 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4, va3, vacc3x4);
+      const v128_t vb5 = wasm_v128_load((const int8_t*) w + 80);
+
+      vacc0x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va0, vacc0x5);
+      vacc1x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va1, vacc1x5);
+      vacc2x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va2, vacc2x5);
+      vacc3x5 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb5, va3, vacc3x5);
+      const v128_t vb6 = wasm_v128_load((const int8_t*) w + 96);
+
+      vacc0x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va0, vacc0x6);
+      vacc1x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va1, vacc1x6);
+      vacc2x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va2, vacc2x6);
+      vacc3x6 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb6, va3, vacc3x6);
+      const v128_t vb7 = wasm_v128_load((const int8_t*) w + 112);
+
+      vacc0x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va0, vacc0x7);
+      vacc1x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va1, vacc1x7);
+      vacc2x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va2, vacc2x7);
+      vacc3x7 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb7, va3, vacc3x7);
+
+      w = (const int8_t*) w + 128;
+      k -= 16 * sizeof(int8_t);
+    } while (k != 0);
+
+    const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7));
+    const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7));
+    const v128_t vacc0x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x4, vacc0x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x4, vacc0x6, 2, 6, 3, 7));
+    const v128_t vacc0x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x5, vacc0x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x5, vacc0x7, 2, 6, 3, 7));
+    const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7));
+    const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7));
+    const v128_t vacc1x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x4, vacc1x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x4, vacc1x6, 2, 6, 3, 7));
+    const v128_t vacc1x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x5, vacc1x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x5, vacc1x7, 2, 6, 3, 7));
+    const v128_t vacc2x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, vacc2x2, 2, 6, 3, 7));
+    const v128_t vacc2x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x1, vacc2x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x1, vacc2x3, 2, 6, 3, 7));
+    const v128_t vacc2x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x4, vacc2x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x4, vacc2x6, 2, 6, 3, 7));
+    const v128_t vacc2x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x5, vacc2x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x5, vacc2x7, 2, 6, 3, 7));
+    const v128_t vacc3x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x0, vacc3x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x0, vacc3x2, 2, 6, 3, 7));
+    const v128_t vacc3x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x1, vacc3x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x1, vacc3x3, 2, 6, 3, 7));
+    const v128_t vacc3x46 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x4, vacc3x6, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x4, vacc3x6, 2, 6, 3, 7));
+    const v128_t vacc3x57 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x5, vacc3x7, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x5, vacc3x7, 2, 6, 3, 7));
+
+    v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7));
+    v128_t vacc0x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x46, vacc0x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x46, vacc0x57, 2, 6, 3, 7));
+    v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7));
+    v128_t vacc1x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x46, vacc1x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x46, vacc1x57, 2, 6, 3, 7));
+    v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7));
+    v128_t vacc2x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x46, vacc2x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x46, vacc2x57, 2, 6, 3, 7));
+    v128_t vacc3x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x02, vacc3x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x02, vacc3x13, 2, 6, 3, 7));
+    v128_t vacc3x4567 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc3x46, vacc3x57, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc3x46, vacc3x57, 2, 6, 3, 7));
+
+    vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123);
+    vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567);
+    vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123);
+    vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567);
+    vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123);
+    vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567);
+    vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123);
+    vacc3x4567 = wasm_f32x4_convert_i32x4(vacc3x4567);
+
+    const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale);
+    const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale);
+    const v128_t vinput_scale2 = wasm_v128_load32_splat(&quantization_params[2].inv_scale);
+    const v128_t vinput_scale3 = wasm_v128_load32_splat(&quantization_params[3].inv_scale);
+
+    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0);
+    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0);
+    vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1);
+    vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vinput_scale1);
+    vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale2);
+    vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vinput_scale2);
+    vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vinput_scale3);
+    vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vinput_scale3);
+
+    const v128_t vfilter_output_scale0123 = wasm_v128_load(w);
+    w = (const float*) w + 4;
+    const v128_t vfilter_output_scale4567 = wasm_v128_load(w);
+    w = (const float*) w + 4;
+    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123);
+    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567);
+    vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123);
+    vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vfilter_output_scale4567);
+    vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123);
+    vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vfilter_output_scale4567);
+    vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vfilter_output_scale0123);
+    vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vfilter_output_scale4567);
+
+    const v128_t vbias0123 = wasm_v128_load(w);
+    w = (const float*) w + 4;
+    const v128_t vbias4567 = wasm_v128_load(w);
+    w = (const float*) w + 4;
+    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123);
+    vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567);
+    vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123);
+    vacc1x4567 = wasm_f32x4_add(vacc1x4567, vbias4567);
+    vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123);
+    vacc2x4567 = wasm_f32x4_add(vacc2x4567, vbias4567);
+    vacc3x0123 = wasm_f32x4_add(vacc3x0123, vbias0123);
+    vacc3x4567 = wasm_f32x4_add(vacc3x4567, vbias4567);
+
+    const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
+    vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin);
+    vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin);
+    vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin);
+    vacc1x4567 = wasm_f32x4_pmax(vacc1x4567, vmin);
+    vacc2x0123 = wasm_f32x4_pmax(vacc2x0123, vmin);
+    vacc2x4567 = wasm_f32x4_pmax(vacc2x4567, vmin);
+    vacc3x0123 = wasm_f32x4_pmax(vacc3x0123, vmin);
+    vacc3x4567 = wasm_f32x4_pmax(vacc3x4567, vmin);
+
+    const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
+    vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax);
+    vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax);
+    vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax);
+    vacc1x4567 = wasm_f32x4_pmin(vacc1x4567, vmax);
+    vacc2x0123 = wasm_f32x4_pmin(vacc2x0123, vmax);
+    vacc2x4567 = wasm_f32x4_pmin(vacc2x4567, vmax);
+    vacc3x0123 = wasm_f32x4_pmin(vacc3x0123, vmax);
+    vacc3x4567 = wasm_f32x4_pmin(vacc3x4567, vmax);
+
+    if XNN_LIKELY(nc >= 8) {
+      wasm_v128_store(c0, vacc0x0123);
+      wasm_v128_store(c0 + 4, vacc0x4567);
+      wasm_v128_store(c1, vacc1x0123);
+      wasm_v128_store(c1 + 4, vacc1x4567);
+      wasm_v128_store(c2, vacc2x0123);
+      wasm_v128_store(c2 + 4, vacc2x4567);
+      wasm_v128_store(c3, vacc3x0123);
+      wasm_v128_store(c3 + 4, vacc3x4567);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+      a3 = (const int8_t*) ((uintptr_t) a3 - kc);
+
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        wasm_v128_store(c0, vacc0x0123);
+        vacc0x0123 = vacc0x4567;
+        c0 += 4;
+        wasm_v128_store(c1, vacc1x0123);
+        vacc1x0123 = vacc1x4567;
+        c1 += 4;
+        wasm_v128_store(c2, vacc2x0123);
+        vacc2x0123 = vacc2x4567;
+        c2 += 4;
+        wasm_v128_store(c3, vacc3x0123);
+        vacc3x0123 = vacc3x4567;
+        c3 += 4;
+      }
+      if (nc & 2) {
+        wasm_v128_store64_lane(c0, vacc0x0123, 0);
+        vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1);
+        c0 += 2;
+        wasm_v128_store64_lane(c1, vacc1x0123, 0);
+        vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1);
+        c1 += 2;
+        wasm_v128_store64_lane(c2, vacc2x0123, 0);
+        vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1);
+        c2 += 2;
+        wasm_v128_store64_lane(c3, vacc3x0123, 0);
+        vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1);
+        c3 += 2;
+      }
+      if (nc & 1) {
+        wasm_v128_store32_lane(c0, vacc0x0123, 0);
+        wasm_v128_store32_lane(c1, vacc1x0123, 0);
+        wasm_v128_store32_lane(c2, vacc2x0123, 0);
+        wasm_v128_store32_lane(c3, vacc3x0123, 0);
+      }
+      nc = 0;
+    }
+  } while (nc != 0);
+}
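The wasmusdot kernel that just closed differs from the sdot variant in two coupled details: every activation byte is XORed with 0x80 before the dot products, and the input zero point is biased by +128 when the column sums are pre-scaled. The two changes cancel exactly, as this scalar sketch of the identity shows (illustrative names, not kernel code):

#include <assert.h>
#include <stdint.h>

// dot(w, a ^ 0x80) == dot(w, a) + 128 * sum(w): XORing a signed byte with
// 0x80 rebiases it to the unsigned value a + 128, and the surplus
// 128 * sum(w) term is what folding "+ 128" into the input zero point
// (via the precomputed ksum) compensates for.
static void check_rebias(const int8_t w[16], const int8_t a[16]) {
  int32_t dot_signed = 0;
  int32_t dot_rebias = 0;
  int32_t wsum = 0;
  for (int k = 0; k < 16; k++) {
    dot_signed += (int32_t) w[k] * (int32_t) a[k];
    dot_rebias += (int32_t) w[k] * (int32_t) (uint8_t) (a[k] ^ 0x80);
    wsum += w[k];
  }
  assert(dot_rebias == dot_signed + 128 * wsum);
}

Rebiasing lets engines whose relaxed dot treats the second operand as unsigned (the USDOT behavior) produce the same result as the signed path.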
diff --git a/src/qs8-gemm/MRx4c16-wasmdot.c.in b/src/qs8-gemm/MRx4c16-wasmdot.c.in
index ed197ef842b..54b36c18634 100644
--- a/src/qs8-gemm/MRx4c16-wasmdot.c.in
+++ b/src/qs8-gemm/MRx4c16-wasmdot.c.in
@@ -8,7 +8,7 @@
 $assert DATATYPE == "QD8" or REQUANTIZATION == "FP32"
 $assert DATATYPE != "QD8" or not REQUANTIZATION
 $assert MR <= 4
 $assert NR == 4 or NR == 8
-$assert (SDOT == 0 and DATATYPE == "QC8") or SDOT == 1
+$assert (SDOT == 0 and (DATATYPE in ["QC8", "QD8"])) or SDOT == 1
 #include <assert.h>
 #include <wasm_simd128.h>
@@ -72,13 +72,14 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x${NR}c
   const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80));
   do {
     $if DATATYPE == "QD8":
-      $assert NR != 8
-      $assert SDOT == 1
       v128_t vksum0 = wasm_v128_load32_zero(w);
       $for N in range(1, NR):
         v128_t vksum${N} = wasm_v128_load32_zero((const int32_t*) w + ${N});
       $for M in range(MR):
-        const v128_t vinput_zero_point${M} = wasm_v128_load32_splat(&quantization_params[${M}].zero_point);
+        $if SDOT:
+          const v128_t vinput_zero_point${M} = wasm_v128_load32_splat(&quantization_params[${M}].zero_point);
+        $else:
+          const v128_t vinput_zero_point${M} = wasm_i32x4_splat((int32_t) quantization_params[${M}].zero_point + 128);
       $for M in range(MR):
         $for N in range(NR):
          v128_t vacc${M}x${N} = wasm_i32x4_mul(vksum${N}, vinput_zero_point${M});
@@ -131,29 +132,38 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x${NR}c
       const v128_t vinput_scale${M} = wasm_v128_load32_splat(&quantization_params[${M}].inv_scale);
     $for M in range(MR):
-      vacc${M}x0123 = wasm_f32x4_mul(vacc${M}x0123, vinput_scale${M});
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = wasm_f32x4_mul(vacc${M}x${ABC[N:N+4]}, vinput_scale${M});
 
-    const v128_t vfilter_output_scale0123 = wasm_v128_load(w);
+    $for N in range(0, NR, 4):
+      const v128_t vfilter_output_scale${ABC[N:N+4]} = wasm_v128_load(w);
+      w = (const float*) w + 4;
     $for M in range(MR):
-      vacc${M}x0123 = wasm_f32x4_mul(vacc${M}x0123, vfilter_output_scale0123);
-    w = (const float*) w + 4;
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = wasm_f32x4_mul(vacc${M}x${ABC[N:N+4]}, vfilter_output_scale${ABC[N:N+4]});
 
-    const v128_t vbias0123 = wasm_v128_load(w);
+    $for N in range(0, NR, 4):
+      const v128_t vbias${ABC[N:N+4]} = wasm_v128_load(w);
+      w = (const float*) w + 4;
     $for M in range(MR):
-      vacc${M}x0123 = wasm_f32x4_add(vacc${M}x0123, vbias0123);
-    w = (const float*) w + 4;
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = wasm_f32x4_add(vacc${M}x${ABC[N:N+4]}, vbias${ABC[N:N+4]});
 
     const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
     $for M in range(MR):
-      vacc${M}x0123 = wasm_f32x4_pmax(vacc${M}x0123, vmin);
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = wasm_f32x4_pmax(vacc${M}x${ABC[N:N+4]}, vmin);
 
     const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
     $for M in range(MR):
-      vacc${M}x0123 = wasm_f32x4_pmin(vacc${M}x0123, vmax);
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = wasm_f32x4_pmin(vacc${M}x${ABC[N:N+4]}, vmax);
 
-    if XNN_LIKELY(nc >= 4) {
+    if XNN_LIKELY(nc >= ${NR}) {
       $for M in range(MR):
-        wasm_v128_store(c${M}, vacc${M}x0123);
+        wasm_v128_store(c${M}, vacc${M}x${ABC[0:4]});
+        $for N in range(4, NR, 4):
+          wasm_v128_store(c${M} + ${N}, vacc${M}x${ABC[N:N+4]});
 
       $for M in range(MR):
         a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
 
@@ -161,8 +171,15 @@ void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x${NR}c
       $for M in range(MR):
         c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
 
-      nc -= 4;
+      nc -= ${NR};
     } else {
+      $if NR == 8:
+        if (nc & 4) {
+          $for M in range(MR):
+            wasm_v128_store(c${M}, vacc${M}x0123);
+            vacc${M}x0123 = vacc${M}x4567;
+            c${M} += 4;
+        }
       if (nc & 2) {
         $for M in range(MR):
           wasm_v128_store64_lane(c${M}, vacc${M}x0123, 0);
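In the template change above, each vacc${M}x${N} vector holds four K-partial sums for one output column, which is why the generated kernels run the shuffle/add tree seen in the new files before the float conversion: it transposes and reduces eight such vectors into vacc${M}x0123 and vacc${M}x4567. Functionally it is a horizontal sum per column; a plain-C equivalent of the NR=8 case (a sketch, not the template's code):

#include <stdint.h>

// Reduce eight per-column accumulators of four lane-partials each into the
// eight column totals that vaccMx0123/vaccMx4567 carry after the shuffles.
static void reduce_columns(const int32_t acc[8][4], int32_t out[8]) {
  for (int n = 0; n < 8; n++) {
    out[n] = (acc[n][0] + acc[n][1]) + (acc[n][2] + acc[n][3]);
  }
}

The interleaving shuffles achieve the same result while keeping everything in two v128 registers per row instead of spilling to scalars.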
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 600576a1e2b..bc5bafcf2a3 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -2581,6 +2581,18 @@ DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_u
 DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot)
 DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot)
 DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot)
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot)
 
 DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x2__scalar)
 DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4__scalar)
diff --git a/test/qd8-f32-qc8w-gemm-minmax-2.cc b/test/qd8-f32-qc8w-gemm-minmax-2.cc
index b352845ff9b..191a9809484 100644
--- a/test/qd8-f32-qc8w-gemm-minmax-2.cc
+++ b/test/qd8-f32-qc8w-gemm-minmax-2.cc
@@ -1321,6 +1321,28 @@ std::vector<GemmTestParams> CreateTests1(
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
 
 
+#if XNN_ARCH_WASMRELAXEDSIMD
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_1X8C16__WASMSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/1, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_SDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+#endif  // XNN_ARCH_WASMRELAXEDSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
   INSTANTIATE_TEST_SUITE_P(
       QD8_F32_QC8W_GEMM_MINMAX_2X2__WASM, GemmTest,
diff --git a/test/qd8-f32-qc8w-gemm-minmax-3.cc b/test/qd8-f32-qc8w-gemm-minmax-3.cc
index e07156dcfa7..aec6ee5fba9 100644
--- a/test/qd8-f32-qc8w-gemm-minmax-3.cc
+++ b/test/qd8-f32-qc8w-gemm-minmax-3.cc
@@ -1220,6 +1220,25 @@ std::vector<GemmTestParams> CreateTests1(
         return info.param.test_name;
       });
 
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_3X8C16__WASMSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/3, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_SDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
   INSTANTIATE_TEST_SUITE_P(
       QD8_F32_QC8W_GEMM_MINMAX_4X4C16__WASMSDOT, GemmTest,
       testing::ValuesIn(CreateTests1(
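The new suites are fenced with TEST_REQUIRES_WASM_SDOT / TEST_REQUIRES_WASM_USDOT, mirroring the CheckWAsmSDOT / CheckWAsmUSDOT gates on the benchmarks, because relaxed i8x16 dot semantics are engine-dependent. One plausible shape for such a probe, shown here as a hypothetical sketch rather than the actual helper, is to dot a vector of ones against bytes of 0x80 and observe which interpretation the engine picked:

#include <stdbool.h>
#include <wasm_simd128.h>

// Hypothetical runtime probe: with bytes of 1 dotted against bytes of 0x80,
// an unsigned-second-operand (USDOT) engine yields 4 * 128 = 512 per i32
// lane, while a signed (SDOT) engine yields 4 * -128 = -512.
static bool engine_has_usdot(void) {
  const v128_t vone = wasm_i8x16_const_splat(1);
  const v128_t vhigh = wasm_u8x16_const_splat(0x80);
  const v128_t vzero = wasm_i32x4_const_splat(0);
  const v128_t vdot = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vone, vhigh, vzero);
  return wasm_i32x4_extract_lane(vdot, 0) == 512;
}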
diff --git a/test/qd8-f32-qc8w-gemm-minmax-4.cc b/test/qd8-f32-qc8w-gemm-minmax-4.cc
index c96fdbdf6d4..54700c93d49 100644
--- a/test/qd8-f32-qc8w-gemm-minmax-4.cc
+++ b/test/qd8-f32-qc8w-gemm-minmax-4.cc
@@ -1754,6 +1754,101 @@ std::vector<GemmTestParams> CreateTests1(
 
 #if XNN_ARCH_WASMRELAXEDSIMD
 
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_1X4C16__WASMUSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/1, /*nr=*/4, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_USDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_1X8C16__WASMUSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/1, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_USDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_2X4C16__WASMUSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/2, /*nr=*/4, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_USDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_2X8C16__WASMSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/2, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_SDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_2X8C16__WASMUSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/2, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_USDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
   INSTANTIATE_TEST_SUITE_P(
       QD8_F32_QC8W_GEMM_MINMAX_3X4C16__WASMSDOT, GemmTest,
       testing::ValuesIn(CreateTests1(
@@ -1772,6 +1867,63 @@ std::vector<GemmTestParams> CreateTests1(
       [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
         return info.param.test_name;
       });
+
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_4X4C16__WASMUSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/4, /*nr=*/4, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_USDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_4X8C16__WASMSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/4, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_SDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_4X8C16__WASMUSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/4, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_USDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
 
 
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
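Every suite above pins k_block to 16, and the yaml entries that follow pin k-block likewise, because the c16 kernels consume K in 16-byte dot-product groups: each kernel rounds kc up with round_up_po2(kc, 16 * sizeof(int8_t)) and relies on the packer to pad the weights to match. The rounding is the usual power-of-two idiom; a sketch of it (XNNPACK's own helper lives in xnnpack/math.h):

#include <stddef.h>

// Round n up to a multiple of q, where q is a power of two.
static inline size_t round_up_po2_sketch(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}
// e.g. round_up_po2_sketch(17, 16) == 32: a 17-deep K is processed as two
// full 16-byte groups, with the padded tail contributing zeros.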
diff --git a/test/qd8-f32-qc8w-gemm-minmax.cc b/test/qd8-f32-qc8w-gemm-minmax.cc
index 6b362ddd45c..321aba99afa 100644
--- a/test/qd8-f32-qc8w-gemm-minmax.cc
+++ b/test/qd8-f32-qc8w-gemm-minmax.cc
@@ -1308,6 +1308,44 @@ std::vector<GemmTestParams> CreateTests1(
       [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
         return info.param.test_name;
       });
+
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_3X4C16__WASMUSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/3, /*nr=*/4, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_USDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
+  INSTANTIATE_TEST_SUITE_P(
+      QD8_F32_QC8W_GEMM_MINMAX_3X8C16__WASMUSDOT, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/16,
+          /*adj_k_block=*/16,
+          /*mr=*/3, /*nr=*/8, /*kr=*/16, /*sr=*/1,
+          /*is_igemm=*/false,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot,
+                        xnn_init_f32_minmax_wasmsimd_params,
+                        xnn_pack_qs8_gemm_goi_w);
+          },
+          []() {
+            TEST_REQUIRES_WASM_USDOT;
+          })),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
 
 
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
diff --git a/test/qd8-f32-qc8w-gemm-minmax.yaml b/test/qd8-f32-qc8w-gemm-minmax.yaml
index b59d2be6167..ab034a42ef3 100644
--- a/test/qd8-f32-qc8w-gemm-minmax.yaml
+++ b/test/qd8-f32-qc8w-gemm-minmax.yaml
@@ -937,22 +937,70 @@
   k-block: 8
 
 # WAsm Relaxed SIMD
-- name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmsdot
-  init: xnn_init_f32_minmax_wasmsimd_params
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmsdot
   pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
   k-block: 16
-- name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot
-  init: xnn_init_f32_minmax_wasmsimd_params
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot
   pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
   k-block: 16
-- name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot
-  init: xnn_init_f32_minmax_wasmsimd_params
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot
   pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
   k-block: 16
-- name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot
-  init: xnn_init_f32_minmax_wasmsimd_params
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot
   pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
   k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot
+  pack: xnn_pack_qs8_gemm_goi_w
+- init: xnn_init_f32_minmax_wasmsimd_params
+  k-block: 16
+  name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot
+  pack: xnn_pack_qs8_gemm_goi_w
 
 # WAsm
 - name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x2__wasm