diff --git a/bench/qd8-f32-qc8w-gemm.cc b/bench/qd8-f32-qc8w-gemm.cc index 5d2aa8108bd..906d7057611 100644 --- a/bench/qd8-f32-qc8w-gemm.cc +++ b/bench/qd8-f32-qc8w-gemm.cc @@ -3060,6 +3060,50 @@ } BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__wasmsdot_u2) + + static void qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_gemm_goi_w, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot) + + static void qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot_u2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot_u2, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_gemm_goi_w, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot_u2) + + static void qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_gemm_goi_w, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot) + + static void qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot_u2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot_u2, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_gemm_goi_w, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot_u2) #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git 
a/bench/qs8-qc8w-gemm-fp32.cc b/bench/qs8-qc8w-gemm-fp32.cc index 6dbef2c674f..795f8393620 100644 --- a/bench/qs8-qc8w-gemm-fp32.cc +++ b/bench/qs8-qc8w-gemm-fp32.cc @@ -4183,6 +4183,204 @@ } BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + 
BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + 
xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmUSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + 
benchmark::utils::CheckWAsmSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot(benchmark::State& state, const char* net) { + GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot) + + static void qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2(benchmark::State& state, const char* net) { + 
GEMMBenchmark(state, + xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + benchmark::utils::CheckWAsmSDOT); + } + + BENCHMARK_GEMM(qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2) #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/cmake/gen/wasmrelaxedsimd_microkernels.cmake b/cmake/gen/wasmrelaxedsimd_microkernels.cmake index 74b739f2566..509ffa887b5 100644 --- a/cmake/gen/wasmrelaxedsimd_microkernels.cmake +++ b/cmake/gen/wasmrelaxedsimd_microkernels.cmake @@ -452,6 +452,8 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c8-minmax-wasmusdot.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c + src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot-u2.c + src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c8-minmax-wasmsdot-u2.c @@ -473,8 +475,12 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c8-minmax-wasmusdot.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c + src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-wasmusdot-u2.c + src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-wasmusdot.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x4c16-minmax-wasmsdot.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x8c8-minmax-wasmusdot.c + src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-wasmusdot-u2.c + src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-wasmusdot.c 
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x4c16-minmax-wasmsdot.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x8c8-minmax-wasmsdot-u2.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x8c8-minmax-wasmusdot-u2.c @@ -487,12 +493,20 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x4c16-minmax-wasmsdot.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c8-minmax-wasmsdot.c src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c8-minmax-wasmusdot.c + src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot-u2.c + src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c16-minmax-fp32-wasmusdot.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c16-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-wasmsdot-u2.c @@ -508,17 +522,35 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c16-minmax-fp32-wasmusdot.c + 
src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x4c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x4c16-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c16-minmax-fp32-wasmusdot.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2.c + src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x4c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x4c16-minmax-fp32-wasmusdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-wasmusdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c16-minmax-fp32-wasmusdot.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot-u2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2.c + 
src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x4c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x4c16-minmax-fp32-wasmusdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-wasmsdot-u2.c @@ -534,12 +566,24 @@ SET(NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-wasmusdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c16-minmax-fp32-wasmusdot.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4c16-minmax-fp32-wasmusdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c16-minmax-fp32-wasmsdot.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c16-minmax-fp32-wasmusdot.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c + src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u16.c 
src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u32-acc2.c diff --git a/gen/wasmrelaxedsimd_microkernels.bzl b/gen/wasmrelaxedsimd_microkernels.bzl index a2b10bd1ba5..e4a4a042d83 100644 --- a/gen/wasmrelaxedsimd_microkernels.bzl +++ b/gen/wasmrelaxedsimd_microkernels.bzl @@ -449,6 +449,8 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c8-minmax-wasmusdot.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c", + "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot-u2.c", + "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c8-minmax-wasmsdot-u2.c", @@ -470,8 +472,12 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c8-minmax-wasmusdot.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c", "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c", + "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-wasmusdot-u2.c", + "src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-wasmusdot.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x4c16-minmax-wasmsdot.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x8c8-minmax-wasmusdot.c", + "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-wasmusdot-u2.c", + "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-wasmusdot.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x4c16-minmax-wasmsdot.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x8c8-minmax-wasmsdot-u2.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x8c8-minmax-wasmusdot-u2.c", @@ -484,12 +490,20 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ 
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x4c16-minmax-wasmsdot.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c8-minmax-wasmsdot.c", "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x8c8-minmax-wasmusdot.c", + "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot-u2.c", + "src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c", "src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c8-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x8c16-minmax-fp32-wasmusdot.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c16-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x8c8-minmax-fp32-wasmsdot-u2.c", @@ -505,17 +519,35 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c8-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c16-minmax-fp32-wasmusdot.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot.c", + 
"src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x4c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x4c16-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c8-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c16-minmax-fp32-wasmusdot.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2.c", + "src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x4c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x4c16-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c8-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x8c16-minmax-fp32-wasmusdot.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot-u2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x4c16-minmax-fp32-wasmsdot.c", 
"src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x4c16-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-2x8c8-minmax-fp32-wasmsdot-u2.c", @@ -531,12 +563,24 @@ NON_PROD_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [ "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c16-minmax-fp32-wasmusdot.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4c16-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmusdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c16-minmax-fp32-wasmsdot.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c16-minmax-fp32-wasmusdot.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c", + "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c", "src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-wasmrelaxedsimd.c", "src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u16.c", "src/qs8-rsum/gen/qs8-rsum-wasmrelaxedsimd-u32-acc2.c", diff --git a/scripts/generate-qs8-gemm.sh 
b/scripts/generate-qs8-gemm.sh index 32688e3afa7..d8418b2436a 100755 --- a/scripts/generate-qs8-gemm.sh +++ b/scripts/generate-qs8-gemm.sh @@ -354,6 +354,37 @@ tools/xngen src/qs8-gemm/c8-wasmdot.c.in -D MR=2 NR=8 -D UNROLL=1 -D REQUANTIZAT tools/xngen src/qs8-gemm/c8-wasmdot.c.in -D MR=3 NR=8 -D UNROLL=1 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c8-minmax-wasmsdot-u2.c & tools/xngen src/qs8-gemm/c8-wasmdot.c.in -D MR=4 NR=8 -D UNROLL=1 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c8-minmax-wasmsdot-u2.c & +### C4 micro-kernels +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot.c & + +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D UNROLL=2 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D UNROLL=2 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2.c & + +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 
-D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c & + +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot.c & + +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D UNROLL=2 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D UNROLL=2 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2.c & + +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o 
src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c & + +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D MR=1 -D NR=16 -D ACCUMULATORS=1 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D MR=4 -D NR=16 -D ACCUMULATORS=1 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-wasmusdot.c & + +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot-u2.c & +tools/xngen src/qs8-gemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=4 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-wasmusdot-u2.c & + ################################### ARM NEON ################################## tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION= -D DATATYPE=QD8 -D ARMV8=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8-minmax-neon-mlal-lane.c & tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=2 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION= -D DATATYPE=QD8 -D ARMV8=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8-minmax-neon-mlal-lane.c & diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh index 14d91d1ed30..9d859d1f519 100755 --- a/scripts/generate-qs8-igemm.sh +++ 
b/scripts/generate-qs8-igemm.sh @@ -272,6 +272,36 @@ tools/xngen src/qs8-igemm/c8-wasmdot.c.in -D MR=2 -D NR=8 -D UNROLL=1 -D REQUANT tools/xngen src/qs8-igemm/c8-wasmdot.c.in -D MR=3 -D NR=8 -D UNROLL=1 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x8c8-minmax-fp32-wasmsdot-u2.c & tools/xngen src/qs8-igemm/c8-wasmdot.c.in -D MR=4 -D NR=8 -D UNROLL=1 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x8c8-minmax-fp32-wasmsdot-u2.c & +### C4 micro-kernels +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c & + +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D UNROLL=2 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D UNROLL=2 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c & + +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o 
src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c & + +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot.c & + +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot-u2.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2.c & + +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o 
src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=3 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=2 -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=1 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c & + +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-wasmusdot.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D ACCUMULATORS=1 -D MR=4 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=1 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x16c4-minmax-wasmusdot-u2.c & +tools/xngen src/qs8-igemm/MRx16c4-wasmdot.c.in -D UNROLL=2 -D ACCUMULATORS=1 -D MR=4 -D NR=16 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot-u2.c & + ################################## ARMv6 SIMD ################################# tools/xngen src/qs8-igemm/c4-armsimd32.c.in -D MR=1 -D NR=1 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x1c4-minmax-fp32-armsimd32.c & tools/xngen src/qs8-igemm/c4-armsimd32.c.in -D MR=1 -D NR=2 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -o src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x2c4-minmax-fp32-armsimd32.c & diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot-u2.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot-u2.c new file mode 100644 index 
00000000000..59e2a48a315 --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot-u2.c @@ -0,0 +1,198 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot_u2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + + const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); + const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); + XNN_FORCE_REALIZATION(vmin); + XNN_FORCE_REALIZATION(vmax); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vksum0123 = wasm_v128_load((const int32_t*) w); + v128_t vksum4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vksum89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vksumCDEF = wasm_v128_load((const int32_t*) w + 12); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + v128_t vacc0x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point0); + v128_t vacc0x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point0); + v128_t 
vacc0x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point0); + v128_t vacc0x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point0); + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = 
wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vinput_scale0); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vinput_scale0); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vinput_scale0); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vinput_scale0); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vfilter_output_scale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vfilter_output_scale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vfilter_output_scale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vfilter_output_scaleCDEF); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias89AB = 
wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbiasCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vbias0123); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vbias4567); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vbias89AB); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vbiasCDEF); + + vacc0x0x0123 = wasm_f32x4_pmax(vacc0x0x0123, vmin); + vacc0x0x4567 = wasm_f32x4_pmax(vacc0x0x4567, vmin); + vacc0x0x89AB = wasm_f32x4_pmax(vacc0x0x89AB, vmin); + vacc0x0xCDEF = wasm_f32x4_pmax(vacc0x0xCDEF, vmin); + + vacc0x0x0123 = wasm_f32x4_pmin(vacc0x0x0123, vmax); + vacc0x0x4567 = wasm_f32x4_pmin(vacc0x0x4567, vmax); + vacc0x0x89AB = wasm_f32x4_pmin(vacc0x0x89AB, vmax); + vacc0x0xCDEF = wasm_f32x4_pmin(vacc0x0xCDEF, vmax); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store(c0, vacc0x0x0123); + wasm_v128_store(c0 + 4, vacc0x0x4567); + wasm_v128_store(c0 + 8, vacc0x0x89AB); + wasm_v128_store(c0 + 12, vacc0x0xCDEF); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store(c0, vacc0x0x0123); + vacc0x0x0123 = vacc0x0x89AB; + c0 += 4; + wasm_v128_store(c0, vacc0x0x4567); + vacc0x0x4567 = vacc0x0xCDEF; + c0 += 4; + } + if (nc & 4) { + wasm_v128_store(c0, vacc0x0x0123); + vacc0x0x0123 = vacc0x0x4567; + c0 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0x0123, 0); + vacc0x0x0123 = wasm_v64x2_shuffle(vacc0x0x0123, vacc0x0x0123, 1, 1); + c0 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot.c new file mode 100644 index 00000000000..35143da4a42 --- /dev/null +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16c4-minmax-wasmusdot.c @@ -0,0 +1,169 @@ +// Auto-generated file. Do not edit! 
+// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + + const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); + const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); + XNN_FORCE_REALIZATION(vmin); + XNN_FORCE_REALIZATION(vmax); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vksum0123 = wasm_v128_load((const int32_t*) w); + v128_t vksum4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vksum89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vksumCDEF = wasm_v128_load((const int32_t*) w + 12); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + v128_t vacc0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point0); + v128_t vacc0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point0); + v128_t vacc0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point0); + v128_t vacc0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point0); + w = (const int32_t*) w + 16; + + size_t k = kc; + + while 
(k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vinput_scale0); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vinput_scale0); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vfilter_output_scale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vfilter_output_scaleCDEF); + + const v128_t vbias0123 = 
wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbiasCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vbias89AB); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vbiasCDEF); + + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin); + vacc0x89AB = wasm_f32x4_pmax(vacc0x89AB, vmin); + vacc0xCDEF = wasm_f32x4_pmax(vacc0xCDEF, vmin); + + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax); + vacc0x89AB = wasm_f32x4_pmin(vacc0x89AB, vmax); + vacc0xCDEF = wasm_f32x4_pmin(vacc0xCDEF, vmax); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c0 + 4, vacc0x4567); + wasm_v128_store(c0 + 8, vacc0x89AB); + wasm_v128_store(c0 + 12, vacc0xCDEF); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x89AB; + c0 += 4; + wasm_v128_store(c0, vacc0x4567); + vacc0x4567 = vacc0xCDEF; + c0 += 4; + } + if (nc & 4) { + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x4567; + c0 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c0, vacc0x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-wasmusdot-u2.c b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-wasmusdot-u2.c new file mode 100644 index 00000000000..dad2ed9dbf1 --- /dev/null +++ 
b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16c4-minmax-wasmusdot-u2.c @@ -0,0 +1,423 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot_u2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + float* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); + float* c3 = (float*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + a3 = a2; + c3 = c2; + } + + const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); + const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); + XNN_FORCE_REALIZATION(vmin); + XNN_FORCE_REALIZATION(vmax); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t 
vksum0123 = wasm_v128_load((const int32_t*) w); + v128_t vksum4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vksum89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vksumCDEF = wasm_v128_load((const int32_t*) w + 12); + const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128); + v128_t vacc0x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point0); + v128_t vacc0x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point0); + v128_t vacc0x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point0); + v128_t vacc0x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point0); + const v128_t vinput_zero_point1 = wasm_i32x4_splat((int32_t) quantization_params[1].zero_point + 128); + v128_t vacc1x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point1); + v128_t vacc1x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point1); + v128_t vacc1x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point1); + v128_t vacc1x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point1); + const v128_t vinput_zero_point2 = wasm_i32x4_splat((int32_t) quantization_params[2].zero_point + 128); + v128_t vacc2x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point2); + v128_t vacc2x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point2); + v128_t vacc2x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point2); + v128_t vacc2x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point2); + const v128_t vinput_zero_point3 = wasm_i32x4_splat((int32_t) quantization_params[3].zero_point + 128); + v128_t vacc3x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point3); + v128_t vacc3x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point3); + v128_t vacc3x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point3); + v128_t vacc3x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point3); + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t 
va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + v128_t va3x0x0123 = wasm_v128_load32_splat(a3); + v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4); + a3 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + va3x0x0123 = wasm_v128_xor(va3x0x0123, vsign_mask); + va3x1x0123 = wasm_v128_xor(va3x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF); + vacc3x0x0123 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + va3x0123 = wasm_v128_xor(va3x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, 
va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123); + vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567); + vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB); + vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF); + + const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale); + const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale); + const v128_t vinput_scale2 = wasm_v128_load32_splat(&quantization_params[2].inv_scale); + const v128_t vinput_scale3 = wasm_v128_load32_splat(&quantization_params[3].inv_scale); + + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vinput_scale0); + vacc0x0x4567 = 
wasm_f32x4_mul(vacc0x0x4567, vinput_scale0); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vinput_scale0); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vinput_scale0); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vinput_scale1); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vinput_scale1); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vinput_scale1); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vinput_scale1); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vinput_scale2); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vinput_scale2); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vinput_scale2); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vinput_scale2); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vinput_scale3); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vinput_scale3); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vinput_scale3); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vinput_scale3); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vfilter_output_scale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vfilter_output_scale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vfilter_output_scale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vfilter_output_scaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vfilter_output_scale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vfilter_output_scale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vfilter_output_scale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vfilter_output_scaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vfilter_output_scale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, 
vfilter_output_scale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vfilter_output_scale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vfilter_output_scaleCDEF); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vfilter_output_scale0123); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vfilter_output_scale4567); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vfilter_output_scale89AB); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vfilter_output_scaleCDEF); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbiasCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vbias0123); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vbias4567); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vbias89AB); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vbiasCDEF); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vbias0123); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vbias4567); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vbias89AB); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vbiasCDEF); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vbias0123); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vbias4567); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vbias89AB); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vbiasCDEF); + vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vbias0123); + vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vbias4567); + vacc3x0x89AB = wasm_f32x4_add(vacc3x0x89AB, vbias89AB); + vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vbiasCDEF); + + vacc0x0x0123 = wasm_f32x4_pmax(vacc0x0x0123, vmin); + vacc0x0x4567 = wasm_f32x4_pmax(vacc0x0x4567, vmin); + vacc0x0x89AB = wasm_f32x4_pmax(vacc0x0x89AB, vmin); + vacc0x0xCDEF = wasm_f32x4_pmax(vacc0x0xCDEF, vmin); + vacc1x0x0123 = wasm_f32x4_pmax(vacc1x0x0123, vmin); + vacc1x0x4567 = 
wasm_f32x4_pmax(vacc1x0x4567, vmin); + vacc1x0x89AB = wasm_f32x4_pmax(vacc1x0x89AB, vmin); + vacc1x0xCDEF = wasm_f32x4_pmax(vacc1x0xCDEF, vmin); + vacc2x0x0123 = wasm_f32x4_pmax(vacc2x0x0123, vmin); + vacc2x0x4567 = wasm_f32x4_pmax(vacc2x0x4567, vmin); + vacc2x0x89AB = wasm_f32x4_pmax(vacc2x0x89AB, vmin); + vacc2x0xCDEF = wasm_f32x4_pmax(vacc2x0xCDEF, vmin); + vacc3x0x0123 = wasm_f32x4_pmax(vacc3x0x0123, vmin); + vacc3x0x4567 = wasm_f32x4_pmax(vacc3x0x4567, vmin); + vacc3x0x89AB = wasm_f32x4_pmax(vacc3x0x89AB, vmin); + vacc3x0xCDEF = wasm_f32x4_pmax(vacc3x0xCDEF, vmin); + + vacc0x0x0123 = wasm_f32x4_pmin(vacc0x0x0123, vmax); + vacc0x0x4567 = wasm_f32x4_pmin(vacc0x0x4567, vmax); + vacc0x0x89AB = wasm_f32x4_pmin(vacc0x0x89AB, vmax); + vacc0x0xCDEF = wasm_f32x4_pmin(vacc0x0xCDEF, vmax); + vacc1x0x0123 = wasm_f32x4_pmin(vacc1x0x0123, vmax); + vacc1x0x4567 = wasm_f32x4_pmin(vacc1x0x4567, vmax); + vacc1x0x89AB = wasm_f32x4_pmin(vacc1x0x89AB, vmax); + vacc1x0xCDEF = wasm_f32x4_pmin(vacc1x0xCDEF, vmax); + vacc2x0x0123 = wasm_f32x4_pmin(vacc2x0x0123, vmax); + vacc2x0x4567 = wasm_f32x4_pmin(vacc2x0x4567, vmax); + vacc2x0x89AB = wasm_f32x4_pmin(vacc2x0x89AB, vmax); + vacc2x0xCDEF = wasm_f32x4_pmin(vacc2x0xCDEF, vmax); + vacc3x0x0123 = wasm_f32x4_pmin(vacc3x0x0123, vmax); + vacc3x0x4567 = wasm_f32x4_pmin(vacc3x0x4567, vmax); + vacc3x0x89AB = wasm_f32x4_pmin(vacc3x0x89AB, vmax); + vacc3x0xCDEF = wasm_f32x4_pmin(vacc3x0xCDEF, vmax); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store(c0, vacc0x0x0123); + wasm_v128_store(c0 + 4, vacc0x0x4567); + wasm_v128_store(c0 + 8, vacc0x0x89AB); + wasm_v128_store(c0 + 12, vacc0x0xCDEF); + wasm_v128_store(c1, vacc1x0x0123); + wasm_v128_store(c1 + 4, vacc1x0x4567); + wasm_v128_store(c1 + 8, vacc1x0x89AB); + wasm_v128_store(c1 + 12, vacc1x0xCDEF); + wasm_v128_store(c2, vacc2x0x0123); + wasm_v128_store(c2 + 4, vacc2x0x4567); + wasm_v128_store(c2 + 8, vacc2x0x89AB); + wasm_v128_store(c2 + 12, vacc2x0xCDEF); + wasm_v128_store(c3, vacc3x0x0123); + 
wasm_v128_store(c3 + 4, vacc3x0x4567); + wasm_v128_store(c3 + 8, vacc3x0x89AB); + wasm_v128_store(c3 + 12, vacc3x0xCDEF); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + c3 = (float*) ((uintptr_t) c3 + cn_stride); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store(c0, vacc0x0x0123); + vacc0x0x0123 = vacc0x0x89AB; + c0 += 4; + wasm_v128_store(c0, vacc0x0x4567); + vacc0x0x4567 = vacc0x0xCDEF; + c0 += 4; + wasm_v128_store(c1, vacc1x0x0123); + vacc1x0x0123 = vacc1x0x89AB; + c1 += 4; + wasm_v128_store(c1, vacc1x0x4567); + vacc1x0x4567 = vacc1x0xCDEF; + c1 += 4; + wasm_v128_store(c2, vacc2x0x0123); + vacc2x0x0123 = vacc2x0x89AB; + c2 += 4; + wasm_v128_store(c2, vacc2x0x4567); + vacc2x0x4567 = vacc2x0xCDEF; + c2 += 4; + wasm_v128_store(c3, vacc3x0x0123); + vacc3x0x0123 = vacc3x0x89AB; + c3 += 4; + wasm_v128_store(c3, vacc3x0x4567); + vacc3x0x4567 = vacc3x0xCDEF; + c3 += 4; + } + if (nc & 4) { + wasm_v128_store(c0, vacc0x0x0123); + vacc0x0x0123 = vacc0x0x4567; + c0 += 4; + wasm_v128_store(c1, vacc1x0x0123); + vacc1x0x0123 = vacc1x0x4567; + c1 += 4; + wasm_v128_store(c2, vacc2x0x0123); + vacc2x0x0123 = vacc2x0x4567; + c2 += 4; + wasm_v128_store(c3, vacc3x0x0123); + vacc3x0x0123 = vacc3x0x4567; + c3 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c0, vacc0x0x0123, 0); + vacc0x0x0123 = wasm_v64x2_shuffle(vacc0x0x0123, vacc0x0x0123, 1, 1); + c0 += 2; + wasm_v128_store64_lane(c1, vacc1x0x0123, 0); + vacc1x0x0123 = wasm_v64x2_shuffle(vacc1x0x0123, vacc1x0x0123, 1, 1); + c1 += 2; + wasm_v128_store64_lane(c2, vacc2x0x0123, 0); + vacc2x0x0123 = wasm_v64x2_shuffle(vacc2x0x0123, vacc2x0x0123, 1, 1); + c2 += 2; + wasm_v128_store64_lane(c3, vacc3x0x0123, 0); + vacc3x0x0123 = 
// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/MRx16c4-wasmdot.c.in
//   Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): the original #include targets were lost in extraction; these
// are the headers this template conventionally needs (assert, wasm intrinsics)
// -- confirm against the generator output.
#include <assert.h>

#include <wasm_simd128.h>

#include "xnnpack/gemm.h"
#include "xnnpack/math.h"

// QD8 (per-row dynamically quantized int8 input) x QC8W (per-channel int8
// weights) GEMM with f32 output and min/max clamping, producing a 4-row x
// 16-column output tile.  "c4": every K-loop step consumes 4 int8 bytes per
// row.  "wasmusdot": accumulation uses the WAsm Relaxed SIMD
// i32x4.relaxed_dot_i8x16_i7x16_add instruction; the signed activations are
// biased into the unsigned domain by XOR 0x80, and the accumulators are
// initialized with the packed column sums times (zero_point + 128) to
// pre-compensate for that bias.
//
// Packed weight layout consumed per 16-column group (visible from the pointer
// arithmetic below): 16 x int32 column sums, then kc/4 groups of 64 int8
// weight bytes, then 16 x f32 filter scales, then 16 x f32 biases.
void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot(
    size_t mr,                  // number of valid rows, 1..4
    size_t nc,                  // number of output columns to produce
    size_t kc,                  // reduction length in int8 bytes
    const int8_t* restrict a,   // quantized activations, rows a_stride apart
    size_t a_stride,
    const void* restrict w,     // packed weights (see layout note above)
    float* restrict c,          // f32 output, rows cm_stride bytes apart
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
    const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  // Each K step reads 4 bytes per row; round kc up so the loop covers the
  // tail (over-reads are permitted, see XNN_OOB_READS).
  kc = round_up_po2(kc, 4 * sizeof(int8_t));
  const int8_t* a0 = a;
  float* c0 = c;
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
  // Rows beyond mr alias the previous row so all loads/stores stay within
  // valid memory regardless of the actual row count.
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;
    c1 = c0;
  }
  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    a3 = a2;
    c3 = c2;
  }

  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);
  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);
  XNN_FORCE_REALIZATION(vmin);
  XNN_FORCE_REALIZATION(vmax);

  // XOR with 0x80 maps int8 values into the uint8 domain expected by the
  // unsigned-by-signed relaxed dot product.
  const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80));
  XNN_FORCE_REALIZATION(vsign_mask);

  do {
    // Initialize the 4x16 int32 accumulators with the zero-point correction:
    // per-column weight sums times (per-row zero point + 128).
    v128_t vksum0123 = wasm_v128_load((const int32_t*) w);
    v128_t vksum4567 = wasm_v128_load((const int32_t*) w + 4);
    v128_t vksum89AB = wasm_v128_load((const int32_t*) w + 8);
    v128_t vksumCDEF = wasm_v128_load((const int32_t*) w + 12);
    const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128);
    v128_t vacc0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point0);
    v128_t vacc0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point0);
    v128_t vacc0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point0);
    v128_t vacc0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point0);
    const v128_t vinput_zero_point1 = wasm_i32x4_splat((int32_t) quantization_params[1].zero_point + 128);
    v128_t vacc1x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point1);
    v128_t vacc1x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point1);
    v128_t vacc1x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point1);
    v128_t vacc1xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point1);
    const v128_t vinput_zero_point2 = wasm_i32x4_splat((int32_t) quantization_params[2].zero_point + 128);
    v128_t vacc2x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point2);
    v128_t vacc2x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point2);
    v128_t vacc2x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point2);
    v128_t vacc2xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point2);
    const v128_t vinput_zero_point3 = wasm_i32x4_splat((int32_t) quantization_params[3].zero_point + 128);
    v128_t vacc3x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point3);
    v128_t vacc3x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point3);
    v128_t vacc3x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point3);
    v128_t vacc3xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point3);
    w = (const int32_t*) w + 16;

    size_t k = kc;

    // Main reduction: per step, broadcast 4 activation bytes per row, flip
    // them to unsigned, and dot them against 64 bytes of packed weights
    // (16 columns x 4 K-bytes).
    while (k != 0) {
      v128_t va0x0123 = wasm_v128_load32_splat(a0);
      a0 += 4;
      v128_t va1x0123 = wasm_v128_load32_splat(a1);
      a1 += 4;
      v128_t va2x0123 = wasm_v128_load32_splat(a2);
      a2 += 4;
      v128_t va3x0123 = wasm_v128_load32_splat(a3);
      a3 += 4;

      va0x0123 = wasm_v128_xor(va0x0123, vsign_mask);
      va1x0123 = wasm_v128_xor(va1x0123, vsign_mask);
      va2x0123 = wasm_v128_xor(va2x0123, vsign_mask);
      va3x0123 = wasm_v128_xor(va3x0123, vsign_mask);

      const v128_t vb0123 = wasm_v128_load((const int8_t*) w);
      const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16);
      const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32);
      const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48);

      vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123);
      vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567);
      vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB);
      vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF);
      vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123);
      vacc1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567);
      vacc1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB);
      vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF);
      vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123);
      vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x4567);
      vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB);
      vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF);
      vacc3x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0123);
      vacc3x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x4567);
      vacc3x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x89AB);
      vacc3xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3xCDEF);

      w = (const int8_t*) w + 64;
      k -= 4 * sizeof(int8_t);
    }

    // Dequantize: int32 -> f32, scale by the per-row input inverse scale and
    // the per-column filter output scale, then add the per-column bias.
    // (The v128_t lanes are reinterpreted in place; the names stay the same.)
    vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123);
    vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567);
    vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB);
    vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF);
    vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123);
    vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567);
    vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB);
    vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF);
    vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123);
    vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567);
    vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB);
    vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF);
    vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123);
    vacc3x4567 = wasm_f32x4_convert_i32x4(vacc3x4567);
    vacc3x89AB = wasm_f32x4_convert_i32x4(vacc3x89AB);
    vacc3xCDEF = wasm_f32x4_convert_i32x4(vacc3xCDEF);

    const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale);
    const v128_t vinput_scale1 = wasm_v128_load32_splat(&quantization_params[1].inv_scale);
    const v128_t vinput_scale2 = wasm_v128_load32_splat(&quantization_params[2].inv_scale);
    const v128_t vinput_scale3 = wasm_v128_load32_splat(&quantization_params[3].inv_scale);

    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0);
    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale0);
    vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vinput_scale0);
    vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vinput_scale0);
    vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale1);
    vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vinput_scale1);
    vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vinput_scale1);
    vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vinput_scale1);
    vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale2);
    vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vinput_scale2);
    vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vinput_scale2);
    vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vinput_scale2);
    vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vinput_scale3);
    vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vinput_scale3);
    vacc3x89AB = wasm_f32x4_mul(vacc3x89AB, vinput_scale3);
    vacc3xCDEF = wasm_f32x4_mul(vacc3xCDEF, vinput_scale3);

    const v128_t vfilter_output_scale0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vfilter_output_scale4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vfilter_output_scale89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vfilter_output_scaleCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123);
    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567);
    vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vfilter_output_scale89AB);
    vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vfilter_output_scaleCDEF);
    vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123);
    vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vfilter_output_scale4567);
    vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vfilter_output_scale89AB);
    vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vfilter_output_scaleCDEF);
    vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123);
    vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vfilter_output_scale4567);
    vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vfilter_output_scale89AB);
    vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vfilter_output_scaleCDEF);
    vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vfilter_output_scale0123);
    vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vfilter_output_scale4567);
    vacc3x89AB = wasm_f32x4_mul(vacc3x89AB, vfilter_output_scale89AB);
    vacc3xCDEF = wasm_f32x4_mul(vacc3xCDEF, vfilter_output_scaleCDEF);

    const v128_t vbias0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vbias4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vbias89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vbiasCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123);
    vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567);
    vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vbias89AB);
    vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vbiasCDEF);
    vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123);
    vacc1x4567 = wasm_f32x4_add(vacc1x4567, vbias4567);
    vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vbias89AB);
    vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vbiasCDEF);
    vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123);
    vacc2x4567 = wasm_f32x4_add(vacc2x4567, vbias4567);
    vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vbias89AB);
    vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vbiasCDEF);
    vacc3x0123 = wasm_f32x4_add(vacc3x0123, vbias0123);
    vacc3x4567 = wasm_f32x4_add(vacc3x4567, vbias4567);
    vacc3x89AB = wasm_f32x4_add(vacc3x89AB, vbias89AB);
    vacc3xCDEF = wasm_f32x4_add(vacc3xCDEF, vbiasCDEF);

    // Clamp to [min, max] (pmax/pmin NaN behavior matches the other WAsm
    // kernels in this family).
    vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin);
    vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin);
    vacc0x89AB = wasm_f32x4_pmax(vacc0x89AB, vmin);
    vacc0xCDEF = wasm_f32x4_pmax(vacc0xCDEF, vmin);
    vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin);
    vacc1x4567 = wasm_f32x4_pmax(vacc1x4567, vmin);
    vacc1x89AB = wasm_f32x4_pmax(vacc1x89AB, vmin);
    vacc1xCDEF = wasm_f32x4_pmax(vacc1xCDEF, vmin);
    vacc2x0123 = wasm_f32x4_pmax(vacc2x0123, vmin);
    vacc2x4567 = wasm_f32x4_pmax(vacc2x4567, vmin);
    vacc2x89AB = wasm_f32x4_pmax(vacc2x89AB, vmin);
    vacc2xCDEF = wasm_f32x4_pmax(vacc2xCDEF, vmin);
    vacc3x0123 = wasm_f32x4_pmax(vacc3x0123, vmin);
    vacc3x4567 = wasm_f32x4_pmax(vacc3x4567, vmin);
    vacc3x89AB = wasm_f32x4_pmax(vacc3x89AB, vmin);
    vacc3xCDEF = wasm_f32x4_pmax(vacc3xCDEF, vmin);

    vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax);
    vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax);
    vacc0x89AB = wasm_f32x4_pmin(vacc0x89AB, vmax);
    vacc0xCDEF = wasm_f32x4_pmin(vacc0xCDEF, vmax);
    vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax);
    vacc1x4567 = wasm_f32x4_pmin(vacc1x4567, vmax);
    vacc1x89AB = wasm_f32x4_pmin(vacc1x89AB, vmax);
    vacc1xCDEF = wasm_f32x4_pmin(vacc1xCDEF, vmax);
    vacc2x0123 = wasm_f32x4_pmin(vacc2x0123, vmax);
    vacc2x4567 = wasm_f32x4_pmin(vacc2x4567, vmax);
    vacc2x89AB = wasm_f32x4_pmin(vacc2x89AB, vmax);
    vacc2xCDEF = wasm_f32x4_pmin(vacc2xCDEF, vmax);
    vacc3x0123 = wasm_f32x4_pmin(vacc3x0123, vmax);
    vacc3x4567 = wasm_f32x4_pmin(vacc3x4567, vmax);
    vacc3x89AB = wasm_f32x4_pmin(vacc3x89AB, vmax);
    vacc3xCDEF = wasm_f32x4_pmin(vacc3xCDEF, vmax);

    if XNN_LIKELY(nc >= 16) {
      // Full 16-column store; rewind the activation pointers for the next
      // column group and advance the output pointers by cn_stride.
      wasm_v128_store(c0, vacc0x0123);
      wasm_v128_store(c0 + 4, vacc0x4567);
      wasm_v128_store(c0 + 8, vacc0x89AB);
      wasm_v128_store(c0 + 12, vacc0xCDEF);
      wasm_v128_store(c1, vacc1x0123);
      wasm_v128_store(c1 + 4, vacc1x4567);
      wasm_v128_store(c1 + 8, vacc1x89AB);
      wasm_v128_store(c1 + 12, vacc1xCDEF);
      wasm_v128_store(c2, vacc2x0123);
      wasm_v128_store(c2 + 4, vacc2x4567);
      wasm_v128_store(c2 + 8, vacc2x89AB);
      wasm_v128_store(c2 + 12, vacc2xCDEF);
      wasm_v128_store(c3, vacc3x0123);
      wasm_v128_store(c3 + 4, vacc3x4567);
      wasm_v128_store(c3 + 8, vacc3x89AB);
      wasm_v128_store(c3 + 12, vacc3xCDEF);

      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
      a3 = (const int8_t*) ((uintptr_t) a3 - kc);

      c0 = (float*) ((uintptr_t) c0 + cn_stride);
      c1 = (float*) ((uintptr_t) c1 + cn_stride);
      c2 = (float*) ((uintptr_t) c2 + cn_stride);
      c3 = (float*) ((uintptr_t) c3 + cn_stride);

      nc -= 16;
    } else {
      // Tail: emit 8/4/2/1 columns, shifting the surviving lanes down after
      // each partial store.
      if (nc & 8) {
        wasm_v128_store(c0, vacc0x0123);
        vacc0x0123 = vacc0x89AB;
        c0 += 4;
        wasm_v128_store(c0, vacc0x4567);
        vacc0x4567 = vacc0xCDEF;
        c0 += 4;
        wasm_v128_store(c1, vacc1x0123);
        vacc1x0123 = vacc1x89AB;
        c1 += 4;
        wasm_v128_store(c1, vacc1x4567);
        vacc1x4567 = vacc1xCDEF;
        c1 += 4;
        wasm_v128_store(c2, vacc2x0123);
        vacc2x0123 = vacc2x89AB;
        c2 += 4;
        wasm_v128_store(c2, vacc2x4567);
        vacc2x4567 = vacc2xCDEF;
        c2 += 4;
        wasm_v128_store(c3, vacc3x0123);
        vacc3x0123 = vacc3x89AB;
        c3 += 4;
        wasm_v128_store(c3, vacc3x4567);
        vacc3x4567 = vacc3xCDEF;
        c3 += 4;
      }
      if (nc & 4) {
        wasm_v128_store(c0, vacc0x0123);
        vacc0x0123 = vacc0x4567;
        c0 += 4;
        wasm_v128_store(c1, vacc1x0123);
        vacc1x0123 = vacc1x4567;
        c1 += 4;
        wasm_v128_store(c2, vacc2x0123);
        vacc2x0123 = vacc2x4567;
        c2 += 4;
        wasm_v128_store(c3, vacc3x0123);
        vacc3x0123 = vacc3x4567;
        c3 += 4;
      }
      if (nc & 2) {
        wasm_v128_store64_lane(c0, vacc0x0123, 0);
        vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1);
        c0 += 2;
        wasm_v128_store64_lane(c1, vacc1x0123, 0);
        vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1);
        c1 += 2;
        wasm_v128_store64_lane(c2, vacc2x0123, 0);
        vacc2x0123 = wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1);
        c2 += 2;
        wasm_v128_store64_lane(c3, vacc3x0123, 0);
        vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1);
        c3 += 2;
      }
      if (nc & 1) {
        wasm_v128_store32_lane(c0, vacc0x0123, 0);
        wasm_v128_store32_lane(c1, vacc1x0123, 0);
        wasm_v128_store32_lane(c2, vacc2x0123, 0);
        wasm_v128_store32_lane(c3, vacc3x0123, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/MRx16c4-wasmdot.c.in
//   Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): the original #include targets were lost in extraction; these
// are the headers this template conventionally needs -- confirm.
#include <assert.h>

#include <wasm_simd128.h>

#include "xnnpack/gemm.h"
#include "xnnpack/math.h"


// QD8 x QC8W indirect GEMM (IGEMM), 1 row x 16 columns, f32 output with
// min/max clamping.  "c4": 4 int8 K-bytes per step; "u2": the K loop is
// unrolled by 2 (8 bytes / 128 weight bytes per iteration), with a
// single-step loop for the remainder.  Uses the WAsm Relaxed SIMD
// unsigned-by-signed dot product; activations are XORed with 0x80 and the
// zero point is offset by +128 to compensate.
void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot_u2(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,                    // total indirection size: pointer count * sizeof(void*)
    const int8_t** restrict a,    // indirection buffer of row pointers
    const void* restrict w,       // packed weights: ksums, int8 weights, scales, biases
    float* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,              // added to every non-zero row pointer
    const int8_t* zero,           // sentinel pointer marking zero-padding entries
    const int8_t* zero_data,      // actual data to read for zero-padding entries
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
    const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  // Each K step reads 4 bytes; over-reads are allowed (XNN_OOB_READS).
  kc = round_up_po2(kc, 4 * sizeof(int8_t));
  float* c0 = c;

  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);
  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);
  XNN_FORCE_REALIZATION(vmin);
  XNN_FORCE_REALIZATION(vmax);

  // XOR with 0x80 maps int8 into the uint8 domain for the relaxed dot product.
  const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80));
  XNN_FORCE_REALIZATION(vsign_mask);
  do {
    // Accumulators start at column-sum * (zero_point + 128): the zero-point
    // correction for the unsigned-domain dot products below.
    v128_t vksum0123 = wasm_v128_load((const int32_t*) w);
    v128_t vksum4567 = wasm_v128_load((const int32_t*) w + 4);
    v128_t vksum89AB = wasm_v128_load((const int32_t*) w + 8);
    v128_t vksumCDEF = wasm_v128_load((const int32_t*) w + 12);
    const v128_t vinput_zero_point = wasm_i32x4_splat((int32_t) quantization_params->zero_point + 128);
    v128_t vacc0x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point);
    v128_t vacc0x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point);
    v128_t vacc0x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point);
    v128_t vacc0x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point);
    w = (const int32_t*) w + 16;

    size_t p = ks;
    do {
      // Resolve the next indirection entry: real rows get a_offset applied,
      // the `zero` sentinel is redirected to zero_data (padding).
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      } else {
        a0 = zero_data;
      }
      a += 1;

      size_t k = kc;
      // Unrolled-by-2 main loop: two 4-byte activation groups against two
      // 64-byte weight groups per iteration.
      while (k >= 8 * sizeof(int8_t)) {
        v128_t va0x0x0123 = wasm_v128_load32_splat(a0);
        v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4);
        a0 += 8;

        va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask);
        va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask);

        const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w);
        const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32);
        const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48);
        const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64);
        const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80);
        const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96);
        const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112);

        vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123);
        vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567);
        vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB);
        vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF);
        vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123);
        vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567);
        vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB);
        vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF);

        w = (const int8_t*) w + 128;
        k -= 8 * sizeof(int8_t);
      }

      // Single-step remainder of the unrolled loop (kc rounded to 4, so this
      // runs at most once).
      while (k != 0) {
        v128_t va0x0123 = wasm_v128_load32_splat(a0);
        a0 += 4;

        va0x0123 = wasm_v128_xor(va0x0123, vsign_mask);

        const v128_t vb0123 = wasm_v128_load((const int8_t*) w);
        const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32);
        const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48);

        vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123);
        vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567);
        vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB);
        vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF);

        w = (const int8_t*) w + 64;
        k -= 4 * sizeof(int8_t);
      }

      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Dequantize: int32 -> f32, input inverse scale, per-column filter scale,
    // per-column bias, then clamp.
    vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123);
    vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567);
    vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB);
    vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF);

    const v128_t vinput_scale = wasm_v128_load32_splat(&quantization_params->inv_scale);

    vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vinput_scale);
    vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vinput_scale);
    vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vinput_scale);
    vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vinput_scale);

    const v128_t vfilter_output_scale0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vfilter_output_scale4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vfilter_output_scale89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vfilter_output_scaleCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vfilter_output_scale0123);
    vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vfilter_output_scale4567);
    vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vfilter_output_scale89AB);
    vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vfilter_output_scaleCDEF);

    const v128_t vbias0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vbias4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vbias89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vbiasCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vbias0123);
    vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vbias4567);
    vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vbias89AB);
    vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vbiasCDEF);

    vacc0x0x0123 = wasm_f32x4_pmax(vacc0x0x0123, vmin);
    vacc0x0x4567 = wasm_f32x4_pmax(vacc0x0x4567, vmin);
    vacc0x0x89AB = wasm_f32x4_pmax(vacc0x0x89AB, vmin);
    vacc0x0xCDEF = wasm_f32x4_pmax(vacc0x0xCDEF, vmin);

    vacc0x0x0123 = wasm_f32x4_pmin(vacc0x0x0123, vmax);
    vacc0x0x4567 = wasm_f32x4_pmin(vacc0x0x4567, vmax);
    vacc0x0x89AB = wasm_f32x4_pmin(vacc0x0x89AB, vmax);
    vacc0x0xCDEF = wasm_f32x4_pmin(vacc0x0xCDEF, vmax);

    if XNN_LIKELY(nc >= 16) {
      wasm_v128_store(c0, vacc0x0x0123);
      wasm_v128_store(c0 + 4, vacc0x0x4567);
      wasm_v128_store(c0 + 8, vacc0x0x89AB);
      wasm_v128_store(c0 + 12, vacc0x0xCDEF);

      c0 = (float*) ((uintptr_t) c0 + cn_stride);

      // Rewind the indirection buffer for the next 16-column group.
      a = (const int8_t**restrict) ((uintptr_t) a - ks);

      nc -= 16;
    } else {
      // Tail: emit 8/4/2/1 columns, shifting surviving lanes down.
      if (nc & 8) {
        wasm_v128_store(c0, vacc0x0x0123);
        vacc0x0x0123 = vacc0x0x89AB;
        c0 += 4;
        wasm_v128_store(c0, vacc0x0x4567);
        vacc0x0x4567 = vacc0x0xCDEF;
        c0 += 4;
      }
      if (nc & 4) {
        wasm_v128_store(c0, vacc0x0x0123);
        vacc0x0x0123 = vacc0x0x4567;
        c0 += 4;
      }
      if (nc & 2) {
        wasm_v128_store64_lane(c0, vacc0x0x0123, 0);
        vacc0x0x0123 = wasm_v64x2_shuffle(vacc0x0x0123, vacc0x0x0123, 1, 1);
        c0 += 2;
      }
      if (nc & 1) {
        wasm_v128_store32_lane(c0, vacc0x0x0123, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/MRx16c4-wasmdot.c.in
//   Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): the original #include targets were lost in extraction; these
// are the headers this template conventionally needs -- confirm.
#include <assert.h>

#include <wasm_simd128.h>

#include "xnnpack/gemm.h"
#include "xnnpack/math.h"


// QD8 x QC8W indirect GEMM (IGEMM), 1 row x 16 columns, f32 output with
// min/max clamping.  Non-unrolled variant: each K step consumes 4 int8 bytes
// against 64 weight bytes.  Uses the WAsm Relaxed SIMD unsigned-by-signed dot
// product; activations are XORed with 0x80 and the zero point is offset by
// +128 to compensate.
void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,                    // total indirection size: pointer count * sizeof(void*)
    const int8_t** restrict a,    // indirection buffer of row pointers
    const void* restrict w,       // packed weights: ksums, int8 weights, scales, biases
    float* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,              // added to every non-zero row pointer
    const int8_t* zero,           // sentinel pointer marking zero-padding entries
    const int8_t* zero_data,      // actual data to read for zero-padding entries
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
    const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  // Each K step reads 4 bytes; over-reads are allowed (XNN_OOB_READS).
  kc = round_up_po2(kc, 4 * sizeof(int8_t));
  float* c0 = c;

  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);
  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);
  XNN_FORCE_REALIZATION(vmin);
  XNN_FORCE_REALIZATION(vmax);

  // XOR with 0x80 maps int8 into the uint8 domain for the relaxed dot product.
  const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80));
  XNN_FORCE_REALIZATION(vsign_mask);
  do {
    // Accumulators start at column-sum * (zero_point + 128): the zero-point
    // correction for the unsigned-domain dot products below.
    v128_t vksum0123 = wasm_v128_load((const int32_t*) w);
    v128_t vksum4567 = wasm_v128_load((const int32_t*) w + 4);
    v128_t vksum89AB = wasm_v128_load((const int32_t*) w + 8);
    v128_t vksumCDEF = wasm_v128_load((const int32_t*) w + 12);
    const v128_t vinput_zero_point = wasm_i32x4_splat((int32_t) quantization_params->zero_point + 128);
    v128_t vacc0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point);
    v128_t vacc0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point);
    v128_t vacc0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point);
    v128_t vacc0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point);
    w = (const int32_t*) w + 16;

    size_t p = ks;
    do {
      // Resolve the next indirection entry: real rows get a_offset applied,
      // the `zero` sentinel is redirected to zero_data (padding).
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      } else {
        a0 = zero_data;
      }
      a += 1;

      size_t k = kc;

      while (k != 0) {
        v128_t va0x0123 = wasm_v128_load32_splat(a0);
        a0 += 4;

        va0x0123 = wasm_v128_xor(va0x0123, vsign_mask);

        const v128_t vb0123 = wasm_v128_load((const int8_t*) w);
        const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32);
        const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48);

        vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123);
        vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567);
        vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB);
        vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF);

        w = (const int8_t*) w + 64;
        k -= 4 * sizeof(int8_t);
      }

      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Dequantize: int32 -> f32, input inverse scale, per-column filter scale,
    // per-column bias, then clamp.
    vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123);
    vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567);
    vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB);
    vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF);

    const v128_t vinput_scale = wasm_v128_load32_splat(&quantization_params->inv_scale);

    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale);
    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale);
    vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vinput_scale);
    vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vinput_scale);

    const v128_t vfilter_output_scale0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vfilter_output_scale4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vfilter_output_scale89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vfilter_output_scaleCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123);
    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567);
    vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vfilter_output_scale89AB);
    vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vfilter_output_scaleCDEF);

    const v128_t vbias0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vbias4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vbias89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vbiasCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123);
    vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567);
    vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vbias89AB);
    vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vbiasCDEF);

    vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin);
    vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin);
    vacc0x89AB = wasm_f32x4_pmax(vacc0x89AB, vmin);
    vacc0xCDEF = wasm_f32x4_pmax(vacc0xCDEF, vmin);

    vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax);
    vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax);
    vacc0x89AB = wasm_f32x4_pmin(vacc0x89AB, vmax);
    vacc0xCDEF = wasm_f32x4_pmin(vacc0xCDEF, vmax);

    if XNN_LIKELY(nc >= 16) {
      wasm_v128_store(c0, vacc0x0123);
      wasm_v128_store(c0 + 4, vacc0x4567);
      wasm_v128_store(c0 + 8, vacc0x89AB);
      wasm_v128_store(c0 + 12, vacc0xCDEF);

      c0 = (float*) ((uintptr_t) c0 + cn_stride);

      // Rewind the indirection buffer for the next 16-column group.
      a = (const int8_t**restrict) ((uintptr_t) a - ks);

      nc -= 16;
    } else {
      // Tail: emit 8/4/2/1 columns, shifting surviving lanes down.
      if (nc & 8) {
        wasm_v128_store(c0, vacc0x0123);
        vacc0x0123 = vacc0x89AB;
        c0 += 4;
        wasm_v128_store(c0, vacc0x4567);
        vacc0x4567 = vacc0xCDEF;
        c0 += 4;
      }
      if (nc & 4) {
        wasm_v128_store(c0, vacc0x0123);
        vacc0x0123 = vacc0x4567;
        c0 += 4;
      }
      if (nc & 2) {
        wasm_v128_store64_lane(c0, vacc0x0123, 0);
        vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1);
        c0 += 2;
      }
      if (nc & 1) {
        wasm_v128_store32_lane(c0, vacc0x0123, 0);
      }
      nc = 0;
    }
  } while (nc != 0);
}
!= 4) { + c3 = c2; + } + + const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); + const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); + XNN_FORCE_REALIZATION(vmin); + XNN_FORCE_REALIZATION(vmax); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vksum0123 = wasm_v128_load((const int32_t*) w); + v128_t vksum4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vksum89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vksumCDEF = wasm_v128_load((const int32_t*) w + 12); + const v128_t vinput_zero_point = wasm_i32x4_splat((int32_t) quantization_params->zero_point + 128); + v128_t vacc0x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point); + v128_t vacc0x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point); + v128_t vacc0x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point); + v128_t vacc0x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point); + v128_t vacc1x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point); + v128_t vacc1x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point); + v128_t vacc1x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point); + v128_t vacc1x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point); + v128_t vacc2x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point); + v128_t vacc2x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point); + v128_t vacc2x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point); + v128_t vacc2x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point); + v128_t vacc3x0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point); + v128_t vacc3x0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point); + v128_t vacc3x0x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point); + v128_t vacc3x0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point); + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } else { + a0 = zero_data; 
+ } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } else { + a1 = zero_data; + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } else { + a2 = zero_data; + } + const int8_t* restrict a3 = a[3]; + if XNN_UNPREDICTABLE(a3 != zero) { + a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); + } else { + a3 = zero_data; + } + a += 4; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + v128_t va3x0x0123 = wasm_v128_load32_splat(a3); + v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4); + a3 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + va3x0x0123 = wasm_v128_xor(va3x0x0123, vsign_mask); + va3x1x0123 = wasm_v128_xor(va3x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123); + vacc1x0x4567 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + va3x0123 = wasm_v128_xor(va3x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 4 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = 
wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123); + vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567); + vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB); + vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF); + + const v128_t vinput_scale = wasm_v128_load32_splat(&quantization_params->inv_scale); + + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vinput_scale); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vinput_scale); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vinput_scale); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vinput_scale); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vinput_scale); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vinput_scale); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vinput_scale); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vinput_scale); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vinput_scale); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vinput_scale); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vinput_scale); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vinput_scale); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vinput_scale); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vinput_scale); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vinput_scale); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vinput_scale); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vfilter_output_scale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vfilter_output_scale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vfilter_output_scale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vfilter_output_scaleCDEF); + vacc1x0x0123 
= wasm_f32x4_mul(vacc1x0x0123, vfilter_output_scale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vfilter_output_scale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vfilter_output_scale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vfilter_output_scaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vfilter_output_scale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vfilter_output_scale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vfilter_output_scale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vfilter_output_scaleCDEF); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vfilter_output_scale0123); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vfilter_output_scale4567); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vfilter_output_scale89AB); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vfilter_output_scaleCDEF); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbiasCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vbias0123); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vbias4567); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vbias89AB); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vbiasCDEF); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vbias0123); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vbias4567); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vbias89AB); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vbiasCDEF); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vbias0123); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vbias4567); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vbias89AB); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vbiasCDEF); + vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vbias0123); + vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vbias4567); + vacc3x0x89AB = 
wasm_f32x4_add(vacc3x0x89AB, vbias89AB); + vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vbiasCDEF); + + vacc0x0x0123 = wasm_f32x4_pmax(vacc0x0x0123, vmin); + vacc0x0x4567 = wasm_f32x4_pmax(vacc0x0x4567, vmin); + vacc0x0x89AB = wasm_f32x4_pmax(vacc0x0x89AB, vmin); + vacc0x0xCDEF = wasm_f32x4_pmax(vacc0x0xCDEF, vmin); + vacc1x0x0123 = wasm_f32x4_pmax(vacc1x0x0123, vmin); + vacc1x0x4567 = wasm_f32x4_pmax(vacc1x0x4567, vmin); + vacc1x0x89AB = wasm_f32x4_pmax(vacc1x0x89AB, vmin); + vacc1x0xCDEF = wasm_f32x4_pmax(vacc1x0xCDEF, vmin); + vacc2x0x0123 = wasm_f32x4_pmax(vacc2x0x0123, vmin); + vacc2x0x4567 = wasm_f32x4_pmax(vacc2x0x4567, vmin); + vacc2x0x89AB = wasm_f32x4_pmax(vacc2x0x89AB, vmin); + vacc2x0xCDEF = wasm_f32x4_pmax(vacc2x0xCDEF, vmin); + vacc3x0x0123 = wasm_f32x4_pmax(vacc3x0x0123, vmin); + vacc3x0x4567 = wasm_f32x4_pmax(vacc3x0x4567, vmin); + vacc3x0x89AB = wasm_f32x4_pmax(vacc3x0x89AB, vmin); + vacc3x0xCDEF = wasm_f32x4_pmax(vacc3x0xCDEF, vmin); + + vacc0x0x0123 = wasm_f32x4_pmin(vacc0x0x0123, vmax); + vacc0x0x4567 = wasm_f32x4_pmin(vacc0x0x4567, vmax); + vacc0x0x89AB = wasm_f32x4_pmin(vacc0x0x89AB, vmax); + vacc0x0xCDEF = wasm_f32x4_pmin(vacc0x0xCDEF, vmax); + vacc1x0x0123 = wasm_f32x4_pmin(vacc1x0x0123, vmax); + vacc1x0x4567 = wasm_f32x4_pmin(vacc1x0x4567, vmax); + vacc1x0x89AB = wasm_f32x4_pmin(vacc1x0x89AB, vmax); + vacc1x0xCDEF = wasm_f32x4_pmin(vacc1x0xCDEF, vmax); + vacc2x0x0123 = wasm_f32x4_pmin(vacc2x0x0123, vmax); + vacc2x0x4567 = wasm_f32x4_pmin(vacc2x0x4567, vmax); + vacc2x0x89AB = wasm_f32x4_pmin(vacc2x0x89AB, vmax); + vacc2x0xCDEF = wasm_f32x4_pmin(vacc2x0xCDEF, vmax); + vacc3x0x0123 = wasm_f32x4_pmin(vacc3x0x0123, vmax); + vacc3x0x4567 = wasm_f32x4_pmin(vacc3x0x4567, vmax); + vacc3x0x89AB = wasm_f32x4_pmin(vacc3x0x89AB, vmax); + vacc3x0xCDEF = wasm_f32x4_pmin(vacc3x0xCDEF, vmax); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store(c3, vacc3x0x0123); + wasm_v128_store(c3 + 4, vacc3x0x4567); + wasm_v128_store(c3 + 8, vacc3x0x89AB); + 
wasm_v128_store(c3 + 12, vacc3x0xCDEF); + wasm_v128_store(c2, vacc2x0x0123); + wasm_v128_store(c2 + 4, vacc2x0x4567); + wasm_v128_store(c2 + 8, vacc2x0x89AB); + wasm_v128_store(c2 + 12, vacc2x0xCDEF); + wasm_v128_store(c1, vacc1x0x0123); + wasm_v128_store(c1 + 4, vacc1x0x4567); + wasm_v128_store(c1 + 8, vacc1x0x89AB); + wasm_v128_store(c1 + 12, vacc1x0xCDEF); + wasm_v128_store(c0, vacc0x0x0123); + wasm_v128_store(c0 + 4, vacc0x0x4567); + wasm_v128_store(c0 + 8, vacc0x0x89AB); + wasm_v128_store(c0 + 12, vacc0x0xCDEF); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + c3 = (float*) ((uintptr_t) c3 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store(c3, vacc3x0x0123); + vacc3x0x0123 = vacc3x0x89AB; + c3 += 4; + wasm_v128_store(c3, vacc3x0x4567); + vacc3x0x4567 = vacc3x0xCDEF; + c3 += 4; + wasm_v128_store(c2, vacc2x0x0123); + vacc2x0x0123 = vacc2x0x89AB; + c2 += 4; + wasm_v128_store(c2, vacc2x0x4567); + vacc2x0x4567 = vacc2x0xCDEF; + c2 += 4; + wasm_v128_store(c1, vacc1x0x0123); + vacc1x0x0123 = vacc1x0x89AB; + c1 += 4; + wasm_v128_store(c1, vacc1x0x4567); + vacc1x0x4567 = vacc1x0xCDEF; + c1 += 4; + wasm_v128_store(c0, vacc0x0x0123); + vacc0x0x0123 = vacc0x0x89AB; + c0 += 4; + wasm_v128_store(c0, vacc0x0x4567); + vacc0x0x4567 = vacc0x0xCDEF; + c0 += 4; + } + if (nc & 4) { + wasm_v128_store(c3, vacc3x0x0123); + vacc3x0x0123 = vacc3x0x4567; + c3 += 4; + wasm_v128_store(c2, vacc2x0x0123); + vacc2x0x0123 = vacc2x0x4567; + c2 += 4; + wasm_v128_store(c1, vacc1x0x0123); + vacc1x0x0123 = vacc1x0x4567; + c1 += 4; + wasm_v128_store(c0, vacc0x0x0123); + vacc0x0x0123 = vacc0x0x4567; + c0 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c3, vacc3x0x0123, 0); + vacc3x0x0123 = wasm_v64x2_shuffle(vacc3x0x0123, vacc3x0x0123, 1, 1); + c3 += 2; + wasm_v128_store64_lane(c2, vacc2x0x0123, 0); + vacc2x0x0123 = 
wasm_v64x2_shuffle(vacc2x0x0123, vacc2x0x0123, 1, 1); + c2 += 2; + wasm_v128_store64_lane(c1, vacc1x0x0123, 0); + vacc1x0x0123 = wasm_v64x2_shuffle(vacc1x0x0123, vacc1x0x0123, 1, 1); + c1 += 2; + wasm_v128_store64_lane(c0, vacc0x0x0123, 0); + vacc0x0x0123 = wasm_v64x2_shuffle(vacc0x0x0123, vacc0x0x0123, 1, 1); + c0 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c3, vacc3x0x0123, 0); + wasm_v128_store32_lane(c2, vacc2x0x0123, 0); + wasm_v128_store32_lane(c1, vacc1x0x0123, 0); + wasm_v128_store32_lane(c0, vacc0x0x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c new file mode 100644 index 00000000000..c43da1fd7c2 --- /dev/null +++ b/src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c4-minmax-wasmusdot.c @@ -0,0 +1,375 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const int8_t* zero_data, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (4 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + float* c0 = c; + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + float* c3 = (float*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); + const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); + XNN_FORCE_REALIZATION(vmin); + XNN_FORCE_REALIZATION(vmax); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vksum0123 = wasm_v128_load((const int32_t*) w); + v128_t vksum4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vksum89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vksumCDEF = wasm_v128_load((const int32_t*) w + 12); + const v128_t vinput_zero_point = wasm_i32x4_splat((int32_t) quantization_params->zero_point + 128); + v128_t vacc0x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point); + v128_t vacc0x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point); + v128_t vacc0x89AB = 
wasm_i32x4_mul(vksum89AB, vinput_zero_point); + v128_t vacc0xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point); + v128_t vacc1x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point); + v128_t vacc1x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point); + v128_t vacc1x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point); + v128_t vacc1xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point); + v128_t vacc2x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point); + v128_t vacc2x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point); + v128_t vacc2x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point); + v128_t vacc2xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point); + v128_t vacc3x0123 = wasm_i32x4_mul(vksum0123, vinput_zero_point); + v128_t vacc3x4567 = wasm_i32x4_mul(vksum4567, vinput_zero_point); + v128_t vacc3x89AB = wasm_i32x4_mul(vksum89AB, vinput_zero_point); + v128_t vacc3xCDEF = wasm_i32x4_mul(vksumCDEF, vinput_zero_point); + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } else { + a0 = zero_data; + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } else { + a1 = zero_data; + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } else { + a2 = zero_data; + } + const int8_t* restrict a3 = a[3]; + if XNN_UNPREDICTABLE(a3 != zero) { + a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); + } else { + a3 = zero_data; + } + a += 4; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, 
vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + va3x0123 = wasm_v128_xor(va3x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123); + vacc1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567); + vacc1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB); + vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF); + vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123); + vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x4567); + vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB); + vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF); + vacc3x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0123); + vacc3x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x4567); + vacc3x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x89AB); + vacc3xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 4 * sizeof(void*); + } while (p != 0); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); 
+ vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB); + vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB); + vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF); + vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); + vacc3x4567 = wasm_f32x4_convert_i32x4(vacc3x4567); + vacc3x89AB = wasm_f32x4_convert_i32x4(vacc3x89AB); + vacc3xCDEF = wasm_f32x4_convert_i32x4(vacc3xCDEF); + + const v128_t vinput_scale = wasm_v128_load32_splat(&quantization_params->inv_scale); + + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vinput_scale); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vinput_scale); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vinput_scale); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vinput_scale); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vinput_scale); + vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vinput_scale); + vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vinput_scale); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vinput_scale); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vinput_scale); + vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vinput_scale); + vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vinput_scale); + vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vinput_scale); + vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vinput_scale); + vacc3x89AB = wasm_f32x4_mul(vacc3x89AB, vinput_scale); + vacc3xCDEF = wasm_f32x4_mul(vacc3xCDEF, vinput_scale); + + const v128_t vfilter_output_scale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vfilter_output_scale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t 
vfilter_output_scaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vfilter_output_scale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vfilter_output_scale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vfilter_output_scaleCDEF); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vfilter_output_scale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vfilter_output_scale4567); + vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vfilter_output_scale89AB); + vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vfilter_output_scaleCDEF); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vfilter_output_scale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vfilter_output_scale4567); + vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vfilter_output_scale89AB); + vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vfilter_output_scaleCDEF); + vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vfilter_output_scale0123); + vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vfilter_output_scale4567); + vacc3x89AB = wasm_f32x4_mul(vacc3x89AB, vfilter_output_scale89AB); + vacc3xCDEF = wasm_f32x4_mul(vacc3xCDEF, vfilter_output_scaleCDEF); + + const v128_t vbias0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbias89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vbiasCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vbias4567); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vbias89AB); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vbiasCDEF); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vbias0123); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vbias4567); + vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vbias89AB); + vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vbiasCDEF); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vbias0123); + vacc2x4567 = 
wasm_f32x4_add(vacc2x4567, vbias4567); + vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vbias89AB); + vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vbiasCDEF); + vacc3x0123 = wasm_f32x4_add(vacc3x0123, vbias0123); + vacc3x4567 = wasm_f32x4_add(vacc3x4567, vbias4567); + vacc3x89AB = wasm_f32x4_add(vacc3x89AB, vbias89AB); + vacc3xCDEF = wasm_f32x4_add(vacc3xCDEF, vbiasCDEF); + + vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin); + vacc0x4567 = wasm_f32x4_pmax(vacc0x4567, vmin); + vacc0x89AB = wasm_f32x4_pmax(vacc0x89AB, vmin); + vacc0xCDEF = wasm_f32x4_pmax(vacc0xCDEF, vmin); + vacc1x0123 = wasm_f32x4_pmax(vacc1x0123, vmin); + vacc1x4567 = wasm_f32x4_pmax(vacc1x4567, vmin); + vacc1x89AB = wasm_f32x4_pmax(vacc1x89AB, vmin); + vacc1xCDEF = wasm_f32x4_pmax(vacc1xCDEF, vmin); + vacc2x0123 = wasm_f32x4_pmax(vacc2x0123, vmin); + vacc2x4567 = wasm_f32x4_pmax(vacc2x4567, vmin); + vacc2x89AB = wasm_f32x4_pmax(vacc2x89AB, vmin); + vacc2xCDEF = wasm_f32x4_pmax(vacc2xCDEF, vmin); + vacc3x0123 = wasm_f32x4_pmax(vacc3x0123, vmin); + vacc3x4567 = wasm_f32x4_pmax(vacc3x4567, vmin); + vacc3x89AB = wasm_f32x4_pmax(vacc3x89AB, vmin); + vacc3xCDEF = wasm_f32x4_pmax(vacc3xCDEF, vmin); + + vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax); + vacc0x4567 = wasm_f32x4_pmin(vacc0x4567, vmax); + vacc0x89AB = wasm_f32x4_pmin(vacc0x89AB, vmax); + vacc0xCDEF = wasm_f32x4_pmin(vacc0xCDEF, vmax); + vacc1x0123 = wasm_f32x4_pmin(vacc1x0123, vmax); + vacc1x4567 = wasm_f32x4_pmin(vacc1x4567, vmax); + vacc1x89AB = wasm_f32x4_pmin(vacc1x89AB, vmax); + vacc1xCDEF = wasm_f32x4_pmin(vacc1xCDEF, vmax); + vacc2x0123 = wasm_f32x4_pmin(vacc2x0123, vmax); + vacc2x4567 = wasm_f32x4_pmin(vacc2x4567, vmax); + vacc2x89AB = wasm_f32x4_pmin(vacc2x89AB, vmax); + vacc2xCDEF = wasm_f32x4_pmin(vacc2xCDEF, vmax); + vacc3x0123 = wasm_f32x4_pmin(vacc3x0123, vmax); + vacc3x4567 = wasm_f32x4_pmin(vacc3x4567, vmax); + vacc3x89AB = wasm_f32x4_pmin(vacc3x89AB, vmax); + vacc3xCDEF = wasm_f32x4_pmin(vacc3xCDEF, vmax); + + if XNN_LIKELY(nc >= 16) { + 
wasm_v128_store(c3, vacc3x0123); + wasm_v128_store(c3 + 4, vacc3x4567); + wasm_v128_store(c3 + 8, vacc3x89AB); + wasm_v128_store(c3 + 12, vacc3xCDEF); + wasm_v128_store(c2, vacc2x0123); + wasm_v128_store(c2 + 4, vacc2x4567); + wasm_v128_store(c2 + 8, vacc2x89AB); + wasm_v128_store(c2 + 12, vacc2xCDEF); + wasm_v128_store(c1, vacc1x0123); + wasm_v128_store(c1 + 4, vacc1x4567); + wasm_v128_store(c1 + 8, vacc1x89AB); + wasm_v128_store(c1 + 12, vacc1xCDEF); + wasm_v128_store(c0, vacc0x0123); + wasm_v128_store(c0 + 4, vacc0x4567); + wasm_v128_store(c0 + 8, vacc0x89AB); + wasm_v128_store(c0 + 12, vacc0xCDEF); + + c0 = (float*) ((uintptr_t) c0 + cn_stride); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + c3 = (float*) ((uintptr_t) c3 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store(c3, vacc3x0123); + vacc3x0123 = vacc3x89AB; + c3 += 4; + wasm_v128_store(c3, vacc3x4567); + vacc3x4567 = vacc3xCDEF; + c3 += 4; + wasm_v128_store(c2, vacc2x0123); + vacc2x0123 = vacc2x89AB; + c2 += 4; + wasm_v128_store(c2, vacc2x4567); + vacc2x4567 = vacc2xCDEF; + c2 += 4; + wasm_v128_store(c1, vacc1x0123); + vacc1x0123 = vacc1x89AB; + c1 += 4; + wasm_v128_store(c1, vacc1x4567); + vacc1x4567 = vacc1xCDEF; + c1 += 4; + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x89AB; + c0 += 4; + wasm_v128_store(c0, vacc0x4567); + vacc0x4567 = vacc0xCDEF; + c0 += 4; + } + if (nc & 4) { + wasm_v128_store(c3, vacc3x0123); + vacc3x0123 = vacc3x4567; + c3 += 4; + wasm_v128_store(c2, vacc2x0123); + vacc2x0123 = vacc2x4567; + c2 += 4; + wasm_v128_store(c1, vacc1x0123); + vacc1x0123 = vacc1x4567; + c1 += 4; + wasm_v128_store(c0, vacc0x0123); + vacc0x0123 = vacc0x4567; + c0 += 4; + } + if (nc & 2) { + wasm_v128_store64_lane(c3, vacc3x0123, 0); + vacc3x0123 = wasm_v64x2_shuffle(vacc3x0123, vacc3x0123, 1, 1); + c3 += 2; + wasm_v128_store64_lane(c2, vacc2x0123, 0); + vacc2x0123 = 
wasm_v64x2_shuffle(vacc2x0123, vacc2x0123, 1, 1); + c2 += 2; + wasm_v128_store64_lane(c1, vacc1x0123, 0); + vacc1x0123 = wasm_v64x2_shuffle(vacc1x0123, vacc1x0123, 1, 1); + c1 += 2; + wasm_v128_store64_lane(c0, vacc0x0123, 0); + vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1); + c0 += 2; + } + if (nc & 1) { + wasm_v128_store32_lane(c3, vacc3x0123, 0); + wasm_v128_store32_lane(c2, vacc2x0123, 0); + wasm_v128_store32_lane(c1, vacc1x0123, 0); + wasm_v128_store32_lane(c0, vacc0x0123, 0); + } + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-gemm/MRx16c4-wasmdot.c.in b/src/qs8-gemm/MRx16c4-wasmdot.c.in new file mode 100644 index 00000000000..26f7ed65248 --- /dev/null +++ b/src/qs8-gemm/MRx16c4-wasmdot.c.in @@ -0,0 +1,356 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert DATATYPE in ["QC8", "QD8", "QS8"] +$assert DATATYPE == "QD8" or REQUANTIZATION == "FP32" +$assert DATATYPE != "QD8" or not REQUANTIZATION +$assert NR == 16 +$assert (SDOT == 0 and (DATATYPE in ["QC8", "QD8"])) or SDOT == 1 +$UNROLL = locals().get("UNROLL", 1) +$assert ACCUMULATORS == UNROLL or ACCUMULATORS == 1 +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" +$DATATYPE_SPEC = {"QC8": "qs8_qc8w", "QD8": "qd8_f32_qc8w", "QS8": "qs8"}[DATATYPE] +$REQUANTIZATION_SPEC = "_" + REQUANTIZATION.lower() if REQUANTIZATION else "" +$PARAMS_TYPE = {"QC8": "union xnn_qs8_qc8w_conv_minmax_params", "QD8": "union xnn_f32_minmax_params", "QS8": "union xnn_qs8_conv_minmax_params"}[DATATYPE] +$OUT_T = "float" if DATATYPE == "QD8" else "int8_t" +$def VACC(M,K=0): +$ return f"vacc{M}x{K}" if UNROLL > 1 else f"vacc{M}" +$ACC_POSTFIX=f"_acc{ACCUMULATORS}" if ACCUMULATORS > 1 else "" +void xnn_${DATATYPE_SPEC}_gemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x${NR}c4__wasm${"sdot" if SDOT else 
"usdot"}${"_u" + str(UNROLL) if UNROLL > 1 else ""}${ACC_POSTFIX}( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + ${OUT_T}* restrict c, + size_t cm_stride, + size_t cn_stride, + $if DATATYPE == "QD8": + const ${PARAMS_TYPE} params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + $else: + const ${PARAMS_TYPE} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= ${MR}); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + ${OUT_T}* c0 = c; + $for M in range(1, MR): + const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride); + ${OUT_T}* c${M} = (${OUT_T}*) ((uintptr_t) c${M-1} + cm_stride); + $if M % 2 == 0: + if XNN_UNPREDICTABLE(mr <= ${M}) { + a${M} = a${M-1}; + c${M} = c${M-1}; + } + $elif M + 1 == MR: + if XNN_UNPREDICTABLE(mr != ${M+1}) { + a${M} = a${M-1}; + c${M} = c${M-1}; + } + $else: + if XNN_UNPREDICTABLE(mr < ${M+1}) { + a${M} = a${M-1}; + c${M} = c${M-1}; + } + + $if DATATYPE == "QD8": + const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); + const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); + XNN_FORCE_REALIZATION(vmin); + XNN_FORCE_REALIZATION(vmax); + $else: + $if DATATYPE != "QC8": + const v128_t vscale = wasm_v128_load32_splat(¶ms->fp32_scalar.scale); + XNN_FORCE_REALIZATION(vscale); + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = 
wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + $if not SDOT: + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + $if DATATYPE == "QD8": + v128_t vksum${ABC[0:4]} = wasm_v128_load((const int32_t*) w); + $for N in range(4, NR, 4): + v128_t vksum${ABC[N:N+4]} = wasm_v128_load((const int32_t*) w + ${N}); + $for M in range(MR): + $if SDOT: + const v128_t vinput_zero_point${M} = wasm_v128_load32_splat(&quantization_params[${M}].zero_point); + $else: + const v128_t vinput_zero_point${M} = wasm_i32x4_splat((int32_t) quantization_params[${M}].zero_point + 128); + $for N in range(0, NR, 4): + v128_t ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_mul(vksum${ABC[N:N+4]}, vinput_zero_point${M}); + $if ACCUMULATORS > 1: + $for K in range(1, UNROLL): + v128_t ${VACC(M, K)}x${ABC[N:N+4]} = wasm_u32x4_const(0, 0, 0, 0); + $else: + v128_t ${VACC(0)}x0123 = wasm_v128_load(w); + $for N in range(4, NR, 4): + v128_t ${VACC(0)}x${ABC[N:N+4]} = wasm_v128_load((const int32_t*) w + ${N}); + $if ACCUMULATORS > 1: + $for K in range(1, UNROLL): + $for N in range(0, NR, 4): + v128_t ${VACC(0, K)}x${ABC[N:N+4]} = wasm_u32x4_const(0, 0, 0, 0); + $for M in range(1, MR, 1): + $for N in range(0, NR, 4): + $if ACCUMULATORS > 1: + $for K in range(UNROLL): + v128_t ${VACC(M, K)}x${ABC[N:N+4]}= ${VACC(0, K)}x${ABC[N:N+4]}; + $else: + v128_t ${VACC(M)}x${ABC[N:N+4]}= ${VACC(0)}x${ABC[N:N+4]}; + w = (const int32_t*) w + ${NR}; + + size_t k = kc; + $if UNROLL > 1: + while (k >= ${UNROLL * 4} * sizeof(int8_t)) { + $for M in range(MR): + v128_t va${M}x0x0123 = wasm_v128_load32_splat(a${M}); + $for K in range(1, UNROLL): + v128_t va${M}x${K}x0123 = 
wasm_v128_load32_splat(a${M} + ${4 * K}); + a${M} += ${4 * UNROLL}; + + $if not SDOT: + $for M in range(MR): + $for K in range(UNROLL): + va${M}x${K}x0123 = wasm_v128_xor(va${M}x${K}x0123, vsign_mask); + + $for K in range(UNROLL): + $for N in range(0, NR, 4): + $if N == 0 and K == 0: + const v128_t vb${K}x${ABC[N:N+4]} = wasm_v128_load((const int8_t*) w); + $else: + const v128_t vb${K}x${ABC[N:N+4]} = wasm_v128_load((const int8_t*) w + ${4 * N + 4 * NR * K}); + + $if ACCUMULATORS > 1: + $for K in range(UNROLL): + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M, K)}x${ABC[N:N+4]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb${K}x${ABC[N:N+4]}, va${M}x${K}x0123, ${VACC(M, K)}x${ABC[N:N+4]}); + $else: + $for K in range(UNROLL): + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb${K}x${ABC[N:N+4]}, va${M}x${K}x0123, ${VACC(M)}x${ABC[N:N+4]}); + + w = (const int8_t*) w + ${NR * 4 * UNROLL}; + k -= ${UNROLL * 4} * sizeof(int8_t); + } + $if ACCUMULATORS > 1: + $PAIRS = [(i,) for i in range(UNROLL)] + $while len(PAIRS) > 1: + $TPLS=[PAIRS[i:i+2] for i in range(0, len(PAIRS), 2)] + $PAIRS = [(P1[0],P2[0]) for P1, P2 in TPLS] + $for K1, K2 in PAIRS: + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_add(${VACC(M, K1)}x${ABC[N:N+4]}, ${VACC(M, K2)}x${ABC[N:N+4]}); + + while (k != 0) { + $for M in range(MR): + v128_t va${M}x0123 = wasm_v128_load32_splat(a${M}); + a${M} += 4; + + $if not SDOT: + $for M in range(MR): + va${M}x0123 = wasm_v128_xor(va${M}x0123, vsign_mask); + + $for N in range(0, NR, 4): + $if N == 0: + const v128_t vb${ABC[N:N+4]} = wasm_v128_load((const int8_t*) w); + $else: + const v128_t vb${ABC[N:N+4]} = wasm_v128_load((const int8_t*) w + ${4 * N}); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb${ABC[N:N+4]}, va${M}x0123, ${VACC(M)}x${ABC[N:N+4]}); + + w = 
(const int8_t*) w + ${NR * 4}; + k -= 4 * sizeof(int8_t); + } + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_convert_i32x4(${VACC(M)}x${ABC[N:N+4]}); + + $if DATATYPE == "QD8": + $for M in range(MR): + const v128_t vinput_scale${M} = wasm_v128_load32_splat(&quantization_params[${M}].inv_scale); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_mul(${VACC(M)}x${ABC[N:N+4]}, vinput_scale${M}); + + $for N in range(0, NR, 4): + const v128_t vfilter_output_scale${ABC[N:N+4]} = wasm_v128_load(w); + w = (const float*) w + 4; + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_mul(${VACC(M)}x${ABC[N:N+4]}, vfilter_output_scale${ABC[N:N+4]}); + + $for N in range(0, NR, 4): + const v128_t vbias${ABC[N:N+4]} = wasm_v128_load(w); + w = (const float*) w + 4; + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_add(${VACC(M)}x${ABC[N:N+4]}, vbias${ABC[N:N+4]}); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_pmax(${VACC(M)}x${ABC[N:N+4]}, vmin); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_pmin(${VACC(M)}x${ABC[N:N+4]}, vmax); + + if XNN_LIKELY(nc >= ${NR}) { + $for M in range(MR): + wasm_v128_store(c${M}, ${VACC(M)}x${ABC[0:4]}); + $for N in range(4, NR, 4): + wasm_v128_store(c${M} + ${N}, ${VACC(M)}x${ABC[N:N+4]}); + + $for M in range(MR): + a${M} = (const int8_t*) ((uintptr_t) a${M} - kc); + + $for M in range(MR): + c${M} = (float*) ((uintptr_t) c${M} + cn_stride); + + nc -= ${NR}; + } else { + $if NR == 16: + if (nc & 8) { + $for M in range(MR): + $for N in range(0, 8, 4): + wasm_v128_store(c${M}, ${VACC(M)}x${ABC[N:N+4]}); + ${VACC(M)}x${ABC[N:N+4]} = ${VACC(M)}x${ABC[N+8:N+12]}; + c${M} += 4; + } + if (nc & 4) { + $for M in range(MR): + wasm_v128_store(c${M}, ${VACC(M)}x0123); + ${VACC(M)}x0123 = ${VACC(M)}x4567; 
+ c${M} += 4; + } + if (nc & 2) { + $for M in range(MR): + wasm_v128_store64_lane(c${M}, ${VACC(M)}x0123, 0); + ${VACC(M)}x0123 = wasm_v64x2_shuffle(${VACC(M)}x0123, ${VACC(M)}x0123, 1, 1); + c${M} += 2; + } + if (nc & 1) { + $for M in range(MR): + wasm_v128_store32_lane(c${M}, ${VACC(M)}x0123, 0); + } + nc = 0; + } + $else: + $if DATATYPE == "QC8": + $for N in range(0, NR, 4): + const v128_t vscale${ABC[N:N+4]} = wasm_v128_load(w); + w = (const float*) w + 4; + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_mul(${VACC(M)}x${ABC[N:N+4]}, vscale${ABC[N:N+4]}); + $else: + $for M in range(MR): + ${VACC(M)}x0123 = wasm_f32x4_mul(${VACC(M)}x0123, vscale); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_add(${VACC(M)}x${ABC[N:N+4]}, vmagic_bias); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_max(${VACC(M)}x${ABC[N:N+4]}, vmagic_min); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_sub(${VACC(M)}x${ABC[N:N+4]}, vmagic_bias_less_output_zero_point); + + $for M in range(MR): + $for N in range(0, NR, 8): + v128_t vacc${M}x${ABC[N:N+8]} = wasm_i16x8_narrow_i32x4(${VACC(M)}x${ABC[N:N+4]}, ${VACC(M)}x${ABC[N+4:N+8]}); + + $for M in range(0, MR): + $for N in range(0, NR, 8): + vacc${M}x${ABC[N:N+8]} = wasm_i8x16_narrow_i16x8(vacc${M}x${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]}); + + $for M in range(MR): + $for N in range(0, NR, 8): + vacc${M}x${ABC[N:N+8]} = wasm_i8x16_min(vacc${M}x${ABC[N:N+8]}, voutput_max); + + if XNN_LIKELY(nc >= ${NR}) { + $for M in range(MR): + wasm_v128_store64_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + $for N in range(8, NR, 8): + wasm_v128_store64_lane(c${M} + ${N}, vacc${M}x${ABC[N:N+8]}, 0); + + $for M in range(MR): + c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride); + + $for M in range(MR): + a${M} = (const int8_t*) ((uintptr_t) a${M} - kc); + + nc -= ${NR}; + } else { + $if NR 
== 16: + if (nc & 8) { + $for M in range(MR): + wasm_v128_store64_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + c${M} += 8; + + $for M in range(MR): + vacc${M}x${ABC[0:8]} = vacc${M}x${ABC[8:16]}; + } + if (nc & 4) { + $for M in range(MR): + wasm_v128_store32_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + c${M} += 4; + + $for M in range(MR): + vacc${M}x${ABC[0:8]} = wasm_u64x2_shr(vacc${M}x${ABC[0:8]}, 32); + } + if (nc & 2) { + $for M in range(MR): + wasm_v128_store16_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + c${M} += 2; + + $for M in range(MR): + vacc${M}x${ABC[0:8]} = wasm_u32x4_shr(vacc${M}x${ABC[0:8]}, 16); + } + if (nc & 1) { + $for M in range(MR): + wasm_v128_store8_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-igemm/MRx16c4-wasmdot.c.in b/src/qs8-igemm/MRx16c4-wasmdot.c.in new file mode 100644 index 00000000000..d8afd8cd893 --- /dev/null +++ b/src/qs8-igemm/MRx16c4-wasmdot.c.in @@ -0,0 +1,366 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert DATATYPE in ["QC8", "QD8", "QS8"] +$assert DATATYPE == "QD8" or REQUANTIZATION == "FP32" +$assert DATATYPE != "QD8" or not REQUANTIZATION +$assert NR == 16 +$UNROLL = locals().get("UNROLL", 1) +$assert ACCUMULATORS == UNROLL or ACCUMULATORS == 1 +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUV" +$DATATYPE_SPEC = {"QC8": "qs8_qc8w", "QD8": "qd8_f32_qc8w", "QS8": "qs8"}[DATATYPE] +$REQUANTIZATION_SPEC = "_" + REQUANTIZATION.lower() if REQUANTIZATION else "" +$PARAMS_TYPE = {"QC8": "union xnn_qs8_qc8w_conv_minmax_params", "QD8": "union xnn_f32_minmax_params", "QS8": "union xnn_qs8_conv_minmax_params"}[DATATYPE] +$OUT_T = "float" if DATATYPE == "QD8" else "int8_t" +$def VACC(M,K=0): +$ return f"vacc{M}x{K}" if UNROLL > 1 else f"vacc{M}" +$ACC_POSTFIX=f"_acc{ACCUMULATORS}" if ACCUMULATORS > 1 else "" +void xnn_${DATATYPE_SPEC}_igemm_minmax${REQUANTIZATION_SPEC}_ukernel_${MR}x${NR}c4__wasm${"sdot" if SDOT else "usdot"}${"_u" + str(UNROLL) if UNROLL > 1 else ""}${ACC_POSTFIX}( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + ${OUT_T}* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + $if DATATYPE in ["QD8"]: + const int8_t* zero_data, + const ${PARAMS_TYPE} params[restrict XNN_MIN_ELEMENTS(1)], + const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS + $else: + const ${PARAMS_TYPE} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= ${MR}); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (${MR} * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + ${OUT_T}* c0 = c; + $for M in range(1, MR): + ${OUT_T}* c${M} = (${OUT_T}*) ((uintptr_t) c${M-1} + 
cm_stride); + $if M % 2 == 0: + if XNN_UNPREDICTABLE(mr <= ${M}) { + c${M} = c${M-1}; + } + $elif M + 1 == MR: + if XNN_UNPREDICTABLE(mr != ${M+1}) { + c${M} = c${M-1}; + } + $else: + if XNN_UNPREDICTABLE(mr < ${M+1}) { + c${M} = c${M-1}; + } + + $if DATATYPE == "QD8": + const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); + const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); + XNN_FORCE_REALIZATION(vmin); + XNN_FORCE_REALIZATION(vmax); + $else: + $if DATATYPE != "QC8": + const v128_t vscale = wasm_v128_load32_splat(¶ms->fp32_scalar.scale); + XNN_FORCE_REALIZATION(vscale); + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + $if not SDOT: + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + $if DATATYPE == "QD8": + v128_t vksum${ABC[0:4]} = wasm_v128_load((const int32_t*) w); + $for N in range(4, NR, 4): + v128_t vksum${ABC[N:N+4]} = wasm_v128_load((const int32_t*) w + ${N}); + $if SDOT: + const v128_t vinput_zero_point = wasm_v128_load32_splat(&quantization_params->zero_point); + $else: + const v128_t vinput_zero_point = wasm_i32x4_splat((int32_t) quantization_params->zero_point + 128); + $for M in range(MR): + $for N in range(0, NR, 4): + v128_t ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_mul(vksum${ABC[N:N+4]}, vinput_zero_point); + 
$if ACCUMULATORS > 1: + $for K in range(1, UNROLL): + v128_t ${VACC(M, K)}x${ABC[N:N+4]} = wasm_u32x4_const(0, 0, 0, 0); + $else: + v128_t ${VACC(0)}x0123 = wasm_v128_load(w); + $for N in range(4, NR, 4): + v128_t ${VACC(0)}x${ABC[N:N+4]} = wasm_v128_load((const int32_t*) w + ${N}); + $if ACCUMULATORS > 1: + $for K in range(1, UNROLL): + $for N in range(0, NR, 4): + v128_t ${VACC(0, K)}x${ABC[N:N+4]} = wasm_u32x4_const(0, 0, 0, 0); + $for M in range(1, MR, 1): + $for N in range(0, NR, 4): + $if ACCUMULATORS > 1: + $for K in range(UNROLL): + v128_t ${VACC(M, K)}x${ABC[N:N+4]}= ${VACC(0, K)}x${ABC[N:N+4]}; + $else: + v128_t ${VACC(M)}x${ABC[N:N+4]}= ${VACC(0)}x${ABC[N:N+4]}; + w = (const int32_t*) w + ${NR}; + + size_t p = ks; + do { + $for M in range(MR): + const int8_t* restrict a${M} = a[${M}]; + if XNN_UNPREDICTABLE(a${M} != zero) { + a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset); + $if DATATYPE == "QD8": + } else { + a${M} = zero_data; + } + a += ${MR}; + + size_t k = kc; + $if UNROLL > 1: + while (k >= ${UNROLL * 4} * sizeof(int8_t)) { + $for M in range(MR): + v128_t va${M}x0x0123 = wasm_v128_load32_splat(a${M}); + $for K in range(1, UNROLL): + v128_t va${M}x${K}x0123 = wasm_v128_load32_splat(a${M} + ${4 * K}); + a${M} += ${4 * UNROLL}; + + $if not SDOT: + $for M in range(MR): + $for K in range(UNROLL): + va${M}x${K}x0123 = wasm_v128_xor(va${M}x${K}x0123, vsign_mask); + + $for K in range(UNROLL): + $for N in range(0, NR, 4): + $if N == 0 and K == 0: + const v128_t vb${K}x${ABC[N:N+4]} = wasm_v128_load((const int8_t*) w); + $else: + const v128_t vb${K}x${ABC[N:N+4]} = wasm_v128_load((const int8_t*) w + ${4 * N + 4 * NR * K}); + + $if ACCUMULATORS > 1: + $for K in range(UNROLL): + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M, K)}x${ABC[N:N+4]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb${K}x${ABC[N:N+4]}, va${M}x${K}x0123, ${VACC(M, K)}x${ABC[N:N+4]}); + $else: + $for K in range(UNROLL): + $for M in range(MR): + $for N in range(0, NR, 4): 
+ ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb${K}x${ABC[N:N+4]}, va${M}x${K}x0123, ${VACC(M)}x${ABC[N:N+4]}); + + w = (const int8_t*) w + ${NR * 4 * UNROLL}; + k -= ${UNROLL * 4} * sizeof(int8_t); + } + $if ACCUMULATORS > 1: + $PAIRS = [(i,) for i in range(UNROLL)] + $while len(PAIRS) > 1: + $TPLS=[PAIRS[i:i+2] for i in range(0, len(PAIRS), 2)] + $PAIRS = [(P1[0],P2[0]) for P1, P2 in TPLS] + $for K1, K2 in PAIRS: + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_add(${VACC(M, K1)}x${ABC[N:N+4]}, ${VACC(M, K2)}x${ABC[N:N+4]}); + + while (k != 0) { + $for M in range(MR): + v128_t va${M}x0123 = wasm_v128_load32_splat(a${M}); + a${M} += 4; + + $if not SDOT: + $for M in range(MR): + va${M}x0123 = wasm_v128_xor(va${M}x0123, vsign_mask); + + $for N in range(0, NR, 4): + $if N == 0: + const v128_t vb${ABC[N:N+4]} = wasm_v128_load((const int8_t*) w); + $else: + const v128_t vb${ABC[N:N+4]} = wasm_v128_load((const int8_t*) w + ${4 * N}); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb${ABC[N:N+4]}, va${M}x0123, ${VACC(M)}x${ABC[N:N+4]}); + + w = (const int8_t*) w + ${NR * 4}; + k -= 4 * sizeof(int8_t); + } + + p -= ${MR} * sizeof(void*); + } while (p != 0); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_convert_i32x4(${VACC(M)}x${ABC[N:N+4]}); + + $if DATATYPE == "QD8": + const v128_t vinput_scale = wasm_v128_load32_splat(&quantization_params->inv_scale); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_mul(${VACC(M)}x${ABC[N:N+4]}, vinput_scale); + + $for N in range(0, NR, 4): + const v128_t vfilter_output_scale${ABC[N:N+4]} = wasm_v128_load(w); + w = (const float*) w + 4; + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_mul(${VACC(M)}x${ABC[N:N+4]}, vfilter_output_scale${ABC[N:N+4]}); + + $for N in 
range(0, NR, 4): + const v128_t vbias${ABC[N:N+4]} = wasm_v128_load(w); + w = (const float*) w + 4; + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_add(${VACC(M)}x${ABC[N:N+4]}, vbias${ABC[N:N+4]}); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_pmax(${VACC(M)}x${ABC[N:N+4]}, vmin); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_pmin(${VACC(M)}x${ABC[N:N+4]}, vmax); + + if XNN_LIKELY(nc >= ${NR}) { + $for M in reversed(range(MR)): + wasm_v128_store(c${M}, ${VACC(M)}x${ABC[0:4]}); + $for N in range(4, NR, 4): + wasm_v128_store(c${M} + ${N}, ${VACC(M)}x${ABC[N:N+4]}); + + $for M in range(MR): + c${M} = (float*) ((uintptr_t) c${M} + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= ${NR}; + } else { + $if NR == 16: + if (nc & 8) { + $for M in reversed(range(MR)): + $for N in range(0, 8, 4): + wasm_v128_store(c${M}, ${VACC(M)}x${ABC[N:N+4]}); + ${VACC(M)}x${ABC[N:N+4]} = ${VACC(M)}x${ABC[N+8:N+12]}; + c${M} += 4; + } + if (nc & 4) { + $for M in reversed(range(MR)): + wasm_v128_store(c${M}, ${VACC(M)}x0123); + ${VACC(M)}x0123 = ${VACC(M)}x4567; + c${M} += 4; + } + if (nc & 2) { + $for M in reversed(range(MR)): + wasm_v128_store64_lane(c${M}, ${VACC(M)}x0123, 0); + ${VACC(M)}x0123 = wasm_v64x2_shuffle(${VACC(M)}x0123, ${VACC(M)}x0123, 1, 1); + c${M} += 2; + } + if (nc & 1) { + $for M in reversed(range(MR)): + wasm_v128_store32_lane(c${M}, ${VACC(M)}x0123, 0); + } + nc = 0; + } + $else: + $if DATATYPE == "QC8": + $for N in range(0, NR, 4): + const v128_t vscale${ABC[N:N+4]} = wasm_v128_load(w); + w = (const float*) w + 4; + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_mul(${VACC(M)}x${ABC[N:N+4]}, vscale${ABC[N:N+4]}); + $else: + $for M in range(MR): + ${VACC(M)}x0123 = wasm_f32x4_mul(${VACC(M)}x0123, vscale); + + $for M in range(MR): + $for N in range(0, NR, 4): + 
${VACC(M)}x${ABC[N:N+4]} = wasm_f32x4_add(${VACC(M)}x${ABC[N:N+4]}, vmagic_bias); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_max(${VACC(M)}x${ABC[N:N+4]}, vmagic_min); + + $for M in range(MR): + $for N in range(0, NR, 4): + ${VACC(M)}x${ABC[N:N+4]} = wasm_i32x4_sub(${VACC(M)}x${ABC[N:N+4]}, vmagic_bias_less_output_zero_point); + + $for M in range(MR): + $for N in range(0, NR, 8): + v128_t vacc${M}x${ABC[N:N+8]} = wasm_i16x8_narrow_i32x4(${VACC(M)}x${ABC[N:N+4]}, ${VACC(M)}x${ABC[N+4:N+8]}); + + $for M in range(MR): + $for N in range(0, NR, 8): + vacc${M}x${ABC[N:N+8]} = wasm_i8x16_narrow_i16x8(vacc${M}x${ABC[N:N+8]}, vacc${M}x${ABC[N:N+8]}); + + $for M in range(MR): + $for N in range(0, NR, 8): + vacc${M}x${ABC[N:N+8]} = wasm_i8x16_min(vacc${M}x${ABC[N:N+8]}, voutput_max); + + if (nc >= ${NR}) { + $for M in reversed(range(MR)): + wasm_v128_store64_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + $for N in range(8, NR, 8): + wasm_v128_store64_lane(c${M} + ${N}, vacc${M}x${ABC[N:N+8]}, 0); + + $for M in reversed(range(MR)): + c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= ${NR}; + } else { + $if NR == 16: + if (nc & 8) { + $for M in reversed(range(MR)): + wasm_v128_store64_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + c${M} += 8; + + $for M in range(MR): + vacc${M}x${ABC[0:8]} = vacc${M}x${ABC[8:16]}; + } + if (nc & 4) { + $for M in reversed(range(MR)): + wasm_v128_store32_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + c${M} += 4; + + $for M in range(MR): + vacc${M}x${ABC[0:8]} = wasm_u64x2_shr(vacc${M}x${ABC[0:8]}, 32); + } + if (nc & 2) { + $for M in reversed(range(MR)): + wasm_v128_store16_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + c${M} += 2; + + $for M in range(MR): + vacc${M}x${ABC[0:8]} = wasm_u32x4_shr(vacc${M}x${ABC[0:8]}, 16); + } + if (nc & 1) { + $for M in reversed(range(MR)): + wasm_v128_store8_lane(c${M}, vacc${M}x${ABC[0:8]}, 0); + } + + nc = 0; + } + } while 
(nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c new file mode 100644 index 00000000000..455ee98d81e --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c @@ -0,0 +1,195 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + 
//XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = 
wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git 
a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2.c new file mode 100644 index 00000000000..bdafd15287a --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot-u2.c @@ -0,0 +1,187 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + 
+ do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const 
v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = 
wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot.c new file mode 100644 index 00000000000..88420060aeb --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmsdot.c @@ -0,0 +1,160 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + + do { + v128_t vacc0x0123 = wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12); + + w = (const int32_t*) w + 16; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = 
wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + + vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point); + vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = 
wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c new file mode 100644 index 00000000000..92949935263 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c @@ -0,0 +1,199 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + w = (const 
int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, 
vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, 
vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2.c new file mode 100644 index 00000000000..182f24d4b56 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot-u2.c @@ -0,0 +1,191 @@ +// Auto-generated file. Do not edit! 
+// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) 
w + 12); + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF 
= wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = 
wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot.c new file mode 100644 index 00000000000..81e725d9601 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c4-minmax-fp32-wasmusdot.c @@ -0,0 +1,162 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vacc0x0123 = wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12); + w = (const int32_t*) w + 16; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + 
const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + + vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, 
vmagic_bias_less_output_zero_point); + vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c new file mode 100644 index 00000000000..eed0518b5fe --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c @@ -0,0 +1,345 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = 
wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x1x0123= vacc0x1x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x1x4567= vacc0x1x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x1x89AB= vacc0x1x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc1x1xCDEF= vacc0x1xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x1x0123= vacc0x1x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x1x4567= vacc0x1x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x1x89AB= vacc0x1x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x1xCDEF= vacc0x1xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + 
vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + vacc1x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x1x0123); + vacc1x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x1x4567); + vacc1x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x1x89AB); + vacc1x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x1xCDEF); + vacc2x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x1x0123); + vacc2x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x1x4567); + vacc2x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x1x89AB); + vacc2x1xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + vacc1x0x0123 = wasm_i32x4_add(vacc1x0x0123, vacc1x1x0123); + vacc1x0x4567 = wasm_i32x4_add(vacc1x0x4567, vacc1x1x4567); + vacc1x0x89AB = wasm_i32x4_add(vacc1x0x89AB, vacc1x1x89AB); + vacc1x0xCDEF = wasm_i32x4_add(vacc1x0xCDEF, vacc1x1xCDEF); + vacc2x0x0123 = wasm_i32x4_add(vacc2x0x0123, vacc2x1x0123); + vacc2x0x4567 = wasm_i32x4_add(vacc2x0x4567, vacc2x1x4567); + vacc2x0x89AB = wasm_i32x4_add(vacc2x0x89AB, vacc2x1x89AB); + vacc2x0xCDEF = wasm_i32x4_add(vacc2x0xCDEF, vacc2x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, 
vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = 
wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + 
wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2.c 
b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2.c new file mode 100644 index 00000000000..05e1d6be5be --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot-u2.c @@ -0,0 +1,321 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) 
params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + 
vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB); + vacc2x0xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = 
wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + 
vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = 
wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + 
wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot.c new file mode 100644 index 00000000000..a30541b12f7 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmsdot.c @@ -0,0 +1,272 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + + do { + v128_t vacc0x0123 = wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w 
+ 12); + + v128_t vacc1x0123= vacc0x0123; + v128_t vacc1x4567= vacc0x4567; + v128_t vacc1x89AB= vacc0x89AB; + v128_t vacc1xCDEF= vacc0xCDEF; + v128_t vacc2x0123= vacc0x0123; + v128_t vacc2x4567= vacc0x4567; + v128_t vacc2x89AB= vacc0x89AB; + v128_t vacc2xCDEF= vacc0xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123); + vacc1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567); + vacc1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB); + vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF); + vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123); + vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x4567); + vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB); + vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = 
wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB); + vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB); + vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); + vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vscale89AB); + vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vscaleCDEF); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); + vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vscale89AB); + vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vmagic_bias); + vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vmagic_bias); + vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vmagic_bias); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); + vacc2x4567 = 
wasm_f32x4_add(vacc2x4567, vmagic_bias); + vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vmagic_bias); + vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vmagic_bias); + + vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); + vacc1x4567 = wasm_i32x4_max(vacc1x4567, vmagic_min); + vacc1x89AB = wasm_i32x4_max(vacc1x89AB, vmagic_min); + vacc1xCDEF = wasm_i32x4_max(vacc1xCDEF, vmagic_min); + vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); + vacc2x4567 = wasm_i32x4_max(vacc2x4567, vmagic_min); + vacc2x89AB = wasm_i32x4_max(vacc2x89AB, vmagic_min); + vacc2xCDEF = wasm_i32x4_max(vacc2xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point); + vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); + vacc1x4567 = wasm_i32x4_sub(vacc1x4567, vmagic_bias_less_output_zero_point); + vacc1x89AB = wasm_i32x4_sub(vacc1x89AB, vmagic_bias_less_output_zero_point); + vacc1xCDEF = wasm_i32x4_sub(vacc1xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); + vacc2x4567 = wasm_i32x4_sub(vacc2x4567, vmagic_bias_less_output_zero_point); + vacc2x89AB = wasm_i32x4_sub(vacc2x89AB, vmagic_bias_less_output_zero_point); + vacc2xCDEF = wasm_i32x4_sub(vacc2xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0123, vacc1x4567); + 
v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x89AB, vacc1xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x89AB, vacc2xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = 
vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c new file mode 100644 index 00000000000..11a35bdb3a7 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c @@ -0,0 +1,355 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); 
+ v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x1x0123= vacc0x1x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x1x4567= vacc0x1x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x1x89AB= vacc0x1x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc1x1xCDEF= vacc0x1xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x1x0123= vacc0x1x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x1x4567= vacc0x1x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x1x89AB= vacc0x1x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x1xCDEF= vacc0x1xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = 
wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + vacc1x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x1x0123); + vacc1x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x1x4567); + vacc1x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, 
vacc1x1x89AB); + vacc1x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x1xCDEF); + vacc2x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x1x0123); + vacc2x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x1x4567); + vacc2x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x1x89AB); + vacc2x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + vacc1x0x0123 = wasm_i32x4_add(vacc1x0x0123, vacc1x1x0123); + vacc1x0x4567 = wasm_i32x4_add(vacc1x0x4567, vacc1x1x4567); + vacc1x0x89AB = wasm_i32x4_add(vacc1x0x89AB, vacc1x1x89AB); + vacc1x0xCDEF = wasm_i32x4_add(vacc1x0xCDEF, vacc1x1xCDEF); + vacc2x0x0123 = wasm_i32x4_add(vacc2x0x0123, vacc2x1x0123); + vacc2x0x4567 = wasm_i32x4_add(vacc2x0x4567, vacc2x1x4567); + vacc2x0x89AB = wasm_i32x4_add(vacc2x0x89AB, vacc2x1x89AB); + vacc2x0xCDEF = wasm_i32x4_add(vacc2x0xCDEF, vacc2x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const 
float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = 
wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, 
vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + + 
vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2.c new file mode 100644 index 00000000000..71dda4d2627 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot-u2.c @@ -0,0 +1,331 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = 
a1; + c2 = c1; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = 
wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = 
wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = 
wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = 
wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + 
wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot.c new file mode 100644 index 00000000000..9cabb00b3a3 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x16c4-minmax-fp32-wasmusdot.c @@ -0,0 +1,276 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t 
vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vacc0x0123 = wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc1x0123= vacc0x0123; + v128_t vacc1x4567= vacc0x4567; + v128_t vacc1x89AB= vacc0x89AB; + v128_t vacc1xCDEF= vacc0xCDEF; + v128_t vacc2x0123= vacc0x0123; + v128_t vacc2x4567= vacc0x4567; + v128_t vacc2x89AB= vacc0x89AB; + v128_t vacc2xCDEF= vacc0xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, 
va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123); + vacc1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567); + vacc1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB); + vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF); + vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123); + vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x4567); + vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB); + vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB); + vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB); + vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, 
vscaleCDEF); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); + vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vscale89AB); + vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vscaleCDEF); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); + vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vscale89AB); + vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vmagic_bias); + vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vmagic_bias); + vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vmagic_bias); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); + vacc2x4567 = wasm_f32x4_add(vacc2x4567, vmagic_bias); + vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vmagic_bias); + vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vmagic_bias); + + vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); + vacc1x4567 = wasm_i32x4_max(vacc1x4567, vmagic_min); + vacc1x89AB = wasm_i32x4_max(vacc1x89AB, vmagic_min); + vacc1xCDEF = wasm_i32x4_max(vacc1xCDEF, vmagic_min); + vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); + vacc2x4567 = wasm_i32x4_max(vacc2x4567, vmagic_min); + vacc2x89AB = wasm_i32x4_max(vacc2x89AB, vmagic_min); + vacc2xCDEF = wasm_i32x4_max(vacc2xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, 
vmagic_bias_less_output_zero_point); + vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); + vacc1x4567 = wasm_i32x4_sub(vacc1x4567, vmagic_bias_less_output_zero_point); + vacc1x89AB = wasm_i32x4_sub(vacc1x89AB, vmagic_bias_less_output_zero_point); + vacc1xCDEF = wasm_i32x4_sub(vacc1xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); + vacc2x4567 = wasm_i32x4_sub(vacc2x4567, vmagic_bias_less_output_zero_point); + vacc2x89AB = wasm_i32x4_sub(vacc2x89AB, vmagic_bias_less_output_zero_point); + vacc2xCDEF = wasm_i32x4_sub(vacc2xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0123, vacc1x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x89AB, vacc1xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x89AB, vacc2xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = 
wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git 
a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c new file mode 100644 index 00000000000..eb1f57d74bf --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c @@ -0,0 +1,420 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + a3 = a2; + c3 = c2; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - 
(int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x1x0123= vacc0x1x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x1x4567= vacc0x1x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x1x89AB= vacc0x1x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc1x1xCDEF= vacc0x1xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x1x0123= vacc0x1x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x1x4567= vacc0x1x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x1x89AB= vacc0x1x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x1xCDEF= vacc0x1xCDEF; + v128_t vacc3x0x0123= vacc0x0x0123; + v128_t vacc3x1x0123= vacc0x1x0123; + v128_t vacc3x0x4567= vacc0x0x4567; + v128_t vacc3x1x4567= vacc0x1x4567; + v128_t vacc3x0x89AB= vacc0x0x89AB; + v128_t vacc3x1x89AB= vacc0x1x89AB; + v128_t vacc3x0xCDEF= vacc0x0xCDEF; + v128_t vacc3x1xCDEF= vacc0x1xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * 
sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + v128_t va3x0x0123 = wasm_v128_load32_splat(a3); + v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4); + a3 += 8; + + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + vacc1x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x1x0123); + vacc1x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x1x4567); + vacc1x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x1x89AB); + vacc1x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x1xCDEF); + vacc2x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x1x0123); + vacc2x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x1x4567); + vacc2x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x1x89AB); + vacc2x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x1xCDEF); + vacc3x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x1x0123); + vacc3x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x1x4567); + vacc3x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x1x89AB); + vacc3x1xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + vacc1x0x0123 = wasm_i32x4_add(vacc1x0x0123, vacc1x1x0123); + vacc1x0x4567 = wasm_i32x4_add(vacc1x0x4567, vacc1x1x4567); + vacc1x0x89AB = wasm_i32x4_add(vacc1x0x89AB, vacc1x1x89AB); + vacc1x0xCDEF = wasm_i32x4_add(vacc1x0xCDEF, vacc1x1xCDEF); + vacc2x0x0123 = wasm_i32x4_add(vacc2x0x0123, vacc2x1x0123); + vacc2x0x4567 = wasm_i32x4_add(vacc2x0x4567, vacc2x1x4567); + vacc2x0x89AB = wasm_i32x4_add(vacc2x0x89AB, vacc2x1x89AB); + vacc2x0xCDEF = wasm_i32x4_add(vacc2x0xCDEF, vacc2x1xCDEF); + vacc3x0x0123 = wasm_i32x4_add(vacc3x0x0123, vacc3x1x0123); + vacc3x0x4567 = wasm_i32x4_add(vacc3x0x4567, vacc3x1x4567); + vacc3x0x89AB = wasm_i32x4_add(vacc3x0x89AB, vacc3x1x89AB); + vacc3x0xCDEF = wasm_i32x4_add(vacc3x0xCDEF, vacc3x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + 
vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123); + vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567); + vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB); + 
vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vscale0123); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vscale4567); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vscale89AB); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = 
wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vmagic_bias); + vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vmagic_bias); + vacc3x0x89AB = wasm_f32x4_add(vacc3x0x89AB, vmagic_bias); + vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + vacc3x0x0123 = wasm_i32x4_max(vacc3x0x0123, vmagic_min); + vacc3x0x4567 = wasm_i32x4_max(vacc3x0x4567, vmagic_min); + vacc3x0x89AB = wasm_i32x4_max(vacc3x0x89AB, vmagic_min); + vacc3x0xCDEF = wasm_i32x4_max(vacc3x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = 
wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0x0123 = wasm_i32x4_sub(vacc3x0x0123, vmagic_bias_less_output_zero_point); + vacc3x0x4567 = wasm_i32x4_sub(vacc3x0x4567, vmagic_bias_less_output_zero_point); + vacc3x0x89AB = wasm_i32x4_sub(vacc3x0x89AB, vmagic_bias_less_output_zero_point); + vacc3x0xCDEF = wasm_i32x4_sub(vacc3x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0x0123, vacc3x0x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x0x89AB, vacc3x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, 
voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + 
vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2.c new file mode 100644 index 00000000000..0a11daa991d --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmsdot-u2.c @@ -0,0 +1,388 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
#include <assert.h>

#include <wasm_simd128.h>

#include "xnnpack/gemm.h"
#include "xnnpack/math.h"

// QS8 GEMM microkernel with per-channel (qc8w) weight quantization and fp32
// requantization: MR=4 rows x NR=16 columns, KR=4 (4 int8 elements per dot
// product lane), unrolled by 2 along K ("u2").  Uses the WAsm Relaxed SIMD
// signed dot product (wasm_i32x4_relaxed_dot_i8x16_i7x16_add); activations
// are assumed to fit i7 as the second operand requires (standard for the
// sdot variants — TODO confirm against the packing function used by callers).
//
// Arguments follow the common XNNPACK GEMM contract:
//   mr/nc/kc   - tile rows, output channels to process, reduction length
//   a/a_stride - activation rows (int8), row stride in bytes
//   w          - packed weights: per-tile [16 x i32 bias][K/4 x 16x4 i8][16 x f32 scale]
//   c/cm_stride/cn_stride - output rows, row stride, tile advance
//   params     - fp32 minmax requantization parameters
void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2(
    size_t mr,
    size_t nc,
    size_t kc,
    const int8_t* restrict a,
    size_t a_stride,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 4);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  // K is padded to a multiple of KR=4 in the packed weights.
  kc = round_up_po2(kc, 4 * sizeof(int8_t));
  const int8_t* a0 = a;
  int8_t* c0 = c;
  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    a1 = a0;  // clamp unused rows onto row 0 so loads/stores stay in bounds
    c1 = c0;
  }
  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    a2 = a1;
    c2 = c1;
  }
  const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 4) {
    a3 = a2;
    c3 = c2;
  }


  // fp32 "magic bias" requantization constants: adding 2^23*1.5 forces the
  // rounded integer into the low mantissa bits, so float->int conversion
  // becomes an integer subtract; the min clamp is applied in the integer
  // (biased) domain via wasm_i32x4_max on the raw bit patterns.
  const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f);
  const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point;
  const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point));
  const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point);
  const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max);
  //XNN_FORCE_REALIZATION(vmagic_bias);
  //XNN_FORCE_REALIZATION(vmagic_min);
  //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point);
  //XNN_FORCE_REALIZATION(voutput_max);


  do {
    // Initialize row-0 accumulators from the packed per-channel biases,
    // then replicate to rows 1..3.
    v128_t vacc0x0x0123 = wasm_v128_load(w);
    v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4);
    v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8);
    v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12);

    v128_t vacc1x0x0123= vacc0x0x0123;
    v128_t vacc1x0x4567= vacc0x0x4567;
    v128_t vacc1x0x89AB= vacc0x0x89AB;
    v128_t vacc1x0xCDEF= vacc0x0xCDEF;
    v128_t vacc2x0x0123= vacc0x0x0123;
    v128_t vacc2x0x4567= vacc0x0x4567;
    v128_t vacc2x0x89AB= vacc0x0x89AB;
    v128_t vacc2x0xCDEF= vacc0x0xCDEF;
    v128_t vacc3x0x0123= vacc0x0x0123;
    v128_t vacc3x0x4567= vacc0x0x4567;
    v128_t vacc3x0x89AB= vacc0x0x89AB;
    v128_t vacc3x0xCDEF= vacc0x0xCDEF;
    w = (const int32_t*) w + 16;

    size_t k = kc;
    // Main loop, unrolled x2: consume 8 bytes of K per row per iteration.
    while (k >= 8 * sizeof(int8_t)) {
      v128_t va0x0x0123 = wasm_v128_load32_splat(a0);
      v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4);
      a0 += 8;
      v128_t va1x0x0123 = wasm_v128_load32_splat(a1);
      v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4);
      a1 += 8;
      v128_t va2x0x0123 = wasm_v128_load32_splat(a2);
      v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4);
      a2 += 8;
      v128_t va3x0x0123 = wasm_v128_load32_splat(a3);
      v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4);
      a3 += 8;


      const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w);
      const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16);
      const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32);
      const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48);
      const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64);
      const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80);
      const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96);
      const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112);

      // First K-group (bytes 0..3 of each row).
      vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123);
      vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567);
      vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB);
      vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF);
      vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123);
      vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567);
      vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB);
      vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF);
      vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123);
      vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567);
      vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB);
      vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF);
      vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123);
      vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567);
      vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB);
      vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF);
      // Second K-group (bytes 4..7) into the same accumulators.
      vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123);
      vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567);
      vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB);
      vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF);
      vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123);
      vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567);
      vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB);
      vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF);
      vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x0x0123);
      vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567);
      vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB);
      vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF);
      vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x0x0123);
      vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x0x4567);
      vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x0x89AB);
      vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x0xCDEF);

      w = (const int8_t*) w + 128;
      k -= 8 * sizeof(int8_t);
    }

    // Remainder loop: one K-group (4 bytes) at a time.
    while (k != 0) {
      v128_t va0x0123 = wasm_v128_load32_splat(a0);
      a0 += 4;
      v128_t va1x0123 = wasm_v128_load32_splat(a1);
      a1 += 4;
      v128_t va2x0123 = wasm_v128_load32_splat(a2);
      a2 += 4;
      v128_t va3x0123 = wasm_v128_load32_splat(a3);
      a3 += 4;


      const v128_t vb0123 = wasm_v128_load((const int8_t*) w);
      const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16);
      const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32);
      const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48);

      vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123);
      vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567);
      vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB);
      vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF);
      vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123);
      vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567);
      vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB);
      vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF);
      vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123);
      vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567);
      vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB);
      vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF);
      vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123);
      vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567);
      vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB);
      vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF);

      w = (const int8_t*) w + 64;
      k -= 4 * sizeof(int8_t);
    }

    // Requantize: int32 accumulators -> float.
    vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123);
    vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567);
    vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB);
    vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF);
    vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123);
    vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567);
    vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB);
    vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF);
    vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123);
    vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567);
    vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB);
    vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF);
    vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123);
    vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567);
    vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB);
    vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF);

    // Apply the per-output-channel scales packed after the weights.
    const v128_t vscale0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscaleCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123);
    vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567);
    vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB);
    vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF);
    vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123);
    vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567);
    vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB);
    vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF);
    vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123);
    vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567);
    vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB);
    vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF);
    vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vscale0123);
    vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vscale4567);
    vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vscale89AB);
    vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vscaleCDEF);

    // Magic-bias round-to-nearest; result's low mantissa bits hold the int.
    vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias);
    vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias);
    vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias);
    vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias);
    vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias);
    vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias);
    vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias);
    vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias);
    vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias);
    vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias);
    vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias);
    vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias);
    vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vmagic_bias);
    vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vmagic_bias);
    vacc3x0x89AB = wasm_f32x4_add(vacc3x0x89AB, vmagic_bias);
    vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vmagic_bias);

    // Clamp to output_min in the integer (biased bit-pattern) domain.
    vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min);
    vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min);
    vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min);
    vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min);
    vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min);
    vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min);
    vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min);
    vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min);
    vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min);
    vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min);
    vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min);
    vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min);
    vacc3x0x0123 = wasm_i32x4_max(vacc3x0x0123, vmagic_min);
    vacc3x0x4567 = wasm_i32x4_max(vacc3x0x4567, vmagic_min);
    vacc3x0x89AB = wasm_i32x4_max(vacc3x0x89AB, vmagic_min);
    vacc3x0xCDEF = wasm_i32x4_max(vacc3x0xCDEF, vmagic_min);

    // Remove the magic bias and re-center on the output zero point.
    vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point);
    vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point);
    vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point);
    vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point);
    vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point);
    vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point);
    vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point);
    vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point);
    vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point);
    vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point);
    vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point);
    vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point);
    vacc3x0x0123 = wasm_i32x4_sub(vacc3x0x0123, vmagic_bias_less_output_zero_point);
    vacc3x0x4567 = wasm_i32x4_sub(vacc3x0x4567, vmagic_bias_less_output_zero_point);
    vacc3x0x89AB = wasm_i32x4_sub(vacc3x0x89AB, vmagic_bias_less_output_zero_point);
    vacc3x0xCDEF = wasm_i32x4_sub(vacc3x0xCDEF, vmagic_bias_less_output_zero_point);

    // Saturating narrow i32 -> i16 -> i8; each result vector keeps its 8
    // bytes duplicated in both halves, so lane 0 of a 64-bit store suffices.
    v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567);
    v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF);
    v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567);
    v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF);
    v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567);
    v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF);
    v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0x0123, vacc3x0x4567);
    v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x0x89AB, vacc3x0xCDEF);

    vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567);
    vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF);
    vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567);
    vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF);
    vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567);
    vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF);
    vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567);
    vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF);

    // Clamp to output_max (min clamp was applied before narrowing).
    vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max);
    vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max);
    vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max);
    vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max);
    vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max);
    vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max);
    vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max);
    vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max);

    if XNN_LIKELY(nc >= 16) {
      // Full 16-column store per row (two 8-byte lanes).
      wasm_v128_store64_lane(c0, vacc0x01234567, 0);
      wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0);
      wasm_v128_store64_lane(c1, vacc1x01234567, 0);
      wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0);
      wasm_v128_store64_lane(c2, vacc2x01234567, 0);
      wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0);
      wasm_v128_store64_lane(c3, vacc3x01234567, 0);
      wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0);

      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);

      // Rewind activations for the next tile of output channels.
      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
      a1 = (const int8_t*) ((uintptr_t) a1 - kc);
      a2 = (const int8_t*) ((uintptr_t) a2 - kc);
      a3 = (const int8_t*) ((uintptr_t) a3 - kc);

      nc -= 16;
    } else {
      // Partial-tile store: emit 8/4/2/1 bytes, shifting consumed lanes out.
      if (nc & 8) {
        wasm_v128_store64_lane(c0, vacc0x01234567, 0);
        c0 += 8;
        wasm_v128_store64_lane(c1, vacc1x01234567, 0);
        c1 += 8;
        wasm_v128_store64_lane(c2, vacc2x01234567, 0);
        c2 += 8;
        wasm_v128_store64_lane(c3, vacc3x01234567, 0);
        c3 += 8;

        vacc0x01234567 = vacc0x89ABCDEF;
        vacc1x01234567 = vacc1x89ABCDEF;
        vacc2x01234567 = vacc2x89ABCDEF;
        vacc3x01234567 = vacc3x89ABCDEF;
      }
      if (nc & 4) {
        wasm_v128_store32_lane(c0, vacc0x01234567, 0);
        c0 += 4;
        wasm_v128_store32_lane(c1, vacc1x01234567, 0);
        c1 += 4;
        wasm_v128_store32_lane(c2, vacc2x01234567, 0);
        c2 += 4;
        wasm_v128_store32_lane(c3, vacc3x01234567, 0);
        c3 += 4;

        vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32);
        vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32);
        vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32);
        vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32);
      }
      if (nc & 2) {
        wasm_v128_store16_lane(c0, vacc0x01234567, 0);
        c0 += 2;
        wasm_v128_store16_lane(c1, vacc1x01234567, 0);
        c1 += 2;
        wasm_v128_store16_lane(c2, vacc2x01234567, 0);
        c2 += 2;
        wasm_v128_store16_lane(c3, vacc3x01234567, 0);
        c3 += 2;

        vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16);
        vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16);
        vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16);
        vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16);
      }
      if (nc & 1) {
        wasm_v128_store8_lane(c0, vacc0x01234567, 0);
        wasm_v128_store8_lane(c1, vacc1x01234567, 0);
        wasm_v128_store8_lane(c2, vacc2x01234567, 0);
        wasm_v128_store8_lane(c3, vacc3x01234567, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + a3 = a2; + c3 = c2; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + + do { + v128_t vacc0x0123 = 
wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12); + + v128_t vacc1x0123= vacc0x0123; + v128_t vacc1x4567= vacc0x4567; + v128_t vacc1x89AB= vacc0x89AB; + v128_t vacc1xCDEF= vacc0xCDEF; + v128_t vacc2x0123= vacc0x0123; + v128_t vacc2x4567= vacc0x4567; + v128_t vacc2x89AB= vacc0x89AB; + v128_t vacc2xCDEF= vacc0xCDEF; + v128_t vacc3x0123= vacc0x0123; + v128_t vacc3x4567= vacc0x4567; + v128_t vacc3x89AB= vacc0x89AB; + v128_t vacc3xCDEF= vacc0xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123); + vacc1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567); + vacc1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB); + vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF); + vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123); + vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, 
va2x0123, vacc2x4567); + vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB); + vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF); + vacc3x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0123); + vacc3x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x4567); + vacc3x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x89AB); + vacc3xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB); + vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB); + vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF); + vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); + vacc3x4567 = wasm_f32x4_convert_i32x4(vacc3x4567); + vacc3x89AB = wasm_f32x4_convert_i32x4(vacc3x89AB); + vacc3xCDEF = wasm_f32x4_convert_i32x4(vacc3xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF); + vacc1x0123 = 
wasm_f32x4_mul(vacc1x0123, vscale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); + vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vscale89AB); + vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vscaleCDEF); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); + vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vscale89AB); + vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vscaleCDEF); + vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); + vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); + vacc3x89AB = wasm_f32x4_mul(vacc3x89AB, vscale89AB); + vacc3xCDEF = wasm_f32x4_mul(vacc3xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vmagic_bias); + vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vmagic_bias); + vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vmagic_bias); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); + vacc2x4567 = wasm_f32x4_add(vacc2x4567, vmagic_bias); + vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vmagic_bias); + vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vmagic_bias); + vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias); + vacc3x4567 = wasm_f32x4_add(vacc3x4567, vmagic_bias); + vacc3x89AB = wasm_f32x4_add(vacc3x89AB, vmagic_bias); + vacc3xCDEF = wasm_f32x4_add(vacc3xCDEF, vmagic_bias); + + vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); + vacc1x4567 = wasm_i32x4_max(vacc1x4567, vmagic_min); + vacc1x89AB = wasm_i32x4_max(vacc1x89AB, vmagic_min); + vacc1xCDEF = wasm_i32x4_max(vacc1xCDEF, vmagic_min); + vacc2x0123 = 
wasm_i32x4_max(vacc2x0123, vmagic_min); + vacc2x4567 = wasm_i32x4_max(vacc2x4567, vmagic_min); + vacc2x89AB = wasm_i32x4_max(vacc2x89AB, vmagic_min); + vacc2xCDEF = wasm_i32x4_max(vacc2xCDEF, vmagic_min); + vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min); + vacc3x4567 = wasm_i32x4_max(vacc3x4567, vmagic_min); + vacc3x89AB = wasm_i32x4_max(vacc3x89AB, vmagic_min); + vacc3xCDEF = wasm_i32x4_max(vacc3xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point); + vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); + vacc1x4567 = wasm_i32x4_sub(vacc1x4567, vmagic_bias_less_output_zero_point); + vacc1x89AB = wasm_i32x4_sub(vacc1x89AB, vmagic_bias_less_output_zero_point); + vacc1xCDEF = wasm_i32x4_sub(vacc1xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); + vacc2x4567 = wasm_i32x4_sub(vacc2x4567, vmagic_bias_less_output_zero_point); + vacc2x89AB = wasm_i32x4_sub(vacc2x89AB, vmagic_bias_less_output_zero_point); + vacc2xCDEF = wasm_i32x4_sub(vacc2xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point); + vacc3x4567 = wasm_i32x4_sub(vacc3x4567, vmagic_bias_less_output_zero_point); + vacc3x89AB = wasm_i32x4_sub(vacc3x89AB, vmagic_bias_less_output_zero_point); + vacc3xCDEF = wasm_i32x4_sub(vacc3xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0123, vacc1x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x89AB, vacc1xCDEF); + 
v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x89AB, vacc2xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x89AB, vacc3xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + 
cn_stride); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c 
b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c new file mode 100644 index 00000000000..69eff7ba7e0 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c @@ -0,0 +1,433 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + a3 = a2; + c3 = c2; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = 
wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x1x0123= vacc0x1x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x1x4567= vacc0x1x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x1x89AB= vacc0x1x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc1x1xCDEF= vacc0x1xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x1x0123= vacc0x1x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x1x4567= vacc0x1x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x1x89AB= vacc0x1x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x1xCDEF= vacc0x1xCDEF; + v128_t vacc3x0x0123= vacc0x0x0123; + v128_t vacc3x1x0123= vacc0x1x0123; + v128_t vacc3x0x4567= vacc0x0x4567; + v128_t vacc3x1x4567= vacc0x1x4567; + v128_t vacc3x0x89AB= vacc0x0x89AB; + v128_t vacc3x1x89AB= vacc0x1x89AB; + v128_t vacc3x0xCDEF= vacc0x0xCDEF; + v128_t vacc3x1xCDEF= vacc0x1xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + 
while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + v128_t va3x0x0123 = wasm_v128_load32_splat(a3); + v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4); + a3 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + va3x0x0123 = wasm_v128_xor(va3x0x0123, vsign_mask); + va3x1x0123 = wasm_v128_xor(va3x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, 
vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + vacc1x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x1x0123); + vacc1x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x1x4567); + vacc1x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x1x89AB); + vacc1x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x1xCDEF); + vacc2x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x1x0123); + vacc2x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x1x4567); + vacc2x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x1x89AB); + vacc2x1xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x1xCDEF); + vacc3x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x1x0123); + vacc3x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x1x4567); + vacc3x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x1x89AB); + vacc3x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + vacc1x0x0123 = wasm_i32x4_add(vacc1x0x0123, vacc1x1x0123); + vacc1x0x4567 = wasm_i32x4_add(vacc1x0x4567, vacc1x1x4567); + vacc1x0x89AB = wasm_i32x4_add(vacc1x0x89AB, vacc1x1x89AB); + vacc1x0xCDEF = wasm_i32x4_add(vacc1x0xCDEF, vacc1x1xCDEF); + vacc2x0x0123 = wasm_i32x4_add(vacc2x0x0123, vacc2x1x0123); + vacc2x0x4567 = wasm_i32x4_add(vacc2x0x4567, vacc2x1x4567); + vacc2x0x89AB = wasm_i32x4_add(vacc2x0x89AB, vacc2x1x89AB); + vacc2x0xCDEF = wasm_i32x4_add(vacc2x0xCDEF, vacc2x1xCDEF); + vacc3x0x0123 = wasm_i32x4_add(vacc3x0x0123, vacc3x1x0123); + vacc3x0x4567 = wasm_i32x4_add(vacc3x0x4567, vacc3x1x4567); + vacc3x0x89AB = wasm_i32x4_add(vacc3x0x89AB, vacc3x1x89AB); + vacc3x0xCDEF = wasm_i32x4_add(vacc3x0xCDEF, vacc3x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + va3x0123 = wasm_v128_xor(va3x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const 
int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = 
wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123); + vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567); + vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB); + vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vscale0123); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vscale4567); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vscale89AB); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = 
wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vmagic_bias); + vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vmagic_bias); + vacc3x0x89AB = wasm_f32x4_add(vacc3x0x89AB, vmagic_bias); + vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + vacc3x0x0123 = wasm_i32x4_max(vacc3x0x0123, vmagic_min); + vacc3x0x4567 = wasm_i32x4_max(vacc3x0x4567, vmagic_min); + vacc3x0x89AB = wasm_i32x4_max(vacc3x0x89AB, vmagic_min); + vacc3x0xCDEF = wasm_i32x4_max(vacc3x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, 
vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0x0123 = wasm_i32x4_sub(vacc3x0x0123, vmagic_bias_less_output_zero_point); + vacc3x0x4567 = wasm_i32x4_sub(vacc3x0x4567, vmagic_bias_less_output_zero_point); + vacc3x0x89AB = wasm_i32x4_sub(vacc3x0x89AB, vmagic_bias_less_output_zero_point); + vacc3x0xCDEF = wasm_i32x4_sub(vacc3x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0x0123, vacc3x0x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x0x89AB, vacc3x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = 
wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + + vacc0x01234567 = 
vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2.c new file mode 100644 index 00000000000..101fc2c48e0 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot-u2.c @@ -0,0 +1,401 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + a3 = a2; + c3 = c2; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = 
wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc3x0x0123= vacc0x0x0123; + v128_t vacc3x0x4567= vacc0x0x4567; + v128_t vacc3x0x89AB= vacc0x0x89AB; + v128_t vacc3x0xCDEF= vacc0x0xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + v128_t va3x0x0123 = wasm_v128_load32_splat(a3); + v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4); + a3 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + va3x0x0123 = wasm_v128_xor(va3x0x0123, vsign_mask); + va3x1x0123 = wasm_v128_xor(va3x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w 
+ 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + va3x0123 = wasm_v128_xor(va3x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + 
const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = 
wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123); + vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567); + vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB); + vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vscale0123); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vscale4567); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vscale89AB); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 
= wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vmagic_bias); + vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vmagic_bias); + vacc3x0x89AB = wasm_f32x4_add(vacc3x0x89AB, vmagic_bias); + vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + vacc3x0x0123 = wasm_i32x4_max(vacc3x0x0123, vmagic_min); + vacc3x0x4567 = wasm_i32x4_max(vacc3x0x4567, vmagic_min); + vacc3x0x89AB = wasm_i32x4_max(vacc3x0x89AB, vmagic_min); + vacc3x0xCDEF = wasm_i32x4_max(vacc3x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 
= wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0x0123 = wasm_i32x4_sub(vacc3x0x0123, vmagic_bias_less_output_zero_point); + vacc3x0x4567 = wasm_i32x4_sub(vacc3x0x4567, vmagic_bias_less_output_zero_point); + vacc3x0x89AB = wasm_i32x4_sub(vacc3x0x89AB, vmagic_bias_less_output_zero_point); + vacc3x0xCDEF = wasm_i32x4_sub(vacc3x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0x0123, vacc3x0x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x0x89AB, vacc3x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + 
vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + 
wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot.c b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot.c new file mode 100644 index 00000000000..e1caa8c91c4 --- /dev/null +++ b/src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c4-minmax-fp32-wasmusdot.c @@ -0,0 +1,333 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-gemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + +void xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + const int8_t* restrict a, + size_t a_stride, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + const int8_t* a0 = a; + int8_t* c0 = c; + const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride); + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + a1 = a0; + c1 = c0; + } + const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride); + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + a2 = a1; + c2 = c1; + } + const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride); + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + a3 = a2; + c3 = c2; + } + + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = 
wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + + do { + v128_t vacc0x0123 = wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc1x0123= vacc0x0123; + v128_t vacc1x4567= vacc0x4567; + v128_t vacc1x89AB= vacc0x89AB; + v128_t vacc1xCDEF= vacc0xCDEF; + v128_t vacc2x0123= vacc0x0123; + v128_t vacc2x4567= vacc0x4567; + v128_t vacc2x89AB= vacc0x89AB; + v128_t vacc2xCDEF= vacc0xCDEF; + v128_t vacc3x0123= vacc0x0123; + v128_t vacc3x4567= vacc0x4567; + v128_t vacc3x89AB= vacc0x89AB; + v128_t vacc3xCDEF= vacc0xCDEF; + w = (const int32_t*) w + 16; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + va3x0123 = wasm_v128_xor(va3x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123); + vacc1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567); + vacc1x89AB = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB); + vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF); + vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123); + vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x4567); + vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB); + vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF); + vacc3x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0123); + vacc3x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x4567); + vacc3x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x89AB); + vacc3xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB); + vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB); + vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF); + vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); + vacc3x4567 = wasm_f32x4_convert_i32x4(vacc3x4567); + vacc3x89AB = wasm_f32x4_convert_i32x4(vacc3x89AB); + vacc3xCDEF = wasm_f32x4_convert_i32x4(vacc3xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t 
vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); + vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vscale89AB); + vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vscaleCDEF); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); + vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vscale89AB); + vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vscaleCDEF); + vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); + vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); + vacc3x89AB = wasm_f32x4_mul(vacc3x89AB, vscale89AB); + vacc3xCDEF = wasm_f32x4_mul(vacc3xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vmagic_bias); + vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vmagic_bias); + vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vmagic_bias); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); + vacc2x4567 = wasm_f32x4_add(vacc2x4567, vmagic_bias); + vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vmagic_bias); + vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vmagic_bias); + vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias); + vacc3x4567 = wasm_f32x4_add(vacc3x4567, vmagic_bias); + vacc3x89AB = wasm_f32x4_add(vacc3x89AB, vmagic_bias); + vacc3xCDEF = wasm_f32x4_add(vacc3xCDEF, vmagic_bias); + + vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, 
vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); + vacc1x4567 = wasm_i32x4_max(vacc1x4567, vmagic_min); + vacc1x89AB = wasm_i32x4_max(vacc1x89AB, vmagic_min); + vacc1xCDEF = wasm_i32x4_max(vacc1xCDEF, vmagic_min); + vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); + vacc2x4567 = wasm_i32x4_max(vacc2x4567, vmagic_min); + vacc2x89AB = wasm_i32x4_max(vacc2x89AB, vmagic_min); + vacc2xCDEF = wasm_i32x4_max(vacc2xCDEF, vmagic_min); + vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min); + vacc3x4567 = wasm_i32x4_max(vacc3x4567, vmagic_min); + vacc3x89AB = wasm_i32x4_max(vacc3x89AB, vmagic_min); + vacc3xCDEF = wasm_i32x4_max(vacc3xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point); + vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); + vacc1x4567 = wasm_i32x4_sub(vacc1x4567, vmagic_bias_less_output_zero_point); + vacc1x89AB = wasm_i32x4_sub(vacc1x89AB, vmagic_bias_less_output_zero_point); + vacc1xCDEF = wasm_i32x4_sub(vacc1xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); + vacc2x4567 = wasm_i32x4_sub(vacc2x4567, vmagic_bias_less_output_zero_point); + vacc2x89AB = wasm_i32x4_sub(vacc2x89AB, vmagic_bias_less_output_zero_point); + vacc2xCDEF = wasm_i32x4_sub(vacc2xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point); + vacc3x4567 = wasm_i32x4_sub(vacc3x4567, vmagic_bias_less_output_zero_point); + vacc3x89AB = wasm_i32x4_sub(vacc3x89AB, vmagic_bias_less_output_zero_point); + vacc3xCDEF = wasm_i32x4_sub(vacc3xCDEF, vmagic_bias_less_output_zero_point); + + 
v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0123, vacc1x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x89AB, vacc1xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x89AB, vacc2xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x89AB, vacc3xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if XNN_LIKELY(nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c2, 
vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + + a0 = (const int8_t*) ((uintptr_t) a0 - kc); + a1 = (const int8_t*) ((uintptr_t) a1 - kc); + a2 = (const int8_t*) ((uintptr_t) a2 - kc); + a3 = (const int8_t*) ((uintptr_t) a3 - kc); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, 
vacc0x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c new file mode 100644 index 00000000000..862e4a3bce4 --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmsdot-u2-acc2.c @@ -0,0 +1,209 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (1 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = 
wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + + + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + a += 1; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + 
vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 1 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = 
wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { 
// IGEMM microkernel (indirect GEMM; qs8 activations, per-channel qc8 weights,
// fp32 requantization): MR=1 row x NR=16 columns, KR=4 (c4), K loop unrolled
// by 2 (u2) with a single accumulator set. Uses the WAsm Relaxed SIMD dot
// product wasm_i32x4_relaxed_dot_i8x16_i7x16_add.
//
// Packed weight layout per NR tile: 16 int32 biases, then int8 weight blocks
// (64 bytes per 4-element K block), then 16 float per-channel scales.
// `a` is an indirection buffer of ks/sizeof(void*) input pointers per output
// pixel; entries equal to `zero` are used as-is, all others rebased by
// a_offset. cm_stride is unreferenced since MR == 1.
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  kc = round_up_po2(kc, 4 * sizeof(int8_t));
  int8_t* c0 = c;

  // fp32 "magic bias" requantization constants: adding 12582912.0f
  // (0x4B400000, 1.5*2^23) leaves the rounded integer in the low mantissa
  // bits, so clamping and zero-point adjustment proceed with integer ops.
  const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f);
  const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point;
  const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point));
  const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point);
  const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max);
  //XNN_FORCE_REALIZATION(vmagic_bias);
  //XNN_FORCE_REALIZATION(vmagic_min);
  //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point);
  //XNN_FORCE_REALIZATION(voutput_max);

  do {
    // Accumulators start from the packed int32 biases.
    v128_t vacc0x0x0123 = wasm_v128_load(w);
    v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4);
    v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8);
    v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12);

    w = (const int32_t*) w + 16;

    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;

      size_t k = kc;
      // Unrolled-by-2 main loop: two 4-byte activation splats against 2x64
      // bytes of weights; both K blocks accumulate into the same set.
      while (k >= 8 * sizeof(int8_t)) {
        v128_t va0x0x0123 = wasm_v128_load32_splat(a0);
        v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4);
        a0 += 8;

        const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w);
        const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32);
        const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48);
        const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64);
        const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80);
        const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96);
        const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112);

        vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123);
        vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567);
        vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB);
        vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF);
        vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123);
        vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567);
        vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB);
        vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF);

        w = (const int8_t*) w + 128;
        k -= 8 * sizeof(int8_t);
      }

      // Remainder: one 4-byte K block at a time (kc was rounded up to 4).
      while (k != 0) {
        v128_t va0x0123 = wasm_v128_load32_splat(a0);
        a0 += 4;

        const v128_t vb0123 = wasm_v128_load((const int8_t*) w);
        const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32);
        const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48);

        vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123);
        vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567);
        vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB);
        vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF);

        w = (const int8_t*) w + 64;
        k -= 4 * sizeof(int8_t);
      }

      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Requantize: int32 -> float, per-channel scale, magic-bias round,
    // integer min-clamp, remove bias and apply output zero point.
    vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123);
    vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567);
    vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB);
    vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF);

    const v128_t vscale0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscaleCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123);
    vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567);
    vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB);
    vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF);

    vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias);
    vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias);
    vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias);
    vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias);

    vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min);
    vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min);
    vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min);
    vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min);

    vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point);
    vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point);
    vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point);
    vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point);

    // Narrow int32 -> int16 -> int8; the saturating narrow gives the lower
    // clamp, wasm_i8x16_min applies the upper clamp.
    v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567);
    v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF);

    vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567);
    vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF);

    vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max);
    vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max);

    if (nc >= 16) {
      // Full 16-column tile: two 8-byte lane stores.
      wasm_v128_store64_lane(c0, vacc0x01234567, 0);
      wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0);

      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);

      // Rewind the indirection buffer for the next NR tile.
      a = (const int8_t**restrict) ((uintptr_t) a - ks);

      nc -= 16;
    } else {
      // Tail: store 8/4/2/1 bytes, shifting consumed lanes out of the vector.
      if (nc & 8) {
        wasm_v128_store64_lane(c0, vacc0x01234567, 0);
        c0 += 8;

        vacc0x01234567 = vacc0x89ABCDEF;
      }
      if (nc & 4) {
        wasm_v128_store32_lane(c0, vacc0x01234567, 0);
        c0 += 4;

        vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32);
      }
      if (nc & 2) {
        wasm_v128_store16_lane(c0, vacc0x01234567, 0);
        c0 += 2;

        vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16);
      }
      if (nc & 1) {
        wasm_v128_store8_lane(c0, vacc0x01234567, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}
// IGEMM microkernel (indirect GEMM; qs8 activations, per-channel qc8 weights,
// fp32 requantization): MR=1 row x NR=16 columns, KR=4 (c4), no K unrolling.
// Uses the WAsm Relaxed SIMD dot product
// wasm_i32x4_relaxed_dot_i8x16_i7x16_add.
//
// Packed weight layout per NR tile: 16 int32 biases, then int8 weight blocks
// (64 bytes per 4-element K block), then 16 float per-channel scales.
// `a` is an indirection buffer of ks/sizeof(void*) input pointers per output
// pixel; entries equal to `zero` are used as-is, all others rebased by
// a_offset. cm_stride is unreferenced since MR == 1.
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 1);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (1 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  kc = round_up_po2(kc, 4 * sizeof(int8_t));
  int8_t* c0 = c;

  // fp32 "magic bias" requantization constants: adding 12582912.0f
  // (0x4B400000, 1.5*2^23) leaves the rounded integer in the low mantissa
  // bits, so clamping and zero-point adjustment proceed with integer ops.
  const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f);
  const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point;
  const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point));
  const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point);
  const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max);
  //XNN_FORCE_REALIZATION(vmagic_bias);
  //XNN_FORCE_REALIZATION(vmagic_min);
  //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point);
  //XNN_FORCE_REALIZATION(voutput_max);

  do {
    // Accumulators start from the packed int32 biases.
    v128_t vacc0x0123 = wasm_v128_load(w);
    v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4);
    v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8);
    v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12);

    w = (const int32_t*) w + 16;

    size_t p = ks;
    do {
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
      }
      a += 1;

      size_t k = kc;

      // One 4-byte K block per iteration (kc was rounded up to 4): splat the
      // 4 activation bytes and dot against 64 bytes of packed weights.
      while (k != 0) {
        v128_t va0x0123 = wasm_v128_load32_splat(a0);
        a0 += 4;

        const v128_t vb0123 = wasm_v128_load((const int8_t*) w);
        const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32);
        const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48);

        vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123);
        vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567);
        vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB);
        vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF);

        w = (const int8_t*) w + 64;
        k -= 4 * sizeof(int8_t);
      }

      p -= 1 * sizeof(void*);
    } while (p != 0);

    // Requantize: int32 -> float, per-channel scale, magic-bias round,
    // integer min-clamp, remove bias and apply output zero point.
    vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123);
    vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567);
    vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB);
    vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF);

    const v128_t vscale0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscaleCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567);
    vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB);
    vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF);

    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
    vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias);
    vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias);
    vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias);

    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
    vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min);
    vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min);
    vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min);

    vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
    vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point);
    vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point);
    vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point);

    // Narrow int32 -> int16 -> int8; the saturating narrow gives the lower
    // clamp, wasm_i8x16_min applies the upper clamp.
    v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567);
    v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF);

    vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567);
    vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF);

    vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max);
    vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max);

    if (nc >= 16) {
      // Full 16-column tile: two 8-byte lane stores.
      wasm_v128_store64_lane(c0, vacc0x01234567, 0);
      wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0);

      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);

      // Rewind the indirection buffer for the next NR tile.
      a = (const int8_t**restrict) ((uintptr_t) a - ks);

      nc -= 16;
    } else {
      // Tail: store 8/4/2/1 bytes, shifting consumed lanes out of the vector.
      if (nc & 8) {
        wasm_v128_store64_lane(c0, vacc0x01234567, 0);
        c0 += 8;

        vacc0x01234567 = vacc0x89ABCDEF;
      }
      if (nc & 4) {
        wasm_v128_store32_lane(c0, vacc0x01234567, 0);
        c0 += 4;

        vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32);
      }
      if (nc & 2) {
        wasm_v128_store16_lane(c0, vacc0x01234567, 0);
        c0 += 2;

        vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16);
      }
      if (nc & 1) {
        wasm_v128_store8_lane(c0, vacc0x01234567, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}
0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c new file mode 100644 index 00000000000..897f8631c99 --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2-acc2.c @@ -0,0 +1,212 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (1 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + 
//XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + a += 1; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 1 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const 
float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) 
{ + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2.c new file mode 100644 index 00000000000..7e0ca900270 --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot-u2.c @@ -0,0 +1,204 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (1 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) 
((uintptr_t) a0 + a_offset); + } + a += 1; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t 
vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 1 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, 
vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot.c new file mode 100644 index 00000000000..323659ea6c5 --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c4-minmax-fp32-wasmusdot.c @@ -0,0 +1,175 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (1 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vacc0x0123 = wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12); + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + 
a_offset); + } + a += 1; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 1 * sizeof(void*); + } while (p != 0); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + + vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, 
vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point); + vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c new file mode 100644 index 00000000000..a71ef46f907 --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2-acc2.c @@ -0,0 +1,361 @@ +// Auto-generated file. Do not edit! 
+// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (3 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + do { + v128_t vacc0x0x0123 = 
wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x1x0123= vacc0x1x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x1x4567= vacc0x1x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x1x89AB= vacc0x1x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc1x1xCDEF= vacc0x1xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x1x0123= vacc0x1x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x1x4567= vacc0x1x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x1x89AB= vacc0x1x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x1xCDEF= vacc0x1xCDEF; + + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } + a += 3; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) 
w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + vacc1x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x1x0123); + 
vacc1x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x1x4567); + vacc1x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x1x89AB); + vacc1x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x1xCDEF); + vacc2x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x1x0123); + vacc2x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x1x4567); + vacc2x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x1x89AB); + vacc2x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + vacc1x0x0123 = wasm_i32x4_add(vacc1x0x0123, vacc1x1x0123); + vacc1x0x4567 = wasm_i32x4_add(vacc1x0x4567, vacc1x1x4567); + vacc1x0x89AB = wasm_i32x4_add(vacc1x0x89AB, vacc1x1x89AB); + vacc1x0xCDEF = wasm_i32x4_add(vacc1x0xCDEF, vacc1x1xCDEF); + vacc2x0x0123 = wasm_i32x4_add(vacc2x0x0123, vacc2x1x0123); + vacc2x0x4567 = wasm_i32x4_add(vacc2x0x4567, vacc2x1x4567); + vacc2x0x89AB = wasm_i32x4_add(vacc2x0x89AB, vacc2x1x89AB); + vacc2x0xCDEF = wasm_i32x4_add(vacc2x0xCDEF, vacc2x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + 
vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 3 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 
4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = 
wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); 
+ vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = 
wasm_u32x4_shr(vacc0x01234567, 16);
      vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16);
      vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16);
    }
    if (nc & 1) {
      wasm_v128_store8_lane(c2, vacc2x01234567, 0);
      wasm_v128_store8_lane(c1, vacc1x01234567, 0);
      wasm_v128_store8_lane(c0, vacc0x01234567, 0);
    }

    nc = 0;
  }
} while (nc != 0);
}
diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2.c
new file mode 100644
index 00000000000..591f1830982
--- /dev/null
+++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot-u2.c
@@ -0,0 +1,337 @@
// Auto-generated file. Do not edit!
// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in
// Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): the two system include targets were lost in extraction —
// presumably <assert.h> and <wasm_simd128.h>; confirm against the generator output.
#include

#include

#include "xnnpack/gemm.h"
#include "xnnpack/math.h"


// QS8 (signed int8) IGEMM microkernel with per-channel (QC8W) int8 weights and
// fp32 requantization: MR=3 rows x NR=16 columns, KR=4 (c4), K loop unrolled by
// 2 (u2), using the WAsm relaxed-SIMD signed dot product
// (wasm_i32x4_relaxed_dot_i8x16_i7x16_add).
//
// Arguments (as used below):
//   mr        - number of valid output rows, 1..3
//   nc        - number of output columns remaining
//   kc        - number of K bytes per row (rounded up to KR=4 below)
//   ks        - size of the indirection buffer per output tile, in bytes;
//               must be a multiple of 3 * sizeof(void*)
//   a         - indirection buffer: array of input-row pointers, MR per step
//   w         - packed weights: per NR-tile, 16 int32 biases, then int8
//               weights in KR=4 groups, then 16 fp32 per-channel scales
//   c         - int8 output, rows cm_stride apart, column tiles cn_stride apart
//   a_offset  - byte offset added to every non-`zero` indirection pointer
//   zero      - pointer marking padding rows; never offset by a_offset
//   params    - fp32 requantization constants (min/max/zero point)
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 3);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (3 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  // Packed weights are consumed in groups of KR=4 int8, so round K up to 4.
  kc = round_up_po2(kc, 4 * sizeof(int8_t));
  int8_t* c0 = c;
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;  // fewer than 2 rows: alias row 1 onto row 0 so stores are harmless
  }
  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }

  // fp32 -> int8 requantization constants. The "magic bias" trick: adding
  // 12582912.0f (0x4B400000, i.e. 1.5 * 2^23) to a small float leaves the
  // rounded integer value in the low mantissa bits, so the output_min clamp
  // and zero-point bias can then be applied with integer ops.
  const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f);
  const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point;
  const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point));
  const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point);
  const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max);
  //XNN_FORCE_REALIZATION(vmagic_bias);
  //XNN_FORCE_REALIZATION(vmagic_min);
  //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point);
  //XNN_FORCE_REALIZATION(voutput_max);

  do {  // per tile of NR=16 output columns
    // Initialize the row-0 accumulators from the 16 packed int32 biases;
    // rows 1 and 2 start from the same biases.
    v128_t vacc0x0x0123 = wasm_v128_load(w);
    v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4);
    v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8);
    v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12);

    v128_t vacc1x0x0123= vacc0x0x0123;
    v128_t vacc1x0x4567= vacc0x0x4567;
    v128_t vacc1x0x89AB= vacc0x0x89AB;
    v128_t vacc1x0xCDEF= vacc0x0xCDEF;
    v128_t vacc2x0x0123= vacc0x0x0123;
    v128_t vacc2x0x4567= vacc0x0x4567;
    v128_t vacc2x0x89AB= vacc0x0x89AB;
    v128_t vacc2x0xCDEF= vacc0x0xCDEF;

    w = (const int32_t*) w + 16;

    size_t p = ks;
    do {  // walk the indirection buffer, MR=3 input-row pointers per step
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);  // `zero` (padding) rows are never offset
      }
      const int8_t* restrict a1 = a[1];
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
      }
      const int8_t* restrict a2 = a[2];
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
      }
      a += 3;

      size_t k = kc;
      // Main loop, unrolled by 2: consume 8 K-bytes (two KR=4 groups) per row
      // per iteration. Each 4-byte input group is broadcast across the vector
      // and dotted against 4 weight vectors (16 output channels).
      while (k >= 8 * sizeof(int8_t)) {
        v128_t va0x0x0123 = wasm_v128_load32_splat(a0);
        v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4);
        a0 += 8;
        v128_t va1x0x0123 = wasm_v128_load32_splat(a1);
        v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4);
        a1 += 8;
        v128_t va2x0x0123 = wasm_v128_load32_splat(a2);
        v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4);
        a2 += 8;


        const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w);
        const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32);
        const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48);
        const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64);
        const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80);
        const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96);
        const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112);

        // 4-element int8 dot products with int32 accumulation (relaxed SIMD).
        vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123);
        vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567);
        vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB);
        vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF);
        vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123);
        vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567);
        vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB);
        vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF);
        vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123);
        vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567);
        vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB);
        vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF);
        // Second unrolled K group accumulates into the same registers
        // (this is the plain _u2 variant, not _u2_acc2).
        vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123);
        vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567);
        vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB);
        vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF);
        vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123);
        vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567);
        vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB);
        vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF);
        vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x0x0123);
        vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567);
        vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB);
        vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF);

        w = (const int8_t*) w + 128;
        k -= 8 * sizeof(int8_t);
      }

      // Remainder loop: one KR=4 group per iteration (kc is a multiple of 4).
      while (k != 0) {
        v128_t va0x0123 = wasm_v128_load32_splat(a0);
        a0 += 4;
        v128_t va1x0123 = wasm_v128_load32_splat(a1);
        a1 += 4;
        v128_t va2x0123 = wasm_v128_load32_splat(a2);
        a2 += 4;


        const v128_t vb0123 = wasm_v128_load((const int8_t*) w);
        const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32);
        const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48);

        vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123);
        vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567);
        vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB);
        vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF);
        vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123);
        vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567);
        vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB);
        vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF);
        vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123);
        vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567);
        vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB);
        vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF);

        w = (const int8_t*) w + 64;
        k -= 4 * sizeof(int8_t);
      }

      p -= 3 * sizeof(void*);
    } while (p != 0);

    // Requantization: int32 accumulators -> fp32.
    vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123);
    vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567);
    vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB);
    vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF);
    vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123);
    vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567);
    vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB);
    vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF);
    vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123);
    vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567);
    vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB);
    vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF);

    // Apply the 16 per-channel fp32 scales that follow the weights in w.
    const v128_t vscale0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscaleCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123);
    vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567);
    vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB);
    vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF);
    vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123);
    vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567);
    vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB);
    vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF);
    vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123);
    vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567);
    vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB);
    vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF);

    // Magic-bias rounding: the integer result now sits in the low mantissa bits.
    vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias);
    vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias);
    vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias);
    vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias);
    vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias);
    vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias);
    vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias);
    vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias);
    vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias);
    vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias);
    vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias);
    vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias);

    // output_min clamp, done in the integer domain on the biased bit pattern.
    vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min);
    vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min);
    vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min);
    vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min);
    vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min);
    vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min);
    vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min);
    vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min);
    vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min);
    vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min);
    vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min);
    vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min);

    // Remove the magic bias and add the output zero point in one subtraction.
    vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point);
    vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point);
    vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point);
    vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point);
    vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point);
    vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point);
    vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point);
    vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point);
    vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point);
    vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point);
    vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point);
    vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point);

    // Saturating narrow int32 -> int16 -> int8; each final vector holds its
    // 8 valid output bytes in the low half (self-narrow duplicates them).
    v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567);
    v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF);
    v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567);
    v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF);
    v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567);
    v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF);

    vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567);
    vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF);
    vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567);
    vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF);
    vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567);
    vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF);

    // output_max clamp (narrowing already saturated at the int8 minimum side
    // relative to the earlier output_min clamp).
    vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max);
    vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max);
    vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max);
    vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max);
    vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max);
    vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max);

    if (nc >= 16) {
      // Full 16-column tile: store the low 8 bytes of each half per row.
      wasm_v128_store64_lane(c2, vacc2x01234567, 0);
      wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0);
      wasm_v128_store64_lane(c1, vacc1x01234567, 0);
      wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0);
      wasm_v128_store64_lane(c0, vacc0x01234567, 0);
      wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0);

      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);

      // Rewind the indirection buffer for the next column tile.
      a = (const int8_t**restrict) ((uintptr_t) a - ks);

      nc -= 16;
    } else {
      // Partial tile: store 8/4/2/1 columns, shifting consumed lanes out.
      if (nc & 8) {
        wasm_v128_store64_lane(c2, vacc2x01234567, 0);
        c2 += 8;
        wasm_v128_store64_lane(c1, vacc1x01234567, 0);
        c1 += 8;
        wasm_v128_store64_lane(c0, vacc0x01234567, 0);
        c0 += 8;

        vacc0x01234567 = vacc0x89ABCDEF;
        vacc1x01234567 = vacc1x89ABCDEF;
        vacc2x01234567 = vacc2x89ABCDEF;
      }
      if (nc & 4) {
        wasm_v128_store32_lane(c2, vacc2x01234567, 0);
        c2 += 4;
        wasm_v128_store32_lane(c1, vacc1x01234567, 0);
        c1 += 4;
        wasm_v128_store32_lane(c0, vacc0x01234567, 0);
        c0 += 4;

        vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32);
        vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32);
        vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32);
      }
      if (nc & 2) {
        wasm_v128_store16_lane(c2, vacc2x01234567, 0);
        c2 += 2;
        wasm_v128_store16_lane(c1, vacc1x01234567, 0);
        c1 += 2;
        wasm_v128_store16_lane(c0, vacc0x01234567, 0);
        c0 += 2;

        vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16);
        vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16);
        vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16);
      }
      if (nc & 1) {
        wasm_v128_store8_lane(c2, vacc2x01234567, 0);
        wasm_v128_store8_lane(c1, vacc1x01234567, 0);
        wasm_v128_store8_lane(c0, vacc0x01234567, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}
diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot.c
new file
mode 100644
index 00000000000..1e5d738fb41
--- /dev/null
+++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmsdot.c
@@ -0,0 +1,288 @@
// Auto-generated file. Do not edit!
// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in
// Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): the two system include targets were lost in extraction —
// presumably <assert.h> and <wasm_simd128.h>; confirm against the generator output.
#include

#include

#include "xnnpack/gemm.h"
#include "xnnpack/math.h"


// QS8/QC8W IGEMM microkernel, MR=3 x NR=16, KR=4 (c4), fp32 requantization,
// WAsm relaxed-SIMD signed dot product. Non-unrolled variant of the _u2
// kernel above: one KR=4 group of K per loop iteration.
//
// Arguments mirror the _u2 variant: `a` is the indirection buffer (MR row
// pointers per step, `zero` entries mark padding and are not offset by
// a_offset); `w` holds 16 int32 biases, then int8 weights, then 16 fp32
// per-channel scales per NR tile; `c` is the int8 output.
void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const int8_t** restrict a,
    const void* restrict w,
    int8_t* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const int8_t* zero,
    const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(mr != 0);
  assert(mr <= 3);
  assert(nc != 0);
  assert(kc != 0);
  assert(ks != 0);
  assert(ks % (3 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(int8_t) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

  // Weights are packed in KR=4 byte groups, so round K up to 4.
  kc = round_up_po2(kc, 4 * sizeof(int8_t));
  int8_t* c0 = c;
  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;  // alias unused rows downward so their stores are harmless
  }
  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }

  // fp32 -> int8 requantization constants ("magic bias" rounding; see the
  // comments in the _u2 variant of this kernel).
  const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f);
  const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point;
  const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point));
  const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point);
  const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max);
  //XNN_FORCE_REALIZATION(vmagic_bias);
  //XNN_FORCE_REALIZATION(vmagic_min);
  //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point);
  //XNN_FORCE_REALIZATION(voutput_max);

  do {  // per tile of NR=16 output columns
    // Accumulators start from the 16 packed int32 biases; all rows share them.
    v128_t vacc0x0123 = wasm_v128_load(w);
    v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4);
    v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8);
    v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12);

    v128_t vacc1x0123= vacc0x0123;
    v128_t vacc1x4567= vacc0x4567;
    v128_t vacc1x89AB= vacc0x89AB;
    v128_t vacc1xCDEF= vacc0xCDEF;
    v128_t vacc2x0123= vacc0x0123;
    v128_t vacc2x4567= vacc0x4567;
    v128_t vacc2x89AB= vacc0x89AB;
    v128_t vacc2xCDEF= vacc0xCDEF;

    w = (const int32_t*) w + 16;

    size_t p = ks;
    do {  // walk the indirection buffer, MR=3 row pointers per step
      const int8_t* restrict a0 = a[0];
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);  // padding rows stay at `zero`
      }
      const int8_t* restrict a1 = a[1];
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
      }
      const int8_t* restrict a2 = a[2];
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
      }
      a += 3;

      size_t k = kc;

      // Consume one KR=4 group of K per iteration: broadcast 4 input bytes
      // per row and dot them against 4 weight vectors (16 output channels).
      while (k != 0) {
        v128_t va0x0123 = wasm_v128_load32_splat(a0);
        a0 += 4;
        v128_t va1x0123 = wasm_v128_load32_splat(a1);
        a1 += 4;
        v128_t va2x0123 = wasm_v128_load32_splat(a2);
        a2 += 4;


        const v128_t vb0123 = wasm_v128_load((const int8_t*) w);
        const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16);
        const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32);
        const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48);

        vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123);
        vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567);
        vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB);
        vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF);
        vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123);
        vacc1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567);
        vacc1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB);
        vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF);
        vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123);
        vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x4567);
        vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB);
        vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF);

        w = (const int8_t*) w + 64;
        k -= 4 * sizeof(int8_t);
      }

      p -= 3 * sizeof(void*);
    } while (p != 0);

    // Requantization: int32 accumulators -> fp32.
    vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123);
    vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567);
    vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB);
    vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF);
    vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123);
    vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567);
    vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB);
    vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF);
    vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123);
    vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567);
    vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB);
    vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF);

    // Apply the 16 per-channel fp32 scales that follow the weights in w.
    const v128_t vscale0123 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale4567 = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscale89AB = wasm_v128_load(w);
    w = (const float*) w + 4;
    const v128_t vscaleCDEF = wasm_v128_load(w);
    w = (const float*) w + 4;
    vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123);
    vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567);
    vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB);
    vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF);
    vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123);
    vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567);
    vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vscale89AB);
    vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vscaleCDEF);
    vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123);
    vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567);
    vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vscale89AB);
    vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vscaleCDEF);

    // Magic-bias rounding into the low mantissa bits.
    vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias);
    vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias);
    vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias);
    vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias);
    vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias);
    vacc1x4567 = wasm_f32x4_add(vacc1x4567, vmagic_bias);
    vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vmagic_bias);
    vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vmagic_bias);
    vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias);
    vacc2x4567 = wasm_f32x4_add(vacc2x4567, vmagic_bias);
    vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vmagic_bias);
    vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vmagic_bias);

    // output_min clamp in the integer domain on the biased bit pattern.
    vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min);
    vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min);
    vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min);
    vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min);
    vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min);
    vacc1x4567 = wasm_i32x4_max(vacc1x4567, vmagic_min);
    vacc1x89AB = wasm_i32x4_max(vacc1x89AB, vmagic_min);
    vacc1xCDEF = wasm_i32x4_max(vacc1xCDEF, vmagic_min);
    vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min);
    vacc2x4567 = wasm_i32x4_max(vacc2x4567, vmagic_min);
    vacc2x89AB = wasm_i32x4_max(vacc2x89AB, vmagic_min);
    vacc2xCDEF = wasm_i32x4_max(vacc2xCDEF, vmagic_min);

    // Remove the magic bias and add the output zero point in one subtraction.
    vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point);
    vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point);
    vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point);
    vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point);
    vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point);
    vacc1x4567 = wasm_i32x4_sub(vacc1x4567, vmagic_bias_less_output_zero_point);
    vacc1x89AB = wasm_i32x4_sub(vacc1x89AB, vmagic_bias_less_output_zero_point);
    vacc1xCDEF = wasm_i32x4_sub(vacc1xCDEF, vmagic_bias_less_output_zero_point);
    vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point);
    vacc2x4567 = wasm_i32x4_sub(vacc2x4567, vmagic_bias_less_output_zero_point);
    vacc2x89AB = wasm_i32x4_sub(vacc2x89AB, vmagic_bias_less_output_zero_point);
    vacc2xCDEF = wasm_i32x4_sub(vacc2xCDEF, vmagic_bias_less_output_zero_point);

    // Saturating narrow int32 -> int16 -> int8; 8 valid bytes per vector half.
    v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567);
    v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF);
    v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0123, vacc1x4567);
    v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x89AB, vacc1xCDEF);
    v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567);
    v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x89AB, vacc2xCDEF);

    vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567);
    vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF);
    vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567);
    vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF);
    vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567);
    vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF);

    // output_max clamp.
    vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max);
    vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max);
    vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max);
    vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max);
    vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max);
    vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max);

    if (nc >= 16) {
      // Full tile: store the low 8 bytes of each half per row.
      wasm_v128_store64_lane(c2, vacc2x01234567, 0);
      wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0);
      wasm_v128_store64_lane(c1, vacc1x01234567, 0);
      wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0);
      wasm_v128_store64_lane(c0, vacc0x01234567, 0);
      wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0);

      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);

      // Rewind the indirection buffer for the next column tile.
      a = (const int8_t**restrict) ((uintptr_t) a - ks);

      nc -= 16;
    } else {
      // Partial tile: store 8/4/2/1 columns, shifting consumed lanes out.
      if (nc & 8) {
        wasm_v128_store64_lane(c2, vacc2x01234567, 0);
        c2 += 8;
        wasm_v128_store64_lane(c1, vacc1x01234567, 0);
        c1 += 8;
        wasm_v128_store64_lane(c0, vacc0x01234567, 0);
        c0 += 8;

        vacc0x01234567 = vacc0x89ABCDEF;
        vacc1x01234567 = vacc1x89ABCDEF;
        vacc2x01234567 = vacc2x89ABCDEF;
      }
      if (nc & 4) {
        wasm_v128_store32_lane(c2, vacc2x01234567, 0);
        c2 += 4;
        wasm_v128_store32_lane(c1, vacc1x01234567, 0);
        c1 += 4;
        wasm_v128_store32_lane(c0, vacc0x01234567, 0);
        c0 += 4;

        vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32);
        vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32);
        vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32);
      }
      if (nc & 2) {
        wasm_v128_store16_lane(c2, vacc2x01234567, 0);
        c2 += 2;
        wasm_v128_store16_lane(c1, vacc1x01234567, 0);
        c1 += 2;
        wasm_v128_store16_lane(c0, vacc0x01234567, 0);
        c0 += 2;

        vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16);
        vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16);
        vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16);
      }
      if (nc & 1) {
        wasm_v128_store8_lane(c2, vacc2x01234567, 0);
        wasm_v128_store8_lane(c1, vacc1x01234567, 0);
        wasm_v128_store8_lane(c0, vacc0x01234567, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}
diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c
new file mode 100644
index 00000000000..425e098cf96
--- /dev/null
+++
b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2-acc2.c @@ -0,0 +1,370 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (3 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + 
//XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x1x0123= vacc0x1x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x1x4567= vacc0x1x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x1x89AB= vacc0x1x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc1x1xCDEF= vacc0x1xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x1x0123= vacc0x1x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x1x4567= vacc0x1x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x1x89AB= vacc0x1x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x1xCDEF= vacc0x1xCDEF; + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } + a += 3; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + 
v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + 
vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + vacc1x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x1x0123); + vacc1x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x1x4567); + vacc1x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x1x89AB); + vacc1x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x1xCDEF); + vacc2x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x1x0123); + vacc2x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x1x4567); + vacc2x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x1x89AB); + vacc2x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + vacc1x0x0123 = wasm_i32x4_add(vacc1x0x0123, vacc1x1x0123); + vacc1x0x4567 = wasm_i32x4_add(vacc1x0x4567, vacc1x1x4567); + vacc1x0x89AB = wasm_i32x4_add(vacc1x0x89AB, vacc1x1x89AB); + vacc1x0xCDEF = wasm_i32x4_add(vacc1x0xCDEF, vacc1x1xCDEF); + vacc2x0x0123 = wasm_i32x4_add(vacc2x0x0123, vacc2x1x0123); + vacc2x0x4567 = wasm_i32x4_add(vacc2x0x4567, vacc2x1x4567); + vacc2x0x89AB = wasm_i32x4_add(vacc2x0x89AB, vacc2x1x89AB); + vacc2x0xCDEF = wasm_i32x4_add(vacc2x0xCDEF, 
vacc2x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 3 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = 
wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + 
vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = 
wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + 
wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2.c new file mode 100644 index 00000000000..be1b9fd4a15 --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot-u2.c @@ -0,0 +1,346 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (3 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t 
vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } + a += 3; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const 
v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, 
vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 3 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, 
vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + 
vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c2 = (int8_t*) 
((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot.c new file mode 100644 index 00000000000..20468d16bec --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-3x16c4-minmax-fp32-wasmusdot.c @@ -0,0 +1,291 @@ +// Auto-generated file. Do not edit! 
+// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 3); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (3 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = 
wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vacc0x0123 = wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc1x0123= vacc0x0123; + v128_t vacc1x4567= vacc0x4567; + v128_t vacc1x89AB= vacc0x89AB; + v128_t vacc1xCDEF= vacc0xCDEF; + v128_t vacc2x0123= vacc0x0123; + v128_t vacc2x4567= vacc0x4567; + v128_t vacc2x89AB= vacc0x89AB; + v128_t vacc2xCDEF= vacc0xCDEF; + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } + a += 3; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + 
vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123); + vacc1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567); + vacc1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB); + vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF); + vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123); + vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x4567); + vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB); + vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 3 * sizeof(void*); + } while (p != 0); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB); + vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB); + vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, 
vscale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); + vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vscale89AB); + vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vscaleCDEF); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); + vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vscale89AB); + vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vmagic_bias); + vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vmagic_bias); + vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vmagic_bias); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); + vacc2x4567 = wasm_f32x4_add(vacc2x4567, vmagic_bias); + vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vmagic_bias); + vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vmagic_bias); + + vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); + vacc1x4567 = wasm_i32x4_max(vacc1x4567, vmagic_min); + vacc1x89AB = wasm_i32x4_max(vacc1x89AB, vmagic_min); + vacc1xCDEF = wasm_i32x4_max(vacc1xCDEF, vmagic_min); + vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); + vacc2x4567 = wasm_i32x4_max(vacc2x4567, vmagic_min); + vacc2x89AB = wasm_i32x4_max(vacc2x89AB, vmagic_min); + vacc2xCDEF = wasm_i32x4_max(vacc2xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point); + vacc0xCDEF = 
wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); + vacc1x4567 = wasm_i32x4_sub(vacc1x4567, vmagic_bias_less_output_zero_point); + vacc1x89AB = wasm_i32x4_sub(vacc1x89AB, vmagic_bias_less_output_zero_point); + vacc1xCDEF = wasm_i32x4_sub(vacc1xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); + vacc2x4567 = wasm_i32x4_sub(vacc2x4567, vmagic_bias_less_output_zero_point); + vacc2x89AB = wasm_i32x4_sub(vacc2x89AB, vmagic_bias_less_output_zero_point); + vacc2xCDEF = wasm_i32x4_sub(vacc2xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0123, vacc1x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x89AB, vacc1xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x89AB, vacc2xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + + if 
(nc >= 16) { + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c new file mode 100644 index 
00000000000..fbdba44ea9e --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2-acc2.c @@ -0,0 +1,437 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (4 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = 
wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x1x0123= vacc0x1x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x1x4567= vacc0x1x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x1x89AB= vacc0x1x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc1x1xCDEF= vacc0x1xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x1x0123= vacc0x1x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x1x4567= vacc0x1x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x1x89AB= vacc0x1x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x1xCDEF= vacc0x1xCDEF; + v128_t vacc3x0x0123= vacc0x0x0123; + v128_t vacc3x1x0123= vacc0x1x0123; + v128_t vacc3x0x4567= vacc0x0x4567; + v128_t vacc3x1x4567= vacc0x1x4567; + v128_t vacc3x0x89AB= vacc0x0x89AB; + v128_t vacc3x1x89AB= vacc0x1x89AB; + v128_t vacc3x0xCDEF= vacc0x0xCDEF; + v128_t vacc3x1xCDEF= vacc0x1xCDEF; + + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) 
((uintptr_t) a2 + a_offset); + } + const int8_t* restrict a3 = a[3]; + if XNN_UNPREDICTABLE(a3 != zero) { + a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); + } + a += 4; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + v128_t va3x0x0123 = wasm_v128_load32_splat(a3); + v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4); + a3 += 8; + + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + vacc1x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x1x0123); + vacc1x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x1x4567); + vacc1x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x1x89AB); + vacc1x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x1xCDEF); + vacc2x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x1x0123); + vacc2x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x1x4567); + vacc2x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x1x89AB); + vacc2x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x1xCDEF); + vacc3x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x1x0123); + vacc3x1x4567 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x1x4567); + vacc3x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x1x89AB); + vacc3x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + vacc1x0x0123 = wasm_i32x4_add(vacc1x0x0123, vacc1x1x0123); + vacc1x0x4567 = wasm_i32x4_add(vacc1x0x4567, vacc1x1x4567); + vacc1x0x89AB = wasm_i32x4_add(vacc1x0x89AB, vacc1x1x89AB); + vacc1x0xCDEF = wasm_i32x4_add(vacc1x0xCDEF, vacc1x1xCDEF); + vacc2x0x0123 = wasm_i32x4_add(vacc2x0x0123, vacc2x1x0123); + vacc2x0x4567 = wasm_i32x4_add(vacc2x0x4567, vacc2x1x4567); + vacc2x0x89AB = wasm_i32x4_add(vacc2x0x89AB, vacc2x1x89AB); + vacc2x0xCDEF = wasm_i32x4_add(vacc2x0xCDEF, vacc2x1xCDEF); + vacc3x0x0123 = wasm_i32x4_add(vacc3x0x0123, vacc3x1x0123); + vacc3x0x4567 = wasm_i32x4_add(vacc3x0x4567, vacc3x1x4567); + vacc3x0x89AB = wasm_i32x4_add(vacc3x0x89AB, vacc3x1x89AB); + vacc3x0xCDEF = wasm_i32x4_add(vacc3x0xCDEF, vacc3x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + 
vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 4 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF 
= wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123); + vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567); + vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB); + vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vscale0123); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vscale4567); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vscale89AB); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + 
vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vmagic_bias); + vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vmagic_bias); + vacc3x0x89AB = wasm_f32x4_add(vacc3x0x89AB, vmagic_bias); + vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + vacc3x0x0123 = wasm_i32x4_max(vacc3x0x0123, vmagic_min); + vacc3x0x4567 = wasm_i32x4_max(vacc3x0x4567, vmagic_min); + vacc3x0x89AB = wasm_i32x4_max(vacc3x0x89AB, vmagic_min); + vacc3x0xCDEF = wasm_i32x4_max(vacc3x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, 
vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0x0123 = wasm_i32x4_sub(vacc3x0x0123, vmagic_bias_less_output_zero_point); + vacc3x0x4567 = wasm_i32x4_sub(vacc3x0x4567, vmagic_bias_less_output_zero_point); + vacc3x0x89AB = wasm_i32x4_sub(vacc3x0x89AB, vmagic_bias_less_output_zero_point); + vacc3x0xCDEF = wasm_i32x4_sub(vacc3x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0x0123, vacc3x0x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x0x89AB, vacc3x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = 
wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + 
vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2.c new file mode 100644 index 00000000000..c2241cff4ab --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot-u2.c @@ -0,0 +1,405 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (4 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t 
vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc3x0x0123= vacc0x0x0123; + v128_t vacc3x0x4567= vacc0x0x4567; + v128_t vacc3x0x89AB= vacc0x0x89AB; + v128_t vacc3x0xCDEF= vacc0x0xCDEF; + + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } + const int8_t* restrict a3 = a[3]; + if XNN_UNPREDICTABLE(a3 != zero) { + a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); + } + a += 4; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + v128_t va3x0x0123 = wasm_v128_load32_splat(a3); + v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4); + a3 += 8; + + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + 
const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + 
vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 4 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123); + 
vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567); + vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB); + vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vscale0123); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vscale4567); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vscale89AB); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = 
wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vmagic_bias); + vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vmagic_bias); + vacc3x0x89AB = wasm_f32x4_add(vacc3x0x89AB, vmagic_bias); + vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + vacc3x0x0123 = wasm_i32x4_max(vacc3x0x0123, vmagic_min); + vacc3x0x4567 = wasm_i32x4_max(vacc3x0x4567, vmagic_min); + vacc3x0x89AB = wasm_i32x4_max(vacc3x0x89AB, vmagic_min); + vacc3x0xCDEF = wasm_i32x4_max(vacc3x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, 
vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0x0123 = wasm_i32x4_sub(vacc3x0x0123, vmagic_bias_less_output_zero_point); + vacc3x0x4567 = wasm_i32x4_sub(vacc3x0x4567, vmagic_bias_less_output_zero_point); + vacc3x0x89AB = wasm_i32x4_sub(vacc3x0x89AB, vmagic_bias_less_output_zero_point); + vacc3x0xCDEF = wasm_i32x4_sub(vacc3x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0x0123, vacc3x0x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x0x89AB, vacc3x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, 
voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + 
vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot.c new file mode 100644 index 00000000000..9bf0a687730 --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmsdot.c @@ -0,0 +1,345 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (4 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + do { + v128_t vacc0x0123 = wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = 
wasm_v128_load((const int32_t*) w + 12); + + v128_t vacc1x0123= vacc0x0123; + v128_t vacc1x4567= vacc0x4567; + v128_t vacc1x89AB= vacc0x89AB; + v128_t vacc1xCDEF= vacc0xCDEF; + v128_t vacc2x0123= vacc0x0123; + v128_t vacc2x4567= vacc0x4567; + v128_t vacc2x89AB= vacc0x89AB; + v128_t vacc2xCDEF= vacc0xCDEF; + v128_t vacc3x0123= vacc0x0123; + v128_t vacc3x4567= vacc0x4567; + v128_t vacc3x89AB= vacc0x89AB; + v128_t vacc3xCDEF= vacc0xCDEF; + + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } + const int8_t* restrict a3 = a[3]; + if XNN_UNPREDICTABLE(a3 != zero) { + a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); + } + a += 4; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123); + vacc1x4567 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567); + vacc1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB); + vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF); + vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123); + vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x4567); + vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB); + vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF); + vacc3x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0123); + vacc3x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x4567); + vacc3x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x89AB); + vacc3xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 4 * sizeof(void*); + } while (p != 0); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB); + vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB); + vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF); + vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); + vacc3x4567 = wasm_f32x4_convert_i32x4(vacc3x4567); + vacc3x89AB = wasm_f32x4_convert_i32x4(vacc3x89AB); + vacc3xCDEF = wasm_f32x4_convert_i32x4(vacc3xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = 
wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); + vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vscale89AB); + vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vscaleCDEF); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); + vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vscale89AB); + vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vscaleCDEF); + vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); + vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); + vacc3x89AB = wasm_f32x4_mul(vacc3x89AB, vscale89AB); + vacc3xCDEF = wasm_f32x4_mul(vacc3xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vmagic_bias); + vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vmagic_bias); + vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vmagic_bias); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); + vacc2x4567 = wasm_f32x4_add(vacc2x4567, vmagic_bias); + vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vmagic_bias); + vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vmagic_bias); + vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias); + vacc3x4567 = wasm_f32x4_add(vacc3x4567, vmagic_bias); + vacc3x89AB = wasm_f32x4_add(vacc3x89AB, vmagic_bias); + vacc3xCDEF = wasm_f32x4_add(vacc3xCDEF, vmagic_bias); + + vacc0x0123 = 
wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); + vacc1x4567 = wasm_i32x4_max(vacc1x4567, vmagic_min); + vacc1x89AB = wasm_i32x4_max(vacc1x89AB, vmagic_min); + vacc1xCDEF = wasm_i32x4_max(vacc1xCDEF, vmagic_min); + vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); + vacc2x4567 = wasm_i32x4_max(vacc2x4567, vmagic_min); + vacc2x89AB = wasm_i32x4_max(vacc2x89AB, vmagic_min); + vacc2xCDEF = wasm_i32x4_max(vacc2xCDEF, vmagic_min); + vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min); + vacc3x4567 = wasm_i32x4_max(vacc3x4567, vmagic_min); + vacc3x89AB = wasm_i32x4_max(vacc3x89AB, vmagic_min); + vacc3xCDEF = wasm_i32x4_max(vacc3xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point); + vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); + vacc1x4567 = wasm_i32x4_sub(vacc1x4567, vmagic_bias_less_output_zero_point); + vacc1x89AB = wasm_i32x4_sub(vacc1x89AB, vmagic_bias_less_output_zero_point); + vacc1xCDEF = wasm_i32x4_sub(vacc1xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); + vacc2x4567 = wasm_i32x4_sub(vacc2x4567, vmagic_bias_less_output_zero_point); + vacc2x89AB = wasm_i32x4_sub(vacc2x89AB, vmagic_bias_less_output_zero_point); + vacc2xCDEF = wasm_i32x4_sub(vacc2xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point); + vacc3x4567 = wasm_i32x4_sub(vacc3x4567, vmagic_bias_less_output_zero_point); + vacc3x89AB = 
wasm_i32x4_sub(vacc3x89AB, vmagic_bias_less_output_zero_point); + vacc3xCDEF = wasm_i32x4_sub(vacc3xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0123, vacc1x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x89AB, vacc1xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x89AB, vacc2xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x89AB, vacc3xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + 
wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c3, vacc3x01234567, 
0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c new file mode 100644 index 00000000000..f7495d81236 --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2-acc2.c @@ -0,0 +1,449 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (4 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t 
output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc0x1x0123 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x4567 = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1x89AB = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc0x1xCDEF = wasm_u32x4_const(0, 0, 0, 0); + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x1x0123= vacc0x1x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x1x4567= vacc0x1x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x1x89AB= vacc0x1x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc1x1xCDEF= vacc0x1xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x1x0123= vacc0x1x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x1x4567= vacc0x1x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x1x89AB= vacc0x1x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x1xCDEF= vacc0x1xCDEF; + v128_t vacc3x0x0123= vacc0x0x0123; + v128_t vacc3x1x0123= vacc0x1x0123; + v128_t vacc3x0x4567= vacc0x0x4567; + v128_t vacc3x1x4567= vacc0x1x4567; + v128_t vacc3x0x89AB= vacc0x0x89AB; + v128_t 
vacc3x1x89AB= vacc0x1x89AB; + v128_t vacc3x0xCDEF= vacc0x0xCDEF; + v128_t vacc3x1xCDEF= vacc0x1xCDEF; + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } + const int8_t* restrict a3 = a[3]; + if XNN_UNPREDICTABLE(a3 != zero) { + a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); + } + a += 4; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + v128_t va3x0x0123 = wasm_v128_load32_splat(a3); + v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4); + a3 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + va3x0x0123 = wasm_v128_xor(va3x0x0123, vsign_mask); + va3x1x0123 = wasm_v128_xor(va3x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = 
wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF); + vacc0x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x1x0123); + vacc0x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x1x4567); + vacc0x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x1x89AB); + vacc0x1xCDEF = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x1xCDEF); + vacc1x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x1x0123); + vacc1x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x1x4567); + vacc1x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x1x89AB); + vacc1x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x1xCDEF); + vacc2x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x1x0123); + vacc2x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x1x4567); + vacc2x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x1x89AB); + vacc2x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x1xCDEF); + vacc3x1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x1x0123); + vacc3x1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x1x4567); + vacc3x1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x1x89AB); + vacc3x1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x1xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + vacc0x0x0123 = wasm_i32x4_add(vacc0x0x0123, vacc0x1x0123); + vacc0x0x4567 = wasm_i32x4_add(vacc0x0x4567, vacc0x1x4567); + vacc0x0x89AB = wasm_i32x4_add(vacc0x0x89AB, vacc0x1x89AB); + vacc0x0xCDEF = wasm_i32x4_add(vacc0x0xCDEF, vacc0x1xCDEF); + vacc1x0x0123 = wasm_i32x4_add(vacc1x0x0123, vacc1x1x0123); + vacc1x0x4567 = wasm_i32x4_add(vacc1x0x4567, vacc1x1x4567); + vacc1x0x89AB = wasm_i32x4_add(vacc1x0x89AB, vacc1x1x89AB); + vacc1x0xCDEF = wasm_i32x4_add(vacc1x0xCDEF, vacc1x1xCDEF); + vacc2x0x0123 = wasm_i32x4_add(vacc2x0x0123, vacc2x1x0123); + vacc2x0x4567 = wasm_i32x4_add(vacc2x0x4567, vacc2x1x4567); + vacc2x0x89AB = wasm_i32x4_add(vacc2x0x89AB, vacc2x1x89AB); + vacc2x0xCDEF = wasm_i32x4_add(vacc2x0xCDEF, vacc2x1xCDEF); + 
vacc3x0x0123 = wasm_i32x4_add(vacc3x0x0123, vacc3x1x0123); + vacc3x0x4567 = wasm_i32x4_add(vacc3x0x4567, vacc3x1x4567); + vacc3x0x89AB = wasm_i32x4_add(vacc3x0x89AB, vacc3x1x89AB); + vacc3x0xCDEF = wasm_i32x4_add(vacc3x0xCDEF, vacc3x1xCDEF); + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + va3x0123 = wasm_v128_xor(va3x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, 
va2x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 4 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123); + vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567); + vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB); + vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, 
vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vscale0123); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vscale4567); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vscale89AB); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vmagic_bias); + vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vmagic_bias); + vacc3x0x89AB = wasm_f32x4_add(vacc3x0x89AB, vmagic_bias); + vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = 
wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + vacc3x0x0123 = wasm_i32x4_max(vacc3x0x0123, vmagic_min); + vacc3x0x4567 = wasm_i32x4_max(vacc3x0x4567, vmagic_min); + vacc3x0x89AB = wasm_i32x4_max(vacc3x0x89AB, vmagic_min); + vacc3x0xCDEF = wasm_i32x4_max(vacc3x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0x0123 = wasm_i32x4_sub(vacc3x0x0123, vmagic_bias_less_output_zero_point); + vacc3x0x4567 = wasm_i32x4_sub(vacc3x0x4567, vmagic_bias_less_output_zero_point); + vacc3x0x89AB = wasm_i32x4_sub(vacc3x0x89AB, vmagic_bias_less_output_zero_point); + vacc3x0xCDEF = wasm_i32x4_sub(vacc3x0xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t 
vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0x0123, vacc3x0x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x0x89AB, vacc3x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + 
wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git 
a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c new file mode 100644 index 00000000000..8732f2ee615 --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot-u2.c @@ -0,0 +1,417 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <wasm_simd128.h> + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (4 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + 
const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vacc0x0x0123 = wasm_v128_load(w); + v128_t vacc0x0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0x0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc1x0x0123= vacc0x0x0123; + v128_t vacc1x0x4567= vacc0x0x4567; + v128_t vacc1x0x89AB= vacc0x0x89AB; + v128_t vacc1x0xCDEF= vacc0x0xCDEF; + v128_t vacc2x0x0123= vacc0x0x0123; + v128_t vacc2x0x4567= vacc0x0x4567; + v128_t vacc2x0x89AB= vacc0x0x89AB; + v128_t vacc2x0xCDEF= vacc0x0xCDEF; + v128_t vacc3x0x0123= vacc0x0x0123; + v128_t vacc3x0x4567= vacc0x0x4567; + v128_t vacc3x0x89AB= vacc0x0x89AB; + v128_t vacc3x0xCDEF= vacc0x0xCDEF; + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } + const int8_t* restrict a3 = a[3]; + if XNN_UNPREDICTABLE(a3 != zero) { + a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); + } + a += 4; + + size_t k = kc; + while (k >= 8 * sizeof(int8_t)) { + v128_t va0x0x0123 = wasm_v128_load32_splat(a0); + v128_t va0x1x0123 = wasm_v128_load32_splat(a0 + 4); + a0 += 8; + v128_t va1x0x0123 = wasm_v128_load32_splat(a1); + 
v128_t va1x1x0123 = wasm_v128_load32_splat(a1 + 4); + a1 += 8; + v128_t va2x0x0123 = wasm_v128_load32_splat(a2); + v128_t va2x1x0123 = wasm_v128_load32_splat(a2 + 4); + a2 += 8; + v128_t va3x0x0123 = wasm_v128_load32_splat(a3); + v128_t va3x1x0123 = wasm_v128_load32_splat(a3 + 4); + a3 += 8; + + va0x0x0123 = wasm_v128_xor(va0x0x0123, vsign_mask); + va0x1x0123 = wasm_v128_xor(va0x1x0123, vsign_mask); + va1x0x0123 = wasm_v128_xor(va1x0x0123, vsign_mask); + va1x1x0123 = wasm_v128_xor(va1x1x0123, vsign_mask); + va2x0x0123 = wasm_v128_xor(va2x0x0123, vsign_mask); + va2x1x0123 = wasm_v128_xor(va2x1x0123, vsign_mask); + va3x0x0123 = wasm_v128_xor(va3x0x0123, vsign_mask); + va3x1x0123 = wasm_v128_xor(va3x1x0123, vsign_mask); + + const v128_t vb0x0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb0x4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb0x89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vb0xCDEF = wasm_v128_load((const int8_t*) w + 48); + const v128_t vb1x0123 = wasm_v128_load((const int8_t*) w + 64); + const v128_t vb1x4567 = wasm_v128_load((const int8_t*) w + 80); + const v128_t vb1x89AB = wasm_v128_load((const int8_t*) w + 96); + const v128_t vb1xCDEF = wasm_v128_load((const int8_t*) w + 112); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va0x0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va0x0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va0x0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va0x0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va1x0x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va1x0x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va1x0x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va1x0x0123, vacc1x0xCDEF); 
+ vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va2x0x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va2x0x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va2x0x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va2x0x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x0123, va3x0x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x4567, va3x0x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0x89AB, va3x0x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0xCDEF, va3x0x0123, vacc3x0xCDEF); + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va0x1x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va0x1x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va0x1x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va0x1x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va1x1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va1x1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va1x1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va1x1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va2x1x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va2x1x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va2x1x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va2x1x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x0123, va3x1x0123, vacc3x0x0123); + vacc3x0x4567 = 
wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x4567, va3x1x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1x89AB, va3x1x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1xCDEF, va3x1x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 128; + k -= 8 * sizeof(int8_t); + } + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + va3x0123 = wasm_v128_xor(va3x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0x0123); + vacc0x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x0x4567); + vacc0x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x0x89AB); + vacc0x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0x0xCDEF); + vacc1x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0x0123); + vacc1x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x0x4567); + vacc1x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x0x89AB); + vacc1x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1x0xCDEF); + vacc2x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0x0123); + vacc2x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x0x4567); + vacc2x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, 
va2x0123, vacc2x0x89AB); + vacc2x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2x0xCDEF); + vacc3x0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0x0123); + vacc3x0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x0x4567); + vacc3x0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x0x89AB); + vacc3x0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3x0xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 4 * sizeof(void*); + } while (p != 0); + + vacc0x0x0123 = wasm_f32x4_convert_i32x4(vacc0x0x0123); + vacc0x0x4567 = wasm_f32x4_convert_i32x4(vacc0x0x4567); + vacc0x0x89AB = wasm_f32x4_convert_i32x4(vacc0x0x89AB); + vacc0x0xCDEF = wasm_f32x4_convert_i32x4(vacc0x0xCDEF); + vacc1x0x0123 = wasm_f32x4_convert_i32x4(vacc1x0x0123); + vacc1x0x4567 = wasm_f32x4_convert_i32x4(vacc1x0x4567); + vacc1x0x89AB = wasm_f32x4_convert_i32x4(vacc1x0x89AB); + vacc1x0xCDEF = wasm_f32x4_convert_i32x4(vacc1x0xCDEF); + vacc2x0x0123 = wasm_f32x4_convert_i32x4(vacc2x0x0123); + vacc2x0x4567 = wasm_f32x4_convert_i32x4(vacc2x0x4567); + vacc2x0x89AB = wasm_f32x4_convert_i32x4(vacc2x0x89AB); + vacc2x0xCDEF = wasm_f32x4_convert_i32x4(vacc2x0xCDEF); + vacc3x0x0123 = wasm_f32x4_convert_i32x4(vacc3x0x0123); + vacc3x0x4567 = wasm_f32x4_convert_i32x4(vacc3x0x4567); + vacc3x0x89AB = wasm_f32x4_convert_i32x4(vacc3x0x89AB); + vacc3x0xCDEF = wasm_f32x4_convert_i32x4(vacc3x0xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0x0123 = wasm_f32x4_mul(vacc0x0x0123, vscale0123); + vacc0x0x4567 = wasm_f32x4_mul(vacc0x0x4567, vscale4567); + vacc0x0x89AB = wasm_f32x4_mul(vacc0x0x89AB, vscale89AB); + vacc0x0xCDEF = 
wasm_f32x4_mul(vacc0x0xCDEF, vscaleCDEF); + vacc1x0x0123 = wasm_f32x4_mul(vacc1x0x0123, vscale0123); + vacc1x0x4567 = wasm_f32x4_mul(vacc1x0x4567, vscale4567); + vacc1x0x89AB = wasm_f32x4_mul(vacc1x0x89AB, vscale89AB); + vacc1x0xCDEF = wasm_f32x4_mul(vacc1x0xCDEF, vscaleCDEF); + vacc2x0x0123 = wasm_f32x4_mul(vacc2x0x0123, vscale0123); + vacc2x0x4567 = wasm_f32x4_mul(vacc2x0x4567, vscale4567); + vacc2x0x89AB = wasm_f32x4_mul(vacc2x0x89AB, vscale89AB); + vacc2x0xCDEF = wasm_f32x4_mul(vacc2x0xCDEF, vscaleCDEF); + vacc3x0x0123 = wasm_f32x4_mul(vacc3x0x0123, vscale0123); + vacc3x0x4567 = wasm_f32x4_mul(vacc3x0x4567, vscale4567); + vacc3x0x89AB = wasm_f32x4_mul(vacc3x0x89AB, vscale89AB); + vacc3x0xCDEF = wasm_f32x4_mul(vacc3x0xCDEF, vscaleCDEF); + + vacc0x0x0123 = wasm_f32x4_add(vacc0x0x0123, vmagic_bias); + vacc0x0x4567 = wasm_f32x4_add(vacc0x0x4567, vmagic_bias); + vacc0x0x89AB = wasm_f32x4_add(vacc0x0x89AB, vmagic_bias); + vacc0x0xCDEF = wasm_f32x4_add(vacc0x0xCDEF, vmagic_bias); + vacc1x0x0123 = wasm_f32x4_add(vacc1x0x0123, vmagic_bias); + vacc1x0x4567 = wasm_f32x4_add(vacc1x0x4567, vmagic_bias); + vacc1x0x89AB = wasm_f32x4_add(vacc1x0x89AB, vmagic_bias); + vacc1x0xCDEF = wasm_f32x4_add(vacc1x0xCDEF, vmagic_bias); + vacc2x0x0123 = wasm_f32x4_add(vacc2x0x0123, vmagic_bias); + vacc2x0x4567 = wasm_f32x4_add(vacc2x0x4567, vmagic_bias); + vacc2x0x89AB = wasm_f32x4_add(vacc2x0x89AB, vmagic_bias); + vacc2x0xCDEF = wasm_f32x4_add(vacc2x0xCDEF, vmagic_bias); + vacc3x0x0123 = wasm_f32x4_add(vacc3x0x0123, vmagic_bias); + vacc3x0x4567 = wasm_f32x4_add(vacc3x0x4567, vmagic_bias); + vacc3x0x89AB = wasm_f32x4_add(vacc3x0x89AB, vmagic_bias); + vacc3x0xCDEF = wasm_f32x4_add(vacc3x0xCDEF, vmagic_bias); + + vacc0x0x0123 = wasm_i32x4_max(vacc0x0x0123, vmagic_min); + vacc0x0x4567 = wasm_i32x4_max(vacc0x0x4567, vmagic_min); + vacc0x0x89AB = wasm_i32x4_max(vacc0x0x89AB, vmagic_min); + vacc0x0xCDEF = wasm_i32x4_max(vacc0x0xCDEF, vmagic_min); + vacc1x0x0123 = wasm_i32x4_max(vacc1x0x0123, 
vmagic_min); + vacc1x0x4567 = wasm_i32x4_max(vacc1x0x4567, vmagic_min); + vacc1x0x89AB = wasm_i32x4_max(vacc1x0x89AB, vmagic_min); + vacc1x0xCDEF = wasm_i32x4_max(vacc1x0xCDEF, vmagic_min); + vacc2x0x0123 = wasm_i32x4_max(vacc2x0x0123, vmagic_min); + vacc2x0x4567 = wasm_i32x4_max(vacc2x0x4567, vmagic_min); + vacc2x0x89AB = wasm_i32x4_max(vacc2x0x89AB, vmagic_min); + vacc2x0xCDEF = wasm_i32x4_max(vacc2x0xCDEF, vmagic_min); + vacc3x0x0123 = wasm_i32x4_max(vacc3x0x0123, vmagic_min); + vacc3x0x4567 = wasm_i32x4_max(vacc3x0x4567, vmagic_min); + vacc3x0x89AB = wasm_i32x4_max(vacc3x0x89AB, vmagic_min); + vacc3x0xCDEF = wasm_i32x4_max(vacc3x0xCDEF, vmagic_min); + + vacc0x0x0123 = wasm_i32x4_sub(vacc0x0x0123, vmagic_bias_less_output_zero_point); + vacc0x0x4567 = wasm_i32x4_sub(vacc0x0x4567, vmagic_bias_less_output_zero_point); + vacc0x0x89AB = wasm_i32x4_sub(vacc0x0x89AB, vmagic_bias_less_output_zero_point); + vacc0x0xCDEF = wasm_i32x4_sub(vacc0x0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0x0123 = wasm_i32x4_sub(vacc1x0x0123, vmagic_bias_less_output_zero_point); + vacc1x0x4567 = wasm_i32x4_sub(vacc1x0x4567, vmagic_bias_less_output_zero_point); + vacc1x0x89AB = wasm_i32x4_sub(vacc1x0x89AB, vmagic_bias_less_output_zero_point); + vacc1x0xCDEF = wasm_i32x4_sub(vacc1x0xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0x0123 = wasm_i32x4_sub(vacc2x0x0123, vmagic_bias_less_output_zero_point); + vacc2x0x4567 = wasm_i32x4_sub(vacc2x0x4567, vmagic_bias_less_output_zero_point); + vacc2x0x89AB = wasm_i32x4_sub(vacc2x0x89AB, vmagic_bias_less_output_zero_point); + vacc2x0xCDEF = wasm_i32x4_sub(vacc2x0xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0x0123 = wasm_i32x4_sub(vacc3x0x0123, vmagic_bias_less_output_zero_point); + vacc3x0x4567 = wasm_i32x4_sub(vacc3x0x4567, vmagic_bias_less_output_zero_point); + vacc3x0x89AB = wasm_i32x4_sub(vacc3x0x89AB, vmagic_bias_less_output_zero_point); + vacc3x0xCDEF = wasm_i32x4_sub(vacc3x0xCDEF, vmagic_bias_less_output_zero_point); + + 
v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0x0123, vacc0x0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x0x89AB, vacc0x0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0x0123, vacc1x0x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x0x89AB, vacc1x0xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0x0123, vacc2x0x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x0x89AB, vacc2x0xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0x0123, vacc3x0x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x0x89AB, vacc3x0xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + 
wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + 
wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c new file mode 100644 index 00000000000..12e9c533d5b --- /dev/null +++ b/src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x16c4-minmax-fp32-wasmusdot.c @@ -0,0 +1,349 @@ +// Auto-generated file. Do not edit! +// Template: src/qs8-igemm/MRx16c4-wasmdot.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include <assert.h> + +#include <wasm_simd128.h> + +#include "xnnpack/gemm.h" +#include "xnnpack/math.h" + + +void xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot( + size_t mr, + size_t nc, + size_t kc, + size_t ks, + const int8_t** restrict a, + const void* restrict w, + int8_t* restrict c, + size_t cm_stride, + size_t cn_stride, + size_t a_offset, + const int8_t* zero, + const union xnn_qs8_qc8w_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(ks != 0); + assert(ks % (4 * sizeof(void*)) == 0); + assert(a_offset % sizeof(int8_t) == 0); + assert(a != NULL); + assert(w != NULL); + assert(c != NULL); + + kc = round_up_po2(kc, 4 * sizeof(int8_t)); + int8_t* c0 = c; + int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const v128_t vmagic_bias = wasm_f32x4_const_splat(12582912.0f); + const int32_t output_min_less_zero_point = (int32_t) params->fp32_scalar.output_min - (int32_t) params->fp32_scalar.output_zero_point; + const v128_t vmagic_min = 
wasm_i32x4_splat((int32_t) float_as_uint32(12582912.0f + output_min_less_zero_point)); + const v128_t vmagic_bias_less_output_zero_point = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) params->fp32_scalar.output_zero_point); + const v128_t voutput_max = wasm_i8x16_splat(params->fp32_scalar.output_max); + //XNN_FORCE_REALIZATION(vmagic_bias); + //XNN_FORCE_REALIZATION(vmagic_min); + //XNN_FORCE_REALIZATION(vmagic_bias_less_output_zero_point); + //XNN_FORCE_REALIZATION(voutput_max); + + const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80)); + XNN_FORCE_REALIZATION(vsign_mask); + do { + v128_t vacc0x0123 = wasm_v128_load(w); + v128_t vacc0x4567 = wasm_v128_load((const int32_t*) w + 4); + v128_t vacc0x89AB = wasm_v128_load((const int32_t*) w + 8); + v128_t vacc0xCDEF = wasm_v128_load((const int32_t*) w + 12); + v128_t vacc1x0123= vacc0x0123; + v128_t vacc1x4567= vacc0x4567; + v128_t vacc1x89AB= vacc0x89AB; + v128_t vacc1xCDEF= vacc0xCDEF; + v128_t vacc2x0123= vacc0x0123; + v128_t vacc2x4567= vacc0x4567; + v128_t vacc2x89AB= vacc0x89AB; + v128_t vacc2xCDEF= vacc0xCDEF; + v128_t vacc3x0123= vacc0x0123; + v128_t vacc3x4567= vacc0x4567; + v128_t vacc3x89AB= vacc0x89AB; + v128_t vacc3xCDEF= vacc0xCDEF; + w = (const int32_t*) w + 16; + + size_t p = ks; + do { + const int8_t* restrict a0 = a[0]; + if XNN_UNPREDICTABLE(a0 != zero) { + a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); + } + const int8_t* restrict a1 = a[1]; + if XNN_UNPREDICTABLE(a1 != zero) { + a1 = (const int8_t*) ((uintptr_t) a1 + a_offset); + } + const int8_t* restrict a2 = a[2]; + if XNN_UNPREDICTABLE(a2 != zero) { + a2 = (const int8_t*) ((uintptr_t) a2 + a_offset); + } + const int8_t* restrict a3 = a[3]; + if XNN_UNPREDICTABLE(a3 != zero) { + a3 = (const int8_t*) ((uintptr_t) a3 + a_offset); + } + a += 4; + + size_t k = kc; + + while (k != 0) { + v128_t va0x0123 = wasm_v128_load32_splat(a0); + a0 += 4; + v128_t va1x0123 = wasm_v128_load32_splat(a1); + a1 += 4; + v128_t va2x0123 = 
wasm_v128_load32_splat(a2); + a2 += 4; + v128_t va3x0123 = wasm_v128_load32_splat(a3); + a3 += 4; + + va0x0123 = wasm_v128_xor(va0x0123, vsign_mask); + va1x0123 = wasm_v128_xor(va1x0123, vsign_mask); + va2x0123 = wasm_v128_xor(va2x0123, vsign_mask); + va3x0123 = wasm_v128_xor(va3x0123, vsign_mask); + + const v128_t vb0123 = wasm_v128_load((const int8_t*) w); + const v128_t vb4567 = wasm_v128_load((const int8_t*) w + 16); + const v128_t vb89AB = wasm_v128_load((const int8_t*) w + 32); + const v128_t vbCDEF = wasm_v128_load((const int8_t*) w + 48); + + vacc0x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va0x0123, vacc0x0123); + vacc0x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va0x0123, vacc0x4567); + vacc0x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va0x0123, vacc0x89AB); + vacc0xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va0x0123, vacc0xCDEF); + vacc1x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va1x0123, vacc1x0123); + vacc1x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va1x0123, vacc1x4567); + vacc1x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va1x0123, vacc1x89AB); + vacc1xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va1x0123, vacc1xCDEF); + vacc2x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va2x0123, vacc2x0123); + vacc2x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va2x0123, vacc2x4567); + vacc2x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va2x0123, vacc2x89AB); + vacc2xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va2x0123, vacc2xCDEF); + vacc3x0123 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0123, va3x0123, vacc3x0123); + vacc3x4567 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb4567, va3x0123, vacc3x4567); + vacc3x89AB = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb89AB, va3x0123, vacc3x89AB); + vacc3xCDEF = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vbCDEF, va3x0123, vacc3xCDEF); + + w = (const int8_t*) w + 64; + k -= 4 * sizeof(int8_t); + } + + p -= 4 * 
sizeof(void*); + } while (p != 0); + + vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123); + vacc0x4567 = wasm_f32x4_convert_i32x4(vacc0x4567); + vacc0x89AB = wasm_f32x4_convert_i32x4(vacc0x89AB); + vacc0xCDEF = wasm_f32x4_convert_i32x4(vacc0xCDEF); + vacc1x0123 = wasm_f32x4_convert_i32x4(vacc1x0123); + vacc1x4567 = wasm_f32x4_convert_i32x4(vacc1x4567); + vacc1x89AB = wasm_f32x4_convert_i32x4(vacc1x89AB); + vacc1xCDEF = wasm_f32x4_convert_i32x4(vacc1xCDEF); + vacc2x0123 = wasm_f32x4_convert_i32x4(vacc2x0123); + vacc2x4567 = wasm_f32x4_convert_i32x4(vacc2x4567); + vacc2x89AB = wasm_f32x4_convert_i32x4(vacc2x89AB); + vacc2xCDEF = wasm_f32x4_convert_i32x4(vacc2xCDEF); + vacc3x0123 = wasm_f32x4_convert_i32x4(vacc3x0123); + vacc3x4567 = wasm_f32x4_convert_i32x4(vacc3x4567); + vacc3x89AB = wasm_f32x4_convert_i32x4(vacc3x89AB); + vacc3xCDEF = wasm_f32x4_convert_i32x4(vacc3xCDEF); + + const v128_t vscale0123 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale4567 = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscale89AB = wasm_v128_load(w); + w = (const float*) w + 4; + const v128_t vscaleCDEF = wasm_v128_load(w); + w = (const float*) w + 4; + vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vscale0123); + vacc0x4567 = wasm_f32x4_mul(vacc0x4567, vscale4567); + vacc0x89AB = wasm_f32x4_mul(vacc0x89AB, vscale89AB); + vacc0xCDEF = wasm_f32x4_mul(vacc0xCDEF, vscaleCDEF); + vacc1x0123 = wasm_f32x4_mul(vacc1x0123, vscale0123); + vacc1x4567 = wasm_f32x4_mul(vacc1x4567, vscale4567); + vacc1x89AB = wasm_f32x4_mul(vacc1x89AB, vscale89AB); + vacc1xCDEF = wasm_f32x4_mul(vacc1xCDEF, vscaleCDEF); + vacc2x0123 = wasm_f32x4_mul(vacc2x0123, vscale0123); + vacc2x4567 = wasm_f32x4_mul(vacc2x4567, vscale4567); + vacc2x89AB = wasm_f32x4_mul(vacc2x89AB, vscale89AB); + vacc2xCDEF = wasm_f32x4_mul(vacc2xCDEF, vscaleCDEF); + vacc3x0123 = wasm_f32x4_mul(vacc3x0123, vscale0123); + vacc3x4567 = wasm_f32x4_mul(vacc3x4567, vscale4567); + vacc3x89AB = wasm_f32x4_mul(vacc3x89AB, 
vscale89AB); + vacc3xCDEF = wasm_f32x4_mul(vacc3xCDEF, vscaleCDEF); + + vacc0x0123 = wasm_f32x4_add(vacc0x0123, vmagic_bias); + vacc0x4567 = wasm_f32x4_add(vacc0x4567, vmagic_bias); + vacc0x89AB = wasm_f32x4_add(vacc0x89AB, vmagic_bias); + vacc0xCDEF = wasm_f32x4_add(vacc0xCDEF, vmagic_bias); + vacc1x0123 = wasm_f32x4_add(vacc1x0123, vmagic_bias); + vacc1x4567 = wasm_f32x4_add(vacc1x4567, vmagic_bias); + vacc1x89AB = wasm_f32x4_add(vacc1x89AB, vmagic_bias); + vacc1xCDEF = wasm_f32x4_add(vacc1xCDEF, vmagic_bias); + vacc2x0123 = wasm_f32x4_add(vacc2x0123, vmagic_bias); + vacc2x4567 = wasm_f32x4_add(vacc2x4567, vmagic_bias); + vacc2x89AB = wasm_f32x4_add(vacc2x89AB, vmagic_bias); + vacc2xCDEF = wasm_f32x4_add(vacc2xCDEF, vmagic_bias); + vacc3x0123 = wasm_f32x4_add(vacc3x0123, vmagic_bias); + vacc3x4567 = wasm_f32x4_add(vacc3x4567, vmagic_bias); + vacc3x89AB = wasm_f32x4_add(vacc3x89AB, vmagic_bias); + vacc3xCDEF = wasm_f32x4_add(vacc3xCDEF, vmagic_bias); + + vacc0x0123 = wasm_i32x4_max(vacc0x0123, vmagic_min); + vacc0x4567 = wasm_i32x4_max(vacc0x4567, vmagic_min); + vacc0x89AB = wasm_i32x4_max(vacc0x89AB, vmagic_min); + vacc0xCDEF = wasm_i32x4_max(vacc0xCDEF, vmagic_min); + vacc1x0123 = wasm_i32x4_max(vacc1x0123, vmagic_min); + vacc1x4567 = wasm_i32x4_max(vacc1x4567, vmagic_min); + vacc1x89AB = wasm_i32x4_max(vacc1x89AB, vmagic_min); + vacc1xCDEF = wasm_i32x4_max(vacc1xCDEF, vmagic_min); + vacc2x0123 = wasm_i32x4_max(vacc2x0123, vmagic_min); + vacc2x4567 = wasm_i32x4_max(vacc2x4567, vmagic_min); + vacc2x89AB = wasm_i32x4_max(vacc2x89AB, vmagic_min); + vacc2xCDEF = wasm_i32x4_max(vacc2xCDEF, vmagic_min); + vacc3x0123 = wasm_i32x4_max(vacc3x0123, vmagic_min); + vacc3x4567 = wasm_i32x4_max(vacc3x4567, vmagic_min); + vacc3x89AB = wasm_i32x4_max(vacc3x89AB, vmagic_min); + vacc3xCDEF = wasm_i32x4_max(vacc3xCDEF, vmagic_min); + + vacc0x0123 = wasm_i32x4_sub(vacc0x0123, vmagic_bias_less_output_zero_point); + vacc0x4567 = wasm_i32x4_sub(vacc0x4567, 
vmagic_bias_less_output_zero_point); + vacc0x89AB = wasm_i32x4_sub(vacc0x89AB, vmagic_bias_less_output_zero_point); + vacc0xCDEF = wasm_i32x4_sub(vacc0xCDEF, vmagic_bias_less_output_zero_point); + vacc1x0123 = wasm_i32x4_sub(vacc1x0123, vmagic_bias_less_output_zero_point); + vacc1x4567 = wasm_i32x4_sub(vacc1x4567, vmagic_bias_less_output_zero_point); + vacc1x89AB = wasm_i32x4_sub(vacc1x89AB, vmagic_bias_less_output_zero_point); + vacc1xCDEF = wasm_i32x4_sub(vacc1xCDEF, vmagic_bias_less_output_zero_point); + vacc2x0123 = wasm_i32x4_sub(vacc2x0123, vmagic_bias_less_output_zero_point); + vacc2x4567 = wasm_i32x4_sub(vacc2x4567, vmagic_bias_less_output_zero_point); + vacc2x89AB = wasm_i32x4_sub(vacc2x89AB, vmagic_bias_less_output_zero_point); + vacc2xCDEF = wasm_i32x4_sub(vacc2xCDEF, vmagic_bias_less_output_zero_point); + vacc3x0123 = wasm_i32x4_sub(vacc3x0123, vmagic_bias_less_output_zero_point); + vacc3x4567 = wasm_i32x4_sub(vacc3x4567, vmagic_bias_less_output_zero_point); + vacc3x89AB = wasm_i32x4_sub(vacc3x89AB, vmagic_bias_less_output_zero_point); + vacc3xCDEF = wasm_i32x4_sub(vacc3xCDEF, vmagic_bias_less_output_zero_point); + + v128_t vacc0x01234567 = wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x4567); + v128_t vacc0x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc0x89AB, vacc0xCDEF); + v128_t vacc1x01234567 = wasm_i16x8_narrow_i32x4(vacc1x0123, vacc1x4567); + v128_t vacc1x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc1x89AB, vacc1xCDEF); + v128_t vacc2x01234567 = wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x4567); + v128_t vacc2x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc2x89AB, vacc2xCDEF); + v128_t vacc3x01234567 = wasm_i16x8_narrow_i32x4(vacc3x0123, vacc3x4567); + v128_t vacc3x89ABCDEF = wasm_i16x8_narrow_i32x4(vacc3x89AB, vacc3xCDEF); + + vacc0x01234567 = wasm_i8x16_narrow_i16x8(vacc0x01234567, vacc0x01234567); + vacc0x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc0x89ABCDEF, vacc0x89ABCDEF); + vacc1x01234567 = wasm_i8x16_narrow_i16x8(vacc1x01234567, vacc1x01234567); + vacc1x89ABCDEF = 
wasm_i8x16_narrow_i16x8(vacc1x89ABCDEF, vacc1x89ABCDEF); + vacc2x01234567 = wasm_i8x16_narrow_i16x8(vacc2x01234567, vacc2x01234567); + vacc2x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc2x89ABCDEF, vacc2x89ABCDEF); + vacc3x01234567 = wasm_i8x16_narrow_i16x8(vacc3x01234567, vacc3x01234567); + vacc3x89ABCDEF = wasm_i8x16_narrow_i16x8(vacc3x89ABCDEF, vacc3x89ABCDEF); + + vacc0x01234567 = wasm_i8x16_min(vacc0x01234567, voutput_max); + vacc0x89ABCDEF = wasm_i8x16_min(vacc0x89ABCDEF, voutput_max); + vacc1x01234567 = wasm_i8x16_min(vacc1x01234567, voutput_max); + vacc1x89ABCDEF = wasm_i8x16_min(vacc1x89ABCDEF, voutput_max); + vacc2x01234567 = wasm_i8x16_min(vacc2x01234567, voutput_max); + vacc2x89ABCDEF = wasm_i8x16_min(vacc2x89ABCDEF, voutput_max); + vacc3x01234567 = wasm_i8x16_min(vacc3x01234567, voutput_max); + vacc3x89ABCDEF = wasm_i8x16_min(vacc3x89ABCDEF, voutput_max); + + if (nc >= 16) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + wasm_v128_store64_lane(c3 + 8, vacc3x89ABCDEF, 0); + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + wasm_v128_store64_lane(c2 + 8, vacc2x89ABCDEF, 0); + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + wasm_v128_store64_lane(c1 + 8, vacc1x89ABCDEF, 0); + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + wasm_v128_store64_lane(c0 + 8, vacc0x89ABCDEF, 0); + + c3 = (int8_t*) ((uintptr_t) c3 + cn_stride); + c2 = (int8_t*) ((uintptr_t) c2 + cn_stride); + c1 = (int8_t*) ((uintptr_t) c1 + cn_stride); + c0 = (int8_t*) ((uintptr_t) c0 + cn_stride); + + a = (const int8_t**restrict) ((uintptr_t) a - ks); + + nc -= 16; + } else { + if (nc & 8) { + wasm_v128_store64_lane(c3, vacc3x01234567, 0); + c3 += 8; + wasm_v128_store64_lane(c2, vacc2x01234567, 0); + c2 += 8; + wasm_v128_store64_lane(c1, vacc1x01234567, 0); + c1 += 8; + wasm_v128_store64_lane(c0, vacc0x01234567, 0); + c0 += 8; + + vacc0x01234567 = vacc0x89ABCDEF; + vacc1x01234567 = vacc1x89ABCDEF; + vacc2x01234567 = vacc2x89ABCDEF; + vacc3x01234567 = vacc3x89ABCDEF; + } + if (nc & 4) { + 
wasm_v128_store32_lane(c3, vacc3x01234567, 0); + c3 += 4; + wasm_v128_store32_lane(c2, vacc2x01234567, 0); + c2 += 4; + wasm_v128_store32_lane(c1, vacc1x01234567, 0); + c1 += 4; + wasm_v128_store32_lane(c0, vacc0x01234567, 0); + c0 += 4; + + vacc0x01234567 = wasm_u64x2_shr(vacc0x01234567, 32); + vacc1x01234567 = wasm_u64x2_shr(vacc1x01234567, 32); + vacc2x01234567 = wasm_u64x2_shr(vacc2x01234567, 32); + vacc3x01234567 = wasm_u64x2_shr(vacc3x01234567, 32); + } + if (nc & 2) { + wasm_v128_store16_lane(c3, vacc3x01234567, 0); + c3 += 2; + wasm_v128_store16_lane(c2, vacc2x01234567, 0); + c2 += 2; + wasm_v128_store16_lane(c1, vacc1x01234567, 0); + c1 += 2; + wasm_v128_store16_lane(c0, vacc0x01234567, 0); + c0 += 2; + + vacc0x01234567 = wasm_u32x4_shr(vacc0x01234567, 16); + vacc1x01234567 = wasm_u32x4_shr(vacc1x01234567, 16); + vacc2x01234567 = wasm_u32x4_shr(vacc2x01234567, 16); + vacc3x01234567 = wasm_u32x4_shr(vacc3x01234567, 16); + } + if (nc & 1) { + wasm_v128_store8_lane(c3, vacc3x01234567, 0); + wasm_v128_store8_lane(c2, vacc2x01234567, 0); + wasm_v128_store8_lane(c1, vacc1x01234567, 0); + wasm_v128_store8_lane(c0, vacc0x01234567, 0); + } + + nc = 0; + } + } while (nc != 0); +} diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h index 81dbf782393..2095fc270ef 100644 --- a/src/xnnpack/gemm.h +++ b/src/xnnpack/gemm.h @@ -2719,6 +2719,12 @@ DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_u DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__wasmsdot_u2) DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__wasmsdot_u2) +DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot) +DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot) + +DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot_u2) 
+DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot_u2) + DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x2__scalar) DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4__scalar) DECLARE_QD8_F32_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8__scalar) @@ -3155,6 +3161,30 @@ DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_uker DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__wasmsdot_u2) DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot) + +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2) + +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2) + +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot) 
+DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot) + +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2) + +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2) +DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2) + DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64) DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64) DECLARE_QS8_QC8W_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64) diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h index 2bf80d6af48..59c0d69f6ee 100644 --- a/src/xnnpack/igemm.h +++ b/src/xnnpack/igemm.h @@ -978,6 +978,12 @@ DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__wasmsdot_u2) DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__wasmsdot_u2) +DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot) +DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__wasmusdot) + +DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot_u2) 
+DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__wasmusdot_u2) + DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c2__wasmsimd_dot16x2_ld64) DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c2__wasmsimd_dot16x2_ld128) DECLARE_QD8_F32_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64) @@ -1484,6 +1490,30 @@ DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_uk DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x8c8__wasmsdot_u2) DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__wasmsdot_u2) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot) + +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2) + +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2) + +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot) 
+DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot) + +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2) + +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2) +DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2) + DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic) DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic) DECLARE_QS8_QC8W_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic) diff --git a/test/qd8-f32-qc8w-gemm-minmax-2.cc b/test/qd8-f32-qc8w-gemm-minmax-2.cc index 3a3f147e128..10fa5fae734 100644 --- a/test/qd8-f32-qc8w-gemm-minmax-2.cc +++ b/test/qd8-f32-qc8w-gemm-minmax-2.cc @@ -1718,6 +1718,44 @@ std::vector CreateTests1( [](const testing::TestParamInfo& info) { return info.param.test_name; }); + + INSTANTIATE_TEST_SUITE_P( + QD8_F32_QC8W_GEMM_MINMAX_1X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_gemm_goi_w); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + 
QD8_F32_QC8W_GEMM_MINMAX_4X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_gemm_goi_w); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/qd8-f32-qc8w-gemm-minmax-4.cc b/test/qd8-f32-qc8w-gemm-minmax-4.cc index 059cc492d2e..c282bd8041a 100644 --- a/test/qd8-f32-qc8w-gemm-minmax-4.cc +++ b/test/qd8-f32-qc8w-gemm-minmax-4.cc @@ -2318,6 +2318,25 @@ std::vector CreateTests1( [](const testing::TestParamInfo& info) { return info.param.test_name; }); + + INSTANTIATE_TEST_SUITE_P( + QD8_F32_QC8W_GEMM_MINMAX_1X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot_u2, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_gemm_goi_w); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/qd8-f32-qc8w-gemm-minmax.cc b/test/qd8-f32-qc8w-gemm-minmax.cc index 0011ba64ad2..d59f9299c97 100644 --- a/test/qd8-f32-qc8w-gemm-minmax.cc +++ b/test/qd8-f32-qc8w-gemm-minmax.cc @@ -1718,6 +1718,25 @@ std::vector CreateTests1( [](const testing::TestParamInfo& info) { return info.param.test_name; }); + + INSTANTIATE_TEST_SUITE_P( + QD8_F32_QC8W_GEMM_MINMAX_4X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& 
tester) { + tester.Test(xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot_u2, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_gemm_goi_w); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/qd8-f32-qc8w-gemm-minmax.yaml b/test/qd8-f32-qc8w-gemm-minmax.yaml index b482d0ebb82..13e249757c3 100644 --- a/test/qd8-f32-qc8w-gemm-minmax.yaml +++ b/test/qd8-f32-qc8w-gemm-minmax.yaml @@ -1102,6 +1102,23 @@ pack: xnn_pack_qs8_gemm_goi_w k-block: 8 +- name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_qs8_gemm_goi_w + k-block: 8 +- name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__wasmusdot_u2 + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_qs8_gemm_goi_w + k-block: 8 +- name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_qs8_gemm_goi_w + k-block: 8 +- name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__wasmusdot_u2 + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_qs8_gemm_goi_w + k-block: 8 + # WAsm - name: xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x2__wasm init: xnn_init_f32_minmax_scalar_params diff --git a/test/qd8-f32-qc8w-igemm-minmax-2.cc b/test/qd8-f32-qc8w-igemm-minmax-2.cc index 8a5b368b354..7e0644069e8 100644 --- a/test/qd8-f32-qc8w-igemm-minmax-2.cc +++ b/test/qd8-f32-qc8w-igemm-minmax-2.cc @@ -1226,6 +1226,63 @@ INSTANTIATE_TEST_SUITE_P( #if XNN_ARCH_WASMRELAXEDSIMD + INSTANTIATE_TEST_SUITE_P( + QD8_F32_QC8W_IGEMM_MINMAX_1X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/true, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_conv_goki_w); + }, + []() { + 
TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QD8_F32_QC8W_IGEMM_MINMAX_1X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/true, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot_u2, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_conv_goki_w); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QD8_F32_QC8W_IGEMM_MINMAX_4X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/true, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__wasmusdot, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_conv_goki_w); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + INSTANTIATE_TEST_SUITE_P( QD8_F32_QC8W_IGEMM_MINMAX_3X8C8__WASMUSDOT, GemmTest, testing::ValuesIn(CreateTests1( diff --git a/test/qd8-f32-qc8w-igemm-minmax-3.cc b/test/qd8-f32-qc8w-igemm-minmax-3.cc index c3e7c79afbc..35e41fac7cd 100644 --- a/test/qd8-f32-qc8w-igemm-minmax-3.cc +++ b/test/qd8-f32-qc8w-igemm-minmax-3.cc @@ -1330,6 +1330,25 @@ INSTANTIATE_TEST_SUITE_P( #if XNN_ARCH_WASMRELAXEDSIMD + INSTANTIATE_TEST_SUITE_P( + QD8_F32_QC8W_IGEMM_MINMAX_4X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/true, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__wasmusdot_u2, + xnn_init_f32_minmax_scalar_params, + xnn_pack_qs8_conv_goki_w); + }, + []() { + 
TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + INSTANTIATE_TEST_SUITE_P( QD8_F32_QC8W_IGEMM_MINMAX_1X8C8__WASMUSDOT_U2, GemmTest, testing::ValuesIn(CreateTests1( diff --git a/test/qd8-f32-qc8w-igemm-minmax.yaml b/test/qd8-f32-qc8w-igemm-minmax.yaml index c609e2032a6..aeef211db39 100644 --- a/test/qd8-f32-qc8w-igemm-minmax.yaml +++ b/test/qd8-f32-qc8w-igemm-minmax.yaml @@ -652,6 +652,23 @@ k-block: 2 # WAsm Relaxed SIMD +- name: xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__wasmusdot_u2 + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__wasmusdot + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x16c4__wasmusdot_u2 + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 + - name: xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__wasmusdot init: xnn_init_f32_minmax_scalar_params pack: xnn_pack_qs8_conv_goki_w diff --git a/test/qs8-qc8w-gemm-minmax-fp32-2.cc b/test/qs8-qc8w-gemm-minmax-fp32-2.cc index b51431b8095..45f04f0bb8b 100644 --- a/test/qs8-qc8w-gemm-minmax-fp32-2.cc +++ b/test/qs8-qc8w-gemm-minmax-fp32-2.cc @@ -2388,6 +2388,206 @@ std::vector CreateTests1( [](const testing::TestParamInfo& info) { return info.param.test_name; }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_1X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + 
xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_3X16C4__WASMUSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_3X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_3X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + 
QS8_QC8W_GEMM_MINMAX_FP32_4X16C4__WASMUSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_4X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_1X16C4__WASMSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_3X16C4__WASMSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + 
tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_3X16C4__WASMSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_4X16C4__WASMSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/qs8-qc8w-gemm-minmax-fp32-3.cc b/test/qs8-qc8w-gemm-minmax-fp32-3.cc index 60bb41ef83d..15afb111969 100644 --- a/test/qs8-qc8w-gemm-minmax-fp32-3.cc +++ b/test/qs8-qc8w-gemm-minmax-fp32-3.cc @@ -2631,6 +2631,106 @@ std::vector CreateTests1( [](const testing::TestParamInfo& info) { return info.param.test_name; }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_1X16C4__WASMUSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + 
/*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_4X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_1X16C4__WASMSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_3X16C4__WASMSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2, + 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_4X16C4__WASMSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/qs8-qc8w-gemm-minmax-fp32.cc b/test/qs8-qc8w-gemm-minmax-fp32.cc index a434b087cca..e7e87e01703 100644 --- a/test/qs8-qc8w-gemm-minmax-fp32.cc +++ b/test/qs8-qc8w-gemm-minmax-fp32.cc @@ -3257,6 +3257,66 @@ std::vector CreateTests1( [](const testing::TestParamInfo& info) { return info.param.test_name; }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_1X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_1X16C4__WASMSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& 
tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_GEMM_MINMAX_FP32_4X16C4__WASMSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_gemm_goi_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/qs8-qc8w-gemm-minmax-fp32.yaml b/test/qs8-qc8w-gemm-minmax-fp32.yaml index 81d608fa832..d1acc704c44 100644 --- a/test/qs8-qc8w-gemm-minmax-fp32.yaml +++ b/test/qs8-qc8w-gemm-minmax-fp32.yaml @@ -1510,6 +1510,79 @@ pack: xnn_pack_qs8_gemm_goi_w k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot 
+ init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2 + init: 
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 +- name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_gemm_goi_w + k-block: 8 + # WAsm - name: xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params diff --git a/test/qs8-qc8w-igemm-minmax-fp32-2.cc b/test/qs8-qc8w-igemm-minmax-fp32-2.cc index ede7490b69c..c959044f127 100644 --- a/test/qs8-qc8w-igemm-minmax-fp32-2.cc +++ b/test/qs8-qc8w-igemm-minmax-fp32-2.cc @@ -2704,6 +2704,146 @@ std::vector CreateTests1( [](const testing::TestParamInfo& info) { return info.param.test_name; }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_1X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_1X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + 
[](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_4X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_4X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_1X16C4__WASMSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_3X16C4__WASMSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, 
/*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_4X16C4__WASMSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/qs8-qc8w-igemm-minmax-fp32-3.cc b/test/qs8-qc8w-igemm-minmax-fp32-3.cc index b9e35469cfc..164429183c6 100644 --- a/test/qs8-qc8w-igemm-minmax-fp32-3.cc +++ b/test/qs8-qc8w-igemm-minmax-fp32-3.cc @@ -2586,6 +2586,126 @@ std::vector CreateTests1( [](const testing::TestParamInfo& info) { return info.param.test_name; }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_1X16C4__WASMUSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + 
QS8_QC8W_IGEMM_MINMAX_FP32_3X16C4__WASMUSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_4X16C4__WASMUSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_1X16C4__WASMSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_1X16C4__WASMSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/1, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + 
tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_3X16C4__WASMSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/qs8-qc8w-igemm-minmax-fp32.cc b/test/qs8-qc8w-igemm-minmax-fp32.cc index cc6ce110cd8..3a217467f01 100644 --- a/test/qs8-qc8w-igemm-minmax-fp32.cc +++ b/test/qs8-qc8w-igemm-minmax-fp32.cc @@ -2783,6 +2783,106 @@ std::vector CreateTests1( [](const testing::TestParamInfo& info) { return info.param.test_name; }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_3X16C4__WASMUSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_3X16C4__WASMUSDOT_U2_ACC2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + 
/*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_to_qu8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_USDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_3X16C4__WASMSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/3, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_4X16C4__WASMSDOT, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + xnn_pack_qs8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + QS8_QC8W_IGEMM_MINMAX_FP32_4X16C4__WASMSDOT_U2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/8, + /*adj_k_block=*/8, + /*mr=*/4, /*nr=*/16, /*kr=*/4, /*sr=*/1, + /*is_igemm=*/false, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2, + xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params, + 
xnn_pack_qs8_conv_goki_w, + xnn_qs8_requantize_fp32); + }, + []() { + TEST_REQUIRES_WASM_SDOT; + })), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); #endif // XNN_ARCH_WASMRELAXEDSIMD diff --git a/test/qs8-qc8w-igemm-minmax-fp32.yaml b/test/qs8-qc8w-igemm-minmax-fp32.yaml index 8df0e04b939..c384e0f6007 100644 --- a/test/qs8-qc8w-igemm-minmax-fp32.yaml +++ b/test/qs8-qc8w-igemm-minmax-fp32.yaml @@ -1472,6 +1472,80 @@ pack: xnn_pack_qs8_conv_goki_w k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmusdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmusdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_conv_goki_w + k-block: 8 +- name: 
xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmusdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_to_qu8_conv_goki_w + k-block: 8 + +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c4__wasmsdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_3x16c4__wasmsdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 +- name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x16c4__wasmsdot_u2_acc2 + init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params + pack: xnn_pack_qs8_conv_goki_w + k-block: 8 + # WAsm - name: xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic init: xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params