Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Copybara import of the project: #6811

Merged
merged 2 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions bench/qd8-f32-qc8w-gemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2626,6 +2626,39 @@

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmsdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/1, /*nr=*/4, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmUSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/1, /*nr=*/8, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmsdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/1, /*nr=*/8, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmUSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_1x8c16__wasmusdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot,
Expand All @@ -2637,6 +2670,39 @@

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmsdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/2, /*nr=*/4, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmUSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_2x4c16__wasmusdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/2, /*nr=*/8, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmsdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/2, /*nr=*/8, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmUSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_2x8c16__wasmusdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot,
Expand All @@ -2648,6 +2714,39 @@

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmsdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/3, /*nr=*/4, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmUSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_3x4c16__wasmusdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/3, /*nr=*/8, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmsdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/3, /*nr=*/8, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmUSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_3x8c16__wasmusdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot,
Expand All @@ -2658,6 +2757,39 @@
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmsdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/4, /*nr=*/4, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmUSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x4c16__wasmusdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/4, /*nr=*/8, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmsdot)

static void qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot(benchmark::State& state, const char* net) {
GEMMBenchmark(state,
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot,
xnn_init_f32_minmax_wasmsimd_params,
xnn_pack_qs8_gemm_goi_w,
/*mr=*/4, /*nr=*/8, /*kr=*/16, /*sr=*/1,
benchmark::utils::CheckWAsmUSDOT);
}

BENCHMARK_GEMM(qd8_f32_qc8w_gemm_minmax_ukernel_4x8c16__wasmusdot)
#endif // XNN_ARCH_WASMRELAXEDSIMD


Expand Down
12 changes: 12 additions & 0 deletions cmake/gen/wasmrelaxedsimd_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -431,9 +431,21 @@ SET(ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS
src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-u20.c
src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-u24.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x4c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x4c16-minmax-wasmsdot.c
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-3x4c16-minmax-wasmsdot.c
Expand Down
12 changes: 12 additions & 0 deletions gen/wasmrelaxedsimd_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -427,9 +427,21 @@ ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS = [
"src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-u20.c",
"src/f32-vsigmoid/gen/f32-vsigmoid-wasmrelaxedsimd-rr2-p5-div-u24.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c",
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-1x4c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-2x4c16-minmax-wasmsdot.c",
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-3x4c16-minmax-wasmsdot.c",
Expand Down
15 changes: 15 additions & 0 deletions scripts/generate-qs8-gemm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,11 @@ tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 NR=4 -D REQUANTIZATION=
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmsdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmsdot.c &

tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=1 NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmsdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmsdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmsdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=1 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmsdot.c &

tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=1 NR=4 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x4c16-minmax-fp32-wasmusdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 NR=4 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-2x4c16-minmax-fp32-wasmusdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 NR=4 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x4c16-minmax-fp32-wasmusdot.c &
Expand All @@ -281,6 +286,16 @@ tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 NR=8 -D REQUANTIZATION=FP3
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 NR=8 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-3x8c16-minmax-fp32-wasmusdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 NR=8 -D REQUANTIZATION=FP32 -D DATATYPE=QC8 -D SDOT=0 -o src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x8c16-minmax-fp32-wasmusdot.c &

tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=1 NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x4c16-minmax-wasmusdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x4c16-minmax-wasmusdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 NR=4 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x4c16-minmax-wasmusdot.c &

tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=1 NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c16-minmax-wasmusdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=2 NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8c16-minmax-wasmusdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=3 NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8c16-minmax-wasmusdot.c &
tools/xngen src/qs8-gemm/MRx4c16-wasmdot.c.in -D MR=4 NR=8 -D REQUANTIZATION= -D DATATYPE=QD8 -D SDOT=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8c16-minmax-wasmusdot.c &

################################### ARM NEON ##################################
tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION= -D DATATYPE=QD8 -D ARMV8=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8-minmax-neon-mlal-lane.c &
tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=2 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION= -D DATATYPE=QD8 -D ARMV8=0 -o src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8-minmax-neon-mlal-lane.c &
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,12 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmsdot(
vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0);

const v128_t vfilter_output_scale0123 = wasm_v128_load(w);
vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123);
w = (const float*) w + 4;
vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123);

const v128_t vbias0123 = wasm_v128_load(w);
vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123);
w = (const float*) w + 4;
vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123);

const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin);
Expand Down
123 changes: 123 additions & 0 deletions src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x4c16-minmax-wasmusdot.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// Auto-generated file. Do not edit!
// Template: src/qs8-gemm/MRx4c16-wasmdot.c.in
// Generator: tools/xngen
//
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include "xnnpack/gemm.h"
#include "xnnpack/math.h"

void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c16__wasmusdot(
size_t mr,
size_t nc,
size_t kc,
const int8_t* restrict a,
size_t a_stride,
const void* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)],
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(mr != 0);
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);

kc = round_up_po2(kc, 16 * sizeof(int8_t));
const int8_t* a0 = a;
float* c0 = c;

const v128_t vsign_mask = wasm_u8x16_const_splat(UINT8_C(0x80));
do {
v128_t vksum0 = wasm_v128_load32_zero(w);
v128_t vksum1 = wasm_v128_load32_zero((const int32_t*) w + 1);
v128_t vksum2 = wasm_v128_load32_zero((const int32_t*) w + 2);
v128_t vksum3 = wasm_v128_load32_zero((const int32_t*) w + 3);
const v128_t vinput_zero_point0 = wasm_i32x4_splat((int32_t) quantization_params[0].zero_point + 128);
v128_t vacc0x0 = wasm_i32x4_mul(vksum0, vinput_zero_point0);
v128_t vacc0x1 = wasm_i32x4_mul(vksum1, vinput_zero_point0);
v128_t vacc0x2 = wasm_i32x4_mul(vksum2, vinput_zero_point0);
v128_t vacc0x3 = wasm_i32x4_mul(vksum3, vinput_zero_point0);
w = (const int32_t*) w + 4;

size_t k = kc;
do {
const v128_t va0 = wasm_v128_xor(wasm_v128_load(a0), vsign_mask);
a0 += 16;

const v128_t vb0 = wasm_v128_load(w);

vacc0x0 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb0, va0, vacc0x0);
const v128_t vb1 = wasm_v128_load((const int8_t*) w + 16);

vacc0x1 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb1, va0, vacc0x1);
const v128_t vb2 = wasm_v128_load((const int8_t*) w + 32);

vacc0x2 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb2, va0, vacc0x2);
const v128_t vb3 = wasm_v128_load((const int8_t*) w + 48);

vacc0x3 = wasm_i32x4_relaxed_dot_i8x16_i7x16_add(vb3, va0, vacc0x3);

w = (const int8_t*) w + 64;
k -= 16 * sizeof(int8_t);
} while (k != 0);

const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7));
const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7));

v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7));

vacc0x0123 = wasm_f32x4_convert_i32x4(vacc0x0123);

const v128_t vinput_scale0 = wasm_v128_load32_splat(&quantization_params[0].inv_scale);

vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vinput_scale0);

const v128_t vfilter_output_scale0123 = wasm_v128_load(w);
w = (const float*) w + 4;
vacc0x0123 = wasm_f32x4_mul(vacc0x0123, vfilter_output_scale0123);

const v128_t vbias0123 = wasm_v128_load(w);
w = (const float*) w + 4;
vacc0x0123 = wasm_f32x4_add(vacc0x0123, vbias0123);

const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
vacc0x0123 = wasm_f32x4_pmax(vacc0x0123, vmin);

const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
vacc0x0123 = wasm_f32x4_pmin(vacc0x0123, vmax);

if XNN_LIKELY(nc >= 4) {
wasm_v128_store(c0, vacc0x0123);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);

c0 = (float*) ((uintptr_t) c0 + cn_stride);

nc -= 4;
} else {
if (nc & 2) {
wasm_v128_store64_lane(c0, vacc0x0123, 0);
vacc0x0123 = wasm_v64x2_shuffle(vacc0x0123, vacc0x0123, 1, 1);
c0 += 2;
}
if (nc & 1) {
wasm_v128_store32_lane(c0, vacc0x0123, 0);
}
nc = 0;
}
} while (nc != 0);
}
Loading
Loading